Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-01 22:14:53 +08:00)

Compare commits: 105 commits (predispatc ... test-myst-)
| SHA1 | Author | Date | |
|---|---|---|---|
| b1abd9ec11 | |||
| 00da8e63eb | |||
| 576253c476 | |||
| f5314f89c8 | |||
| 671e22a951 | |||
| d3d9bc1c31 | |||
| 7d296d5c19 | |||
| 2a60b8fc97 | |||
| d898d0d437 | |||
| 5998cd4eaa | |||
| 57024913c4 | |||
| ee72338f0c | |||
| c665594c1e | |||
| 255a04baf1 | |||
| 1d302eaee8 | |||
| a6b7bea244 | |||
| be72bcf828 | |||
| f80f97d192 | |||
| 42a69f7c2b | |||
| b87471e66f | |||
| f10e4430e2 | |||
| 2dccff7dcf | |||
| dec0d3101c | |||
| 9df0f56597 | |||
| 91602a9254 | |||
| cc372ad557 | |||
| 84058d1179 | |||
| 096dc35d77 | |||
| 56d07d0bde | |||
| 39b54b78d7 | |||
| 3703dabe42 | |||
| d3f9107d68 | |||
| cab96b5879 | |||
| 6100ed457c | |||
| badfebf29e | |||
| fc5a404eb1 | |||
| 04a393507b | |||
| a626dc8f16 | |||
| fd47401536 | |||
| e44e05f7ae | |||
| ddd74d10fc | |||
| 823e223893 | |||
| 6499420e45 | |||
| e17538022a | |||
| 37ded2ac90 | |||
| 767791943d | |||
| c917c63282 | |||
| 659bfbf443 | |||
| 832ab990c9 | |||
| 56df025d51 | |||
| 55ff4f85e9 | |||
| 7d2ceaff21 | |||
| 2a249f1967 | |||
| 52c294008e | |||
| 0971637c11 | |||
| 7d6f340238 | |||
| 4060f30042 | |||
| 9a28e23d97 | |||
| d0c00d9a69 | |||
| 371ffaf415 | |||
| 1b772de397 | |||
| 8e99714204 | |||
| 9b4d938f04 | |||
| 0142d5f4e2 | |||
| 91b69deeb0 | |||
| 392fa75411 | |||
| 3a67bf9c62 | |||
| d984143a74 | |||
| 21c97bd565 | |||
| a155f742ad | |||
| 3639d29ea1 | |||
| aee8a2e985 | |||
| eac777c4f4 | |||
| 1a6b21c59f | |||
| abe0c9538a | |||
| 95b658427d | |||
| 6341311333 | |||
| 350d6af52c | |||
| 9281625a9b | |||
| 2c37acfd89 | |||
| 08540b13c6 | |||
| 187c2deb40 | |||
| 67be2f27e1 | |||
| d293022c47 | |||
| ee5a434f8c | |||
| 4c18e85300 | |||
| 920f26c761 | |||
| 99cc3633f6 | |||
| 15a50dcf1c | |||
| 1227ed6674 | |||
| 2bb684304d | |||
| f09a484b81 | |||
| feaa02f9ad | |||
| b3c868d603 | |||
| cab28330f8 | |||
| 4366610f5a | |||
| dd0adc9386 | |||
| 734826d88e | |||
| 5a56e6a72b | |||
| e8af168ee0 | |||
| 97d7dc197f | |||
| 9498d95b9c | |||
| 0e46f54286 | |||
| 216ba6e5f2 | |||
| c774180e59 |
@ -12,7 +12,7 @@ fi
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
source $SCRIPTPATH/../manywheel/set_desired_python.sh

pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1.4 patchelf==0.17.2
pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2

for tool in python python3 pip pip3 ninja scons patchelf; do
ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
@ -36,3 +36,105 @@ See `build.sh` for valid build environments (it's the giant switch).
# Set flags (see build.sh) and build image
sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest'
```

## [Guidance] Adding a New Base Docker Image

### Background

The base Docker images in the `.ci/docker/` directory are built by the `docker-builds.yml` workflow and are used throughout the PyTorch CI/CD pipeline. You should only create or modify a base Docker image if you need specific environment changes or dependencies to be in place before PyTorch is built on CI.

1. **Automatic Rebuilding**:
   - The Docker image building process is triggered automatically when changes are made to files in the `.ci/docker/*` directory
   - This ensures all images stay up to date with the latest dependencies and configurations

2. **Image Reuse in PyTorch Build Workflows** (example: linux-build):
   - The images generated by `docker-builds.yml` are reused in `_linux-build.yml` through the `calculate-docker-image` step
   - The `_linux-build.yml` workflow:
     - Pulls the Docker image determined by the `calculate-docker-image` step
     - Runs a Docker container with that image
     - Executes `.ci/pytorch/build.sh` inside the container to build PyTorch (a rough local equivalent is sketched after this list)

3. **Usage in Test Workflows** (example: linux-test):
   - The same Docker images are also used in `_linux-test.yml` for running tests
   - The `_linux-test.yml` workflow follows a similar pattern:
     - It uses the `calculate-docker-image` step to determine which Docker image to use
     - It pulls the Docker image and runs a container with that image
     - It installs the wheels from the artifacts generated by the PyTorch build jobs
     - It executes test scripts (such as `.ci/pytorch/test.sh` or `.ci/pytorch/multigpu-test.sh`) inside the container
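For orientation, the build half of this flow can be roughly reproduced on a local machine. The sketch below is illustrative only: the registry, image tag, mount path, and `BUILD_ENVIRONMENT` value are placeholder assumptions, not the exact values the workflows compute.

```bash
# Rough local approximation of the _linux-build.yml flow (image name and paths are placeholders).
docker pull ghcr.io/example/pytorch-ci:pytorch-linux-jammy-py3.9-gcc11   # hypothetical registry/tag

# Run the CI image with a PyTorch checkout mounted, then build inside the container.
# The BUILD_ENVIRONMENT value is illustrative; use the one matching your image.
docker run -it --rm \
  -v "$(pwd):/workspace" -w /workspace \
  ghcr.io/example/pytorch-ci:pytorch-linux-jammy-py3.9-gcc11 \
  bash -c 'BUILD_ENVIRONMENT=linux-jammy-py3.9-gcc11 .ci/pytorch/build.sh'
```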
### Understanding File Purposes

#### `.ci/docker/build.sh` vs `.ci/pytorch/build.sh`
- **`.ci/docker/build.sh`**:
  - Used for building base Docker images
  - Executed by the `docker-builds.yml` workflow to pre-build Docker images for CI
  - Contains configurations for different Docker build environments

- **`.ci/pytorch/build.sh`**:
  - Used for building PyTorch inside a Docker container
  - Called by workflows like `_linux-build.yml` after the Docker container is started
  - Builds PyTorch wheels and other artifacts

#### `.ci/docker/ci_commit_pins/` vs `.github/ci_commit_pins`
- **`.ci/docker/ci_commit_pins/`**:
  - Used for pinning dependency versions during base Docker image building
  - Ensures consistent environments for building PyTorch
  - Changes here trigger base Docker image rebuilds

- **`.github/ci_commit_pins`**:
  - Used for pinning dependency versions during PyTorch builds and tests
  - Ensures consistent dependencies for PyTorch across different builds
  - Used by build scripts running inside Docker containers (see the sketch after this list)
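To make the second directory concrete, a build or test script typically reads a pin and installs that dependency at exactly the pinned commit. The snippet below is a sketch, not an excerpt from the CI scripts; only the pin file path and the pytorch/audio repository name are taken from this repo.

```bash
# Sketch: install a dependency at the commit pinned for this CI run.
pinned_commit=$(cat .github/ci_commit_pins/audio.txt)
pip install "git+https://github.com/pytorch/audio.git@${pinned_commit}"
```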
### Step-by-Step Guide for Adding a New Base Docker Image

#### 1. Add Pinned Commits (If Applicable)

We use pinned commits for build stability. The `nightly.yml` workflow checks and updates the pinned commits for certain repository dependencies daily.

If your new Docker image needs a library installed from a specific pinned commit or built from source:

1. Add the repository you want to track in `nightly.yml` and `.github/merge_rules.yaml`
2. Add the initial pinned commit under `.ci/docker/ci_commit_pins/`. The name of the text file should match the repository name defined in step 1 (an illustrative example follows)
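For example, seeding an initial pin for a hypothetical dependency could look like this; the repository URL, branch, and file name are placeholders and should match whatever you registered in `nightly.yml`.

```bash
# Hypothetical example: record the current tip of "mylib" as its initial pin.
git ls-remote https://github.com/example-org/mylib.git refs/heads/main \
  | cut -f1 > .ci/docker/ci_commit_pins/mylib.txt
cat .ci/docker/ci_commit_pins/mylib.txt   # should contain a single 40-character commit SHA
```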
#### 2. Configure the Base Docker Image
1. **Add new Base Docker image configuration** (if applicable):

   Add the configuration in `.ci/docker/build.sh`. For example:
   ```bash
   pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1)
     CUDA_VERSION=12.8.1
     CUDNN_VERSION=9
     ANACONDA_PYTHON_VERSION=3.12
     GCC_VERSION=11
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     NEW_ARG_1=yes
     ;;
   ```
2. **Add build arguments to Docker build command**:

   If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:
   ```bash
   docker build \
     ...
     --build-arg "NEW_ARG_1=${NEW_ARG_1}"
   ```
3. **Update Dockerfile logic**:

   Update the Dockerfile to use the new argument. For example, in `ubuntu/Dockerfile`:
   ```dockerfile
   ARG NEW_ARG_1
   # Set up environment for NEW_ARG_1
   RUN if [ -n "${NEW_ARG_1}" ]; then bash ./do_something.sh; fi
   ```
4. **Add the Docker configuration** in `.github/workflows/docker-builds.yml`:

   The `docker-builds.yml` workflow pre-builds the Docker images whenever changes occur in the `.ci/docker/` directory. This includes the pinned commit updates.
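Before relying on CI, you can sanity-check a new configuration by building the image locally with the same entry point the workflow uses. The configuration name below follows the hypothetical `-new1` example above; substitute your own, and note that extra flags (or `sudo`, as in the earlier example) may be needed on your machine.

```bash
# Sketch: build the new image locally to validate the .ci/docker/build.sh configuration.
cd .ci/docker
./build.sh pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1 -t pytorch-ci:new1
```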
@ -160,6 +160,17 @@ case "$tag" in
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.6
|
||||
CUDNN_VERSION=9
|
||||
@ -276,7 +287,7 @@ case "$tag" in
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
|
||||
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
|
||||
@ -87,7 +87,7 @@ EOF
|
||||
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.4) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then
|
||||
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
|
||||
HIP_BRANCH=release/rocm-rel-6.4
|
||||
CLR_HASH=ca18eb3f77fa09292fcda62bc60c3e565d752ada # branch release/rocm-rel-6.4.1-statco-hotfix
|
||||
CLR_HASH=606bc820b4b1f315d135da02a1f0b176ca50a92c # branch release/rocm-rel-6.4.1-statco-hotfix
|
||||
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
|
||||
HIP_BRANCH=release/rocm-rel-6.4
|
||||
CLR_HASH=600f5b0d2baed94d5121e2174a9de0851b040b0c # branch release/rocm-rel-6.4-statco-hotfix
|
||||
|
||||
@ -128,7 +128,7 @@ ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
||||
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
||||
# Install setuptools and wheel for python 3.12/3.13
|
||||
RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
|
||||
/opt/python/${cpython_version}/bin/python -m pip install "setuptools>=77.0.0" "packaging>=24.2" wheel; \
|
||||
/opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \
|
||||
done;
|
||||
|
||||
|
||||
|
||||
@ -124,9 +124,10 @@ RUN python3 -mpip install cmake==3.28.0
|
||||
# install newest flatbuffers version first:
|
||||
# for some reason old version is getting pulled in otherwise.
|
||||
# packaging package is required for onnxruntime wheel build.
|
||||
RUN pip3 install 'setuptools>=77.0' 'packaging>=24.2' && \
|
||||
pip3 install flatbuffers cython 'pkgconfig>=1.5.5' 'numpy<2.3.0' && \
|
||||
RUN pip3 install flatbuffers && \
|
||||
pip3 install cython 'pkgconfig>=1.5.5' 'setuptools>=77' 'numpy<2.3.0' && \
|
||||
pip3 install --no-build-isolation h5py==3.11.0 && \
|
||||
pip3 install packaging && \
|
||||
git clone https://github.com/microsoft/onnxruntime && \
|
||||
cd onnxruntime && git checkout v1.21.0 && \
|
||||
git submodule update --init --recursive && \
|
||||
|
||||
@ -50,7 +50,7 @@ flatbuffers==24.12.23
|
||||
hypothesis==5.35.1
|
||||
# Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
|
||||
#Description: advanced library for generating parametrized tests
|
||||
#Pinned versions: 5.35.1
|
||||
#Pinned versions: 3.44.6, 4.53.2
|
||||
#test that import: test_xnnpack_integration.py, test_pruning_op.py, test_nn.py
|
||||
|
||||
junitparser==2.1.1
|
||||
@ -104,10 +104,10 @@ networkx==2.8.8
|
||||
#Pinned versions: 2.8.8
|
||||
#test that import: functorch
|
||||
|
||||
ninja==1.11.1.4
|
||||
ninja==1.11.1.3
|
||||
#Description: build system. Used in some tests. Used in build to generate build
|
||||
#time tracing information
|
||||
#Pinned versions: 1.11.1.4
|
||||
#Pinned versions: 1.11.1.3
|
||||
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
|
||||
|
||||
numba==0.49.0 ; python_version < "3.9"
|
||||
@ -307,7 +307,7 @@ pytest-cpp==2.3.0
|
||||
#Pinned versions: 2.3.0
|
||||
#test that import:
|
||||
|
||||
z3-solver==4.15.1.0
|
||||
z3-solver==4.12.6.0
|
||||
#Description: The Z3 Theorem Prover Project
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
@ -363,10 +363,9 @@ pwlf==2.2.1
|
||||
|
||||
|
||||
# To build PyTorch itself
|
||||
packaging>=24.2
|
||||
pyyaml
|
||||
pyzstd
|
||||
setuptools>=77.0.0
|
||||
setuptools>=70.1.0
|
||||
six
|
||||
|
||||
scons==4.5.2 ; platform_machine == "aarch64"
|
||||
|
||||
@ -4,8 +4,8 @@ sphinx==5.3.0
|
||||
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
|
||||
|
||||
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
|
||||
# but it doesn't seem to work and hangs around idly. The initial thought it is probably
|
||||
# something related to Docker setup. We can investigate this later
|
||||
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
|
||||
# something related to Docker setup. We can investigate this later.
|
||||
|
||||
sphinxcontrib.katex==0.8.6
|
||||
#Description: This is used to generate PyTorch docs
|
||||
|
||||
@ -269,9 +269,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
|
||||
tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" //...
|
||||
fi
|
||||
else
|
||||
# install build-system requirements before running setup.py commands
|
||||
python -m pip install -r requirements-build.txt
|
||||
|
||||
# check that setup.py would fail with bad arguments
|
||||
echo "The next three invocations are expected to fail with invalid command error messages."
|
||||
( ! get_exit_code python setup.py bad_argument )
|
||||
|
||||
@ -204,8 +204,32 @@ function install_torchrec_and_fbgemm() {
|
||||
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
|
||||
pip_uninstall fbgemm-gpu-nightly
|
||||
|
||||
# If ROCM_HOME isn't set, use ROCM_PATH if set, otherwise fall back to /opt/rocm
|
||||
ROCM_HOME="${ROCM_HOME:-${ROCM_PATH:-/opt/rocm}}"
|
||||
|
||||
# Find the rocm_version.h header file to extract the ROCm version from
|
||||
rocm_version_h="${ROCM_HOME}/include/rocm-core/rocm_version.h"
|
||||
if [ ! -f "$rocm_version_h" ]; then
|
||||
rocm_version_h="${ROCM_HOME}/include/rocm_version.h"
|
||||
fi
|
||||
|
||||
# Error out if rocm_version.h not found
|
||||
if [ ! -f "$rocm_version_h" ]; then
|
||||
echo "Error: rocm_version.h not found in expected locations." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Extract major, minor and patch ROCm version numbers
|
||||
MAJOR_VERSION=$(grep 'ROCM_VERSION_MAJOR' "$rocm_version_h" | awk '{print $3}')
|
||||
MINOR_VERSION=$(grep 'ROCM_VERSION_MINOR' "$rocm_version_h" | awk '{print $3}')
|
||||
PATCH_VERSION=$(grep 'ROCM_VERSION_PATCH' "$rocm_version_h" | awk '{print $3}')
|
||||
ROCM_INT=$((MAJOR_VERSION * 10000 + MINOR_VERSION * 100 + PATCH_VERSION))
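# e.g. ROCm 6.4.1 yields ROCM_INT=60401, which allows simple numeric version comparisons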
|
||||
echo "ROCm version: $ROCM_INT"
|
||||
export BUILD_ROCM_VERSION="$MAJOR_VERSION.$MINOR_VERSION"
|
||||
|
||||
pip_install tabulate # needed for newer fbgemm
|
||||
pip_install patchelf # needed for rocm fbgemm
|
||||
pushd /tmp
|
||||
|
||||
local wheel_dir=dist/fbgemm_gpu
|
||||
local found_whl=0
|
||||
@ -223,7 +247,7 @@ function install_torchrec_and_fbgemm() {
|
||||
pushd fbgemm/fbgemm_gpu
|
||||
git checkout "${fbgemm_commit}"
|
||||
python setup.py bdist_wheel \
|
||||
--package_variant=rocm \
|
||||
--build-variant=rocm \
|
||||
-DHIP_ROOT_DIR="${ROCM_PATH}" \
|
||||
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
|
||||
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
|
||||
@ -240,6 +264,7 @@ function install_torchrec_and_fbgemm() {
|
||||
done
|
||||
|
||||
rm -rf fbgemm
|
||||
popd
|
||||
else
|
||||
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
|
||||
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
|
||||
|
||||
@ -201,7 +201,7 @@ fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
|
||||
# JIT C++ extensions require ninja.
|
||||
pip_install "ninja==1.11.1.4"
|
||||
pip_install "ninja==1.10.2"
|
||||
# ninja is installed in $HOME/.local/bin, e.g., /var/lib/jenkins/.local/bin for CI user jenkins
|
||||
# but this script should be runnable by any user, including root
|
||||
export PATH="$HOME/.local/bin:$PATH"
|
||||
@ -345,6 +345,12 @@ test_h100_symm_mem() {
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
test_h100_cutlass_backend() {
|
||||
# cutlass backend tests for H100
|
||||
TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_backend -k "not addmm" $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_evt $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
}
|
||||
|
||||
test_lazy_tensor_meta_reference_disabled() {
|
||||
export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
|
||||
echo "Testing lazy tensor operations without meta reference"
|
||||
@ -1769,6 +1775,8 @@ elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
|
||||
test_h100_distributed
|
||||
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
|
||||
test_h100_symm_mem
|
||||
elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
|
||||
test_h100_cutlass_backend
|
||||
else
|
||||
install_torchvision
|
||||
install_monkeytype
|
||||
|
||||
34
.ci/pytorch/win-arm64-build.ps1
Normal file
@ -0,0 +1,34 @@
|
||||
# If you want to rebuild, run this with $env:REBUILD=1
|
||||
# If you want to build with CUDA, run this with $env:USE_CUDA=1
|
||||
# If you want to build without CUDA, run this with $env:USE_CUDA=0
|
||||
|
||||
# Check for setup.py in the current directory
|
||||
if (-not (Test-Path "setup.py")) {
|
||||
Write-Host "ERROR: Please run this build script from PyTorch root directory."
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Get the script's parent directory
|
||||
$ScriptParentDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
|
||||
|
||||
# Set TMP_DIR and convert to Windows path
|
||||
$env:TMP_DIR = Join-Path (Get-Location) "build\win_tmp"
|
||||
$env:TMP_DIR_WIN = $env:TMP_DIR # Already in Windows format, no cygpath needed
|
||||
|
||||
# Set final package directory with default fallback
|
||||
if (-not $env:PYTORCH_FINAL_PACKAGE_DIR) {
|
||||
$env:PYTORCH_FINAL_PACKAGE_DIR = "C:\w\build-results"
|
||||
}
|
||||
|
||||
# Create the final package directory if it doesn't exist
|
||||
if (-not (Test-Path $env:PYTORCH_FINAL_PACKAGE_DIR)) {
|
||||
New-Item -Path $env:PYTORCH_FINAL_PACKAGE_DIR -ItemType Directory -Force | Out-Null
|
||||
}
|
||||
|
||||
# Set script helpers directory
|
||||
$env:SCRIPT_HELPERS_DIR = Join-Path $ScriptParentDir "win-test-helpers\arm64"
|
||||
|
||||
# Run the main build script
|
||||
& "$env:SCRIPT_HELPERS_DIR\build_pytorch.ps1"
|
||||
|
||||
Write-Host "BUILD PASSED"
|
||||
24
.ci/pytorch/win-arm64-test.sh
Normal file
@ -0,0 +1,24 @@
|
||||
#!/bin/bash
|
||||
set -ex -o pipefail
|
||||
|
||||
SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
|
||||
# shellcheck source=./common.sh
|
||||
source "$SCRIPT_PARENT_DIR/common.sh"
|
||||
|
||||
run_tests() {
|
||||
echo Running smoke_test.py...
|
||||
python ./.ci/pytorch/smoke_test/smoke_test.py --package torchonly
|
||||
|
||||
echo Running test_autograd.py, test_nn.py, test_modules.py...
|
||||
cd test
|
||||
|
||||
CORE_TEST_LIST=("test_autograd.py" "test_nn.py" "test_modules.py")
|
||||
|
||||
for t in "${CORE_TEST_LIST[@]}"; do
|
||||
echo "Running test: $t"
|
||||
python "$t" --verbose --save-xml --use-pytest -vvvv -rfEsxXP -p no:xdist
|
||||
done
|
||||
}
|
||||
|
||||
run_tests
|
||||
echo "TEST PASSED"
|
||||
98
.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1
Normal file
@ -0,0 +1,98 @@
|
||||
# TODO: we may be able to reuse the existing build_pytorch.bat for arm64
|
||||
|
||||
if ($env:DEBUG -eq "1") {
|
||||
$env:BUILD_TYPE = "debug"
|
||||
} else {
|
||||
$env:BUILD_TYPE = "release"
|
||||
}
|
||||
|
||||
# This inflates our log size slightly, but it is REALLY useful to be
|
||||
# able to see what our cl.exe commands are. (since you can actually
|
||||
# just copy-paste them into a local Windows setup to just rebuild a
|
||||
# single file.)
|
||||
# log sizes are too long, but leaving this here in case someone wants to use it locally
|
||||
# $env:CMAKE_VERBOSE_MAKEFILE = "1"
|
||||
|
||||
$env:INSTALLER_DIR = Join-Path $env:SCRIPT_HELPERS_DIR "installation-helpers"
|
||||
|
||||
cd ..
|
||||
|
||||
# Environment variables
|
||||
$env:SCCACHE_IDLE_TIMEOUT = "0"
|
||||
$env:SCCACHE_IGNORE_SERVER_IO_ERROR = "1"
|
||||
$env:CMAKE_BUILD_TYPE = $env:BUILD_TYPE
|
||||
$env:CMAKE_C_COMPILER_LAUNCHER = "sccache"
|
||||
$env:CMAKE_CXX_COMPILER_LAUNCHER = "sccache"
|
||||
$env:libuv_ROOT = Join-Path $env:DEPENDENCIES_DIR "libuv\install"
|
||||
$env:MSSdk = "1"
|
||||
|
||||
if ($env:PYTORCH_BUILD_VERSION) {
|
||||
$env:PYTORCH_BUILD_VERSION = $env:PYTORCH_BUILD_VERSION
|
||||
$env:PYTORCH_BUILD_NUMBER = "1"
|
||||
}
|
||||
|
||||
$env:CMAKE_POLICY_VERSION_MINIMUM = "3.5"
|
||||
|
||||
# Set BLAS type
|
||||
if ($env:ENABLE_APL -eq "1") {
|
||||
$env:BLAS = "APL"
|
||||
$env:USE_LAPACK = "1"
|
||||
} elseif ($env:ENABLE_OPENBLAS -eq "1") {
|
||||
$env:BLAS = "OpenBLAS"
|
||||
$env:OpenBLAS_HOME = Join-Path $env:DEPENDENCIES_DIR "OpenBLAS\install"
|
||||
}
|
||||
|
||||
# Change to source directory
|
||||
Set-Location $env:PYTORCH_ROOT
|
||||
|
||||
# Copy libuv.dll
|
||||
Copy-Item -Path (Join-Path $env:libuv_ROOT "lib\Release\uv.dll") -Destination "torch\lib\uv.dll" -Force
|
||||
|
||||
# Create virtual environment
|
||||
python -m venv .venv
|
||||
.\.venv\Scripts\Activate.ps1
|
||||
where.exe python
|
||||
|
||||
# Python install dependencies
|
||||
python -m pip install --upgrade pip
|
||||
pip install setuptools pyyaml
|
||||
pip install -r requirements.txt
|
||||
|
||||
# Set after installing psutil
|
||||
$env:DISTUTILS_USE_SDK = "1"
|
||||
|
||||
# Print all environment variables
|
||||
Get-ChildItem Env:
|
||||
|
||||
# Start and inspect sccache
|
||||
sccache --start-server
|
||||
sccache --zero-stats
|
||||
sccache --show-stats
|
||||
|
||||
# Build the wheel
|
||||
python setup.py bdist_wheel
|
||||
if ($LASTEXITCODE -ne 0) { exit 1 }
|
||||
|
||||
# Install the wheel locally
|
||||
$whl = Get-ChildItem -Path "dist\*.whl" | Select-Object -First 1
|
||||
if ($whl) {
|
||||
python -mpip install --no-index --no-deps $whl.FullName
|
||||
}
|
||||
|
||||
# Copy final wheel
|
||||
robocopy "dist" "$env:PYTORCH_FINAL_PACKAGE_DIR" *.whl
|
||||
|
||||
# Export test times
|
||||
python tools/stats/export_test_times.py
|
||||
|
||||
# Copy additional CI files
|
||||
robocopy ".additional_ci_files" "$env:PYTORCH_FINAL_PACKAGE_DIR\.additional_ci_files" /E
|
||||
|
||||
# Save ninja log
|
||||
Copy-Item -Path "build\.ninja_log" -Destination $env:PYTORCH_FINAL_PACKAGE_DIR -Force
|
||||
|
||||
# Final sccache stats and stop
|
||||
sccache --show-stats
|
||||
sccache --stop-server
|
||||
|
||||
exit 0
|
||||
@ -126,11 +126,6 @@ if "%USE_CUDA%"=="1" (
|
||||
set CMAKE_CUDA_COMPILER_LAUNCHER=%TMP_DIR%/bin/randomtemp.exe;%TMP_DIR%\bin\sccache.exe
|
||||
)
|
||||
|
||||
:: Install build-system requirements before running setup.py commands
|
||||
python -m pip install -r requirements-build.txt
|
||||
if errorlevel 1 goto fail
|
||||
if not errorlevel 0 goto fail
|
||||
|
||||
:: Print all existing environment variable for debugging
|
||||
set
|
||||
|
||||
|
||||
@ -41,7 +41,7 @@ fi
|
||||
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
|
||||
|
||||
# Install Z3 optional dependency for Windows builds.
|
||||
python -m pip install z3-solver==4.15.1.0
|
||||
python -m pip install z3-solver==4.12.2.0
|
||||
|
||||
# Install tlparse for test\dynamo\test_structured_trace.py UTs.
|
||||
python -m pip install tlparse==0.3.30
|
||||
|
||||
@ -18,5 +18,5 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t
|
||||
if errorlevel 1 exit /b 1
|
||||
|
||||
set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%"
|
||||
%PYTHON_EXEC% -m pip install --upgrade pip "setuptools>=77.0.0" "packaging>=24.2" wheel
|
||||
%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel
|
||||
if errorlevel 1 exit /b 1
|
||||
|
||||
@ -7,9 +7,6 @@ call "internal\install_python.bat"
|
||||
|
||||
%PYTHON_EXEC% --version
|
||||
set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%"
|
||||
|
||||
%PYTHON_EXEC% -m pip install "setuptools>=77.0.0" "packaging>=24.2"
|
||||
|
||||
if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake
|
||||
if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake
|
||||
if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
|
||||
@ -19,7 +16,7 @@ if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
|
||||
|
||||
%PYTHON_EXEC% -m pip install pyyaml
|
||||
%PYTHON_EXEC% -m pip install mkl-include mkl-static
|
||||
%PYTHON_EXEC% -m pip install boto3 ninja typing-extensions
|
||||
%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0
|
||||
|
||||
where cmake.exe
|
||||
|
||||
|
||||
@ -127,7 +127,7 @@ export INSTALL_TEST=0 # dont install test binaries into site-packages
|
||||
export MACOSX_DEPLOYMENT_TARGET=10.15
|
||||
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
|
||||
|
||||
SETUPTOOLS_PINNED_VERSION="==77.0.0"
|
||||
SETUPTOOLS_PINNED_VERSION="==70.1.0"
|
||||
PYYAML_PINNED_VERSION="=5.3"
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
@ -135,7 +135,7 @@ RENAME_WHEEL=true
|
||||
case $desired_python in
|
||||
3.13t)
|
||||
echo "Using 3.13 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=6.0.1"
|
||||
NUMPY_PINNED_VERSION="=2.1.0"
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
@ -145,31 +145,31 @@ case $desired_python in
|
||||
;;
|
||||
3.13)
|
||||
echo "Using 3.13 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=6.0.1"
|
||||
NUMPY_PINNED_VERSION="=2.1.0"
|
||||
;;
|
||||
3.12)
|
||||
echo "Using 3.12 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=6.0.1"
|
||||
NUMPY_PINNED_VERSION="=2.0.2"
|
||||
;;
|
||||
3.11)
|
||||
echo "Using 3.11 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=5.3"
|
||||
NUMPY_PINNED_VERSION="=2.0.2"
|
||||
;;
|
||||
3.10)
|
||||
echo "Using 3.10 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=5.3"
|
||||
NUMPY_PINNED_VERSION="=2.0.2"
|
||||
;;
|
||||
3.9)
|
||||
echo "Using 3.9 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=5.3"
|
||||
NUMPY_PINNED_VERSION="=2.0.2"
|
||||
;;
|
||||
|
||||
2
.github/ci_commit_pins/audio.txt
vendored
@ -1 +1 @@
|
||||
00b0c91db92c51a11356249262577b9fa26c18c5
|
||||
b6a3368a45aaafe05f1a6a9f10c68adc5e944d9e
|
||||
|
||||
2
.github/ci_commit_pins/fbgemm_rocm.txt
vendored
@ -1 +1 @@
|
||||
5fb5024118e9bb9decf96c2b0b1a8f0010bf56be
|
||||
7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8
|
||||
|
||||
1
.github/ci_commit_pins/vllm.txt
vendored
Normal file
@ -0,0 +1 @@
|
||||
b77c7d327f2a463bb9ef8be36f30e920bc066502
|
||||
15
.github/merge_rules.yaml
vendored
@ -76,8 +76,8 @@
|
||||
- .github/ci_commit_pins/audio.txt
|
||||
- .github/ci_commit_pins/vision.txt
|
||||
- .github/ci_commit_pins/torchdynamo.txt
|
||||
- .github/ci_commit_pins/vllm.txt
|
||||
- .ci/docker/ci_commit_pins/triton.txt
|
||||
- .ci/docker/ci_commit_pins/vllm.txt
|
||||
approved_by:
|
||||
- pytorchbot
|
||||
mandatory_checks_name:
|
||||
@ -492,6 +492,19 @@
|
||||
- srossross
|
||||
- chillee
|
||||
- zou3519
|
||||
- guilhermeleobas
|
||||
mandatory_checks_name:
|
||||
- EasyCLA
|
||||
- Lint
|
||||
- pull
|
||||
|
||||
- name: Dynamo
|
||||
patterns:
|
||||
- torch/_dynamo/**
|
||||
- torch/csrc/dynamo/**
|
||||
- test/dynamo/**
|
||||
approved_by:
|
||||
- guilhermeleobas
|
||||
mandatory_checks_name:
|
||||
- EasyCLA
|
||||
- Lint
|
||||
|
||||
2
.github/pytorch-probot.yml
vendored
@ -31,7 +31,9 @@ ciflow_push_tags:
|
||||
- ciflow/pull
|
||||
- ciflow/h100
|
||||
- ciflow/h100-distributed
|
||||
- ciflow/win-arm64
|
||||
- ciflow/h100-symm-mem
|
||||
- ciflow/h100-cutlass-backend
|
||||
retryable_workflows:
|
||||
- pull
|
||||
- trunk
|
||||
|
||||
2
.github/requirements-gha-cache.txt
vendored
@ -8,7 +8,7 @@
|
||||
boto3==1.35.42
|
||||
jinja2==3.1.6
|
||||
lintrunner==0.10.7
|
||||
ninja==1.11.1.4
|
||||
ninja==1.10.0.post1
|
||||
nvidia-ml-py==11.525.84
|
||||
pyyaml==6.0
|
||||
requests==2.32.4
|
||||
|
||||
@ -7,12 +7,12 @@ hypothesis==6.56.4
|
||||
librosa>=0.6.2
|
||||
mpmath==1.3.0
|
||||
networkx==2.8.7
|
||||
ninja==1.11.1.4
|
||||
ninja==1.10.2.4
|
||||
numba==0.59.0
|
||||
numpy==1.26.4
|
||||
opt-einsum>=3.3
|
||||
optree==0.13.0
|
||||
packaging==25.0
|
||||
packaging==23.1
|
||||
parameterized==0.8.1
|
||||
pillow==10.3.0
|
||||
protobuf==5.29.4
|
||||
@ -26,11 +26,11 @@ pytest-xdist==3.3.1
|
||||
pytest==7.3.2
|
||||
pyyaml==6.0.2
|
||||
scipy==1.12.0
|
||||
setuptools==80.9.0
|
||||
setuptools==72.1.0
|
||||
sympy==1.13.3
|
||||
tlparse==0.3.30
|
||||
tensorboard==2.13.0
|
||||
typing-extensions==4.12.2
|
||||
unittest-xml-reporting<=3.2.0,>=2.0.0
|
||||
xdoctest==1.1.0
|
||||
z3-solver==4.15.1.0
|
||||
z3-solver==4.12.2.0
|
||||
|
||||
2
.github/scripts/lintrunner.sh
vendored
@ -2,7 +2,7 @@
|
||||
set -ex
|
||||
|
||||
# Use uv to speed up lintrunner init
|
||||
python3 -m pip install -U uv setuptools
|
||||
python3 -m pip install uv==0.1.45 setuptools
|
||||
|
||||
CACHE_DIRECTORY="/tmp/.lintbin"
|
||||
# Try to recover the cached binaries
|
||||
|
||||
2
.github/scripts/windows/build_triton.bat
vendored
@ -10,7 +10,7 @@ if "%PY_VERS%" == "3.13t" (
|
||||
call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS%
|
||||
)
|
||||
:: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480
|
||||
call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==78.1.1 ninja
|
||||
call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja
|
||||
|
||||
dir "%VC_INSTALL_PATH%"
|
||||
|
||||
|
||||
4
.github/workflows/_get-changed-files.yml
vendored
@ -27,7 +27,7 @@ jobs:
|
||||
PR_NUMBER="${{ github.event.number }}"
|
||||
|
||||
# Use gh CLI to get changed files in the PR with explicit repo
|
||||
CHANGED_FILES=$(gh pr view "$PR_NUMBER" --repo "${{ github.repository }}" --json files --jq '.files[].path' | tr '\n' ' ' | sed 's/ $//')
|
||||
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
|
||||
|
||||
if [ -z "$CHANGED_FILES" ]; then
|
||||
echo "No changed files found, setting to '*'"
|
||||
@ -40,4 +40,4 @@ jobs:
|
||||
else
|
||||
echo "Not in PR context, setting changed files to '*'"
|
||||
echo "changed-files=*" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
fi
|
||||
|
||||
5
.github/workflows/_mac-test.yml
vendored
@ -80,11 +80,6 @@ jobs:
|
||||
run: |
|
||||
sysctl machdep.cpu.brand_string kern.osproductversion
|
||||
|
||||
- name: Install build toolchain
|
||||
run: |
|
||||
brew update --quiet
|
||||
brew install --formula cmake ninja
|
||||
|
||||
- name: Clean up leftover processes on MacOS pet runner
|
||||
continue-on-error: true
|
||||
run: |
|
||||
|
||||
1
.github/workflows/docker-builds.yml
vendored
@ -50,6 +50,7 @@ jobs:
|
||||
runner: [linux.12xlarge]
|
||||
docker-image-name: [
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
|
||||
|
||||
58
.github/workflows/h100-cutlass-backend.yml
vendored
Normal file
@ -0,0 +1,58 @@
|
||||
name: Limited CI for CUTLASS backend on H100
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- .github/workflows/h100-cutlass-backend.yml
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: 22 9 * * * # every 24 hours about 2:22am PDT
|
||||
push:
|
||||
tags:
|
||||
- ciflow/h100-cutlass-backend/*
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
|
||||
get-label-type:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '9.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "h100_cutlass_backend", shard: 1, num_shards: 1, runner: "linux.aws.h100", owners: ["oncall:pt2"] },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc11-sm90-test:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs:
|
||||
- linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
|
||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
2
.github/workflows/nightly.yml
vendored
@ -86,7 +86,7 @@ jobs:
|
||||
- repo-name: vllm
|
||||
repo-owner: vllm-project
|
||||
branch: main
|
||||
pin-folder: .ci/docker/ci_commit_pins
|
||||
pin-folder: .github/ci_commit_pins
|
||||
# Allow this to be triggered on either a schedule or on workflow_dispatch to allow for easier testing
|
||||
if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
steps:
|
||||
|
||||
15
.github/workflows/pull.yml
vendored
@ -315,21 +315,6 @@ jobs:
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3-clang18-mobile-build:
|
||||
name: linux-jammy-py3-clang18-mobile-build
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3-clang12-mobile-build
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
|
||||
build-generates-artifacts: false
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 1 },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build:
|
||||
name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
|
||||
187
.github/workflows/win-arm64-build-test.yml
vendored
Normal file
@ -0,0 +1,187 @@
|
||||
name: windows-arm64-build-test
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/win-arm64/*
|
||||
|
||||
env:
|
||||
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
|
||||
PYTHON_VERSION: "3.12"
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
DOWNLOADS_DIR: c:\temp\downloads
|
||||
DEPENDENCIES_DIR: c:\temp\dependencies
|
||||
ENABLE_APL: 1
|
||||
ENABLE_OPENBLAS: 0
|
||||
BUILD_TYPE: release
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build:
|
||||
# Don't run on forked repos.
|
||||
if: github.repository_owner == 'pytorch'
|
||||
runs-on: "windows-11-arm64-preview"
|
||||
timeout-minutes: 240
|
||||
steps:
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_sscache
|
||||
aws-region: us-east-1
|
||||
role-duration-seconds: 18000
|
||||
|
||||
- name: Enable long paths
|
||||
shell: cmd
|
||||
run: |
|
||||
git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
|
||||
git config --system core.longpaths true
|
||||
|
||||
- name: Git checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: pytorch
|
||||
submodules: recursive
|
||||
|
||||
- name: Bootstrap Python
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
|
||||
|
||||
- name: Parse ref
|
||||
id: parse-ref
|
||||
shell: bash
|
||||
run: python pytorch/.github/scripts/parse_ref.py
|
||||
|
||||
- name: Get workflow job id
|
||||
shell: bash
|
||||
id: get-job-id
|
||||
run: |
|
||||
set -eux
|
||||
python pytorch/.github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Bootstrap APL
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
|
||||
|
||||
- name: Bootstrap Rust
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
|
||||
|
||||
- name: Bootstrap sccache
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
|
||||
|
||||
- name: Bootstrap Libuv
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
|
||||
|
||||
- name: Build
|
||||
id: build
|
||||
shell: cmd
|
||||
env:
|
||||
PYTORCH_FINAL_PACKAGE_DIR: C:/${{ github.run_id }}/build-results/
|
||||
BRANCH: ${{ steps.parse-ref.outputs.branch }}
|
||||
BUILD_WHEEL: 1
|
||||
MAX_JOBS: 8
|
||||
PYTHON_VERSION: "3.12"
|
||||
SCCACHE_BUCKET: "ossci-compiler-cache"
|
||||
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
|
||||
SCCACHE_REGION: us-east-1
|
||||
VC_PRODUCT: "BuildTools"
|
||||
VC_VERSION: ""
|
||||
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
USE_CUDA: '0'
|
||||
USE_XPU: '0'
|
||||
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
|
||||
run: |
|
||||
cd pytorch
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
|
||||
powershell -ExecutionPolicy Bypass -File ".ci/pytorch/win-arm64-build.ps1"
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4.4.0
|
||||
if: always()
|
||||
with:
|
||||
name: torch-wheel-win-arm64-py3-12
|
||||
retention-days: 14
|
||||
if-no-files-found: error
|
||||
path: C:\${{ github.run_id }}\build-results
|
||||
|
||||
test:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
strategy:
|
||||
fail-fast: false
|
||||
runs-on: "windows-11-arm64-preview"
|
||||
needs: build
|
||||
steps:
|
||||
- name: Enable long paths
|
||||
shell: cmd
|
||||
run: |
|
||||
git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
|
||||
git config --system core.longpaths true
|
||||
|
||||
- name: Git checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: pytorch
|
||||
submodules: recursive
|
||||
|
||||
- name: Bootstrap Python
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
|
||||
|
||||
- name: Bootstrap Rust
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
|
||||
|
||||
- name: Get workflow job id
|
||||
shell: bash
|
||||
id: get-job-id
|
||||
run: |
|
||||
set -eux
|
||||
python pytorch/.github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Download Build Artifacts
|
||||
uses: actions/download-artifact@v4.1.7
|
||||
with:
|
||||
name: torch-wheel-win-arm64-py3-12
|
||||
path: C:\${{ github.run_id }}\build-results
|
||||
|
||||
- name: Test
|
||||
id: test
|
||||
shell: cmd
|
||||
env:
|
||||
USE_CUDA: '0'
|
||||
INSTALL_WINDOWS_SDK: 1
|
||||
PYTHON_VERSION: "3.12"
|
||||
VC_PRODUCT: "BuildTools"
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
GITHUB_REPOSITORY: ${{ github.repository }}
|
||||
GITHUB_WORKFLOW: ${{ github.workflow }}
|
||||
GITHUB_JOB: ${{ github.job }}
|
||||
GITHUB_RUN_ID: ${{ github.run_id }}
|
||||
GITHUB_RUN_NUMBER: ${{ github.run_number }}
|
||||
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
|
||||
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
|
||||
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
|
||||
PYTORCH_FINAL_PACKAGE_DIR: C:/${{ github.run_id }}/build-results/
|
||||
run: |
|
||||
mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
|
||||
call pytorch/.ci/pytorch/windows/arm64/bootstrap_tests.bat
|
||||
set GIT_BASH=C:\Program Files\Git\usr\bin\bash.exe
|
||||
"%GIT_BASH%" -c "bash --noprofile --norc .ci/pytorch/win-arm64-test.sh"
|
||||
@ -294,14 +294,12 @@ Install PyTorch
|
||||
|
||||
```bash
|
||||
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
|
||||
python -m pip install -r requirements-build.txt
|
||||
python -m pip install --no-build-isolation -v -e .
|
||||
```
|
||||
|
||||
**On macOS**
|
||||
|
||||
```bash
|
||||
python -m pip install -r requirements-build.txt
|
||||
python -m pip install --no-build-isolation -v -e .
|
||||
```
|
||||
|
||||
|
||||
@ -14,7 +14,9 @@
|
||||
#include <ATen/cpu/FlushDenormal.h>
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
|
||||
#include <fbgemm/Fbgemm.h>
|
||||
C10_DIAGNOSTIC_POP()
|
||||
#endif // USE_FBGEMM
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE)
|
||||
#include <cpuinfo.h>
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/CachingDeviceAllocator.h>
|
||||
#include <c10/core/DeviceType.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
|
||||
@ -73,27 +72,6 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
|
||||
// original device index that was active before the change.
|
||||
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);
|
||||
|
||||
TORCH_API inline void emptyCache() {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
at::getDeviceAllocator(device_type)->emptyCache();
|
||||
}
|
||||
|
||||
TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
|
||||
c10::DeviceIndex device_index) {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
|
||||
}
|
||||
|
||||
TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
|
||||
}
|
||||
|
||||
TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
|
||||
}
|
||||
|
||||
} // namespace at::accelerator
|
||||
|
||||
namespace at {
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
#include <ATen/cuda/CUDAGraph.h>
|
||||
#include <ATen/cuda/Exceptions.h>
|
||||
#include <ATen/Functions.h>
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
#include <c10/cuda/CUDAFunctions.h>
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
|
||||
#include <ATen/Tensor.h>
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
#include <c10/cuda/CUDAGraphsC10Utils.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <c10/util/flat_hash_map.h>
|
||||
|
||||
@ -258,7 +258,7 @@ DECLARE_HOST_ALLOCATOR(
|
||||
CUDACachingHostAllocator,
|
||||
CUDACachingHostAllocatorImpl,
|
||||
raw_local_deleter,
|
||||
caching_host_allocator);
|
||||
caching_host_allocator)
|
||||
|
||||
REGISTER_HOST_ALLOCATOR(at::kCUDA, &caching_host_allocator)
|
||||
|
||||
|
||||
@ -158,6 +158,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
|
||||
OP_DECOMPOSE(kron);
|
||||
OP_DECOMPOSE(l1_loss);
|
||||
m.impl("layer_norm", native::layer_norm_symint);
|
||||
m.impl("_fused_rms_norm", native::rms_norm_composite);
|
||||
OP_DECOMPOSE2(ldexp, Tensor);
|
||||
OP_DECOMPOSE2(less_equal, Tensor );
|
||||
OP_DECOMPOSE2(less, Tensor );
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/CachingDeviceAllocator.h>
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/core/DeviceType.h>
|
||||
|
||||
// Use of c10::hip namespace here makes hipification easier, because
|
||||
@ -10,10 +10,10 @@ namespace c10::hip {
|
||||
// Takes a valid HIPAllocator (of any sort) and turns it into
|
||||
// an allocator pretending to be a CUDA allocator. See
|
||||
// Note [Masquerading as CUDA]
|
||||
class HIPAllocatorMasqueradingAsCUDA final : public DeviceAllocator {
|
||||
DeviceAllocator* allocator_;
|
||||
class HIPAllocatorMasqueradingAsCUDA final : public Allocator {
|
||||
Allocator* allocator_;
|
||||
public:
|
||||
explicit HIPAllocatorMasqueradingAsCUDA(DeviceAllocator* allocator)
|
||||
explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator)
|
||||
: allocator_(allocator) {}
|
||||
DataPtr allocate(size_t size) override {
|
||||
DataPtr r = allocator_->allocate(size);
|
||||
@ -26,24 +26,6 @@ public:
|
||||
void copy_data(void* dest, const void* src, std::size_t count) const final {
|
||||
allocator_->copy_data(dest, src, count);
|
||||
}
|
||||
bool initialized() override {
|
||||
return allocator_->initialized();
|
||||
}
|
||||
void emptyCache(MempoolId_t mempool_id = {0, 0}) {
|
||||
allocator_->emptyCache(mempool_id);
|
||||
}
|
||||
void recordStream(const DataPtr& ptr, c10::Stream stream) {
|
||||
allocator_->recordStream(ptr, stream);
|
||||
}
|
||||
CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) {
|
||||
return allocator_->getDeviceStats(device);
|
||||
}
|
||||
void resetAccumulatedStats(c10::DeviceIndex device) {
|
||||
allocator_->resetAccumulatedStats(device);
|
||||
}
|
||||
void resetPeakStats(c10::DeviceIndex device) {
|
||||
allocator_->resetPeakStats(device);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace c10::hip
|
||||
|
||||
@ -4,9 +4,8 @@
|
||||
namespace c10 { namespace hip {
|
||||
namespace HIPCachingAllocatorMasqueradingAsCUDA {
|
||||
|
||||
static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
|
||||
|
||||
Allocator* get() {
|
||||
static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
|
||||
return &allocator;
|
||||
}
|
||||
|
||||
@ -14,9 +13,5 @@ void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsC
|
||||
HIPCachingAllocator::recordStream(ptr, stream.hip_stream());
|
||||
}
|
||||
|
||||
// Register this HIP allocator as CUDA allocator to enable access through both
|
||||
// c10::GetAllocator(kCUDA) and c10::getDeviceAllocator(kCUDA) APIs
|
||||
REGISTER_ALLOCATOR(kCUDA, &allocator)
|
||||
|
||||
} // namespace HIPCachingAllocatorMasqueradingAsCUDA
|
||||
}} // namespace c10::hip
|
||||
|
||||
@ -36,8 +36,10 @@
|
||||
#endif
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
|
||||
#include <fbgemm/Fbgemm.h>
|
||||
#include <fbgemm/FbgemmConvert.h>
|
||||
C10_DIAGNOSTIC_POP()
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
@ -14,8 +14,10 @@
|
||||
#include <c10/util/Half.h>
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
|
||||
#include <fbgemm/Fbgemm.h>
|
||||
#include <fbgemm/FbgemmConvert.h>
|
||||
C10_DIAGNOSTIC_POP()
|
||||
#else
|
||||
#include <caffe2/perfkernels/embedding_lookup_idx.h>
|
||||
#endif
|
||||
|
||||
@ -25,9 +25,11 @@
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
|
||||
#include <fbgemm/Fbgemm.h>
|
||||
#include <fbgemm/FbgemmFP16.h>
|
||||
#include <fbgemm/QuantUtils.h>
|
||||
C10_DIAGNOSTIC_POP()
|
||||
#endif // USE_FBGEMM
|
||||
|
||||
namespace caffe2 {
|
||||
@ -409,7 +411,7 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) {
|
||||
Tensor fbgemm_linear_fp16_weight_fp32_activation(
|
||||
const Tensor& input,
|
||||
const Tensor& packed_weight,
|
||||
const Tensor& bias) {
|
||||
const std::optional<Tensor>& bias) {
|
||||
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated "
|
||||
"and will be removed in a future PyTorch release.")
|
||||
|
||||
@ -430,7 +432,6 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
|
||||
|
||||
TORCH_CHECK(input.size(input.dim() - 1) == packed_weight_fp16.numRows())
|
||||
TORCH_CHECK(input.dim() >= 2);
|
||||
TORCH_CHECK(bias.dim() == 1);
|
||||
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
|
||||
@ -449,7 +450,12 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
|
||||
output.data_ptr<float>());
|
||||
|
||||
// Add bias term
|
||||
output.add_(bias);
|
||||
c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias);
|
||||
const Tensor& bias_ = *bias_maybe_owned;
|
||||
if (bias_.defined()) {
|
||||
TORCH_CHECK(bias_.dim() == 1);
|
||||
output.add_(bias_);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
@ -551,7 +557,7 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) {
|
||||
Tensor fbgemm_linear_fp16_weight_fp32_activation(
|
||||
const Tensor& input,
|
||||
const Tensor& packed_weight,
|
||||
const Tensor& bias) {
|
||||
const std::optional<Tensor>& bias) {
|
||||
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated "
|
||||
"and will be removed in a future PyTorch release.")
|
||||
|
||||
|
||||
@ -4,9 +4,11 @@
|
||||
#include <c10/core/QScheme.h>
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
|
||||
#include <fbgemm/Fbgemm.h>
|
||||
#include <fbgemm/FbgemmSparse.h>
|
||||
#include <ATen/native/ao_sparse/quantized/cpu/packed_params.h>
|
||||
C10_DIAGNOSTIC_POP()
|
||||
|
||||
|
||||
namespace ao::sparse {
|
||||
|
||||
@ -6,7 +6,9 @@
|
||||
#include <c10/util/llvmMathExtras.h>
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
|
||||
#include <fbgemm/Fbgemm.h>
|
||||
C10_DIAGNOSTIC_POP()
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
|
||||
@ -11,25 +11,11 @@
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
// TODO(lufang): Tensor.item() on AMD HIP is not synced in the Recsys models.
|
||||
// This is just a short term workaround. Issue is tracked as FBA-388 on the AMD side.
|
||||
namespace {
|
||||
bool use_sync_mode() {
|
||||
static const bool sync_mode = c10::utils::check_env("HIP_DOUBLE_SYNC_ON_LOCAL_SCALE_DENSE") == true;
|
||||
return sync_mode;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
|
||||
Scalar _local_scalar_dense_cuda(const Tensor& self) {
|
||||
Scalar r;
|
||||
TORCH_CHECK(self.numel() > 0, "_local_scalar_dense: Empty tensor not supported");
|
||||
#if defined(USE_ROCM)
|
||||
if (!use_sync_mode()){
|
||||
#endif
|
||||
AT_DISPATCH_V2(
|
||||
self.scalar_type(), "_local_scalar_dense_cuda", AT_WRAP([&] {
|
||||
// Create pinned memory for the scalar value to avoid implicit
|
||||
@ -46,15 +32,6 @@ Scalar _local_scalar_dense_cuda(const Tensor& self) {
|
||||
at::cuda::memcpy_and_sync((void *)value.const_data_ptr<scalar_t>(), self.const_data_ptr<scalar_t>(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream);
|
||||
r = Scalar(*value.const_data_ptr<scalar_t>());
|
||||
}), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
|
||||
#if defined(USE_ROCM)
|
||||
} else {
|
||||
auto cpu_self = self.cpu();
|
||||
AT_DISPATCH_V2(
|
||||
self.scalar_type(), "_local_scalar_dense_hip", AT_WRAP([&] {
|
||||
r = Scalar(*cpu_self.const_data_ptr<scalar_t>());
|
||||
}), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
|
||||
}
|
||||
#endif
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
@@ -261,30 +261,11 @@ std::tuple<Tensor, Tensor, Tensor> math_native_layer_norm(
return outputs;
}

Tensor rms_norm_symint(
std::tuple<Tensor, Tensor> rms_norm_composite(
const Tensor& input,
c10::SymIntArrayRef normalized_shape,
IntArrayRef normalized_shape,
const std::optional<Tensor>& weight_opt /* optional */,
std::optional<double> eps) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
_check_rms_norm_inputs_symint(input, normalized_shape, weight);

#ifdef USE_MPS
if (input.device().type() == DeviceType::MPS && weight_opt.has_value()) {
const Tensor weight = weight_opt.value();
const bool any_nested = input.is_nested() || weight.is_nested();
const bool any_inputs_require_grad = input.requires_grad() || weight.requires_grad();
const bool is_input_fp = isFloatingType(input.scalar_type());
const bool is_weight_fp = isFloatingType(weight.scalar_type());

if (!(GradMode::is_enabled() && any_inputs_require_grad) && !any_nested && is_input_fp && is_weight_fp) {
auto eps_val = eps.value_or(std::numeric_limits<double>::epsilon());
return at::_fused_rms_norm(input.contiguous(), normalized_shape.size(), weight.contiguous(), eps_val);
}
}
#endif

std::vector<int64_t> dims_to_reduce;
for (const auto i : c10::irange(normalized_shape.size())) {
@@ -321,10 +302,67 @@ Tensor rms_norm_symint(
upcasted_result = upcasted_result.mul(weight_opt.value());
}

return upcasted_result;
// if nested do not make contiguous
if(input.is_nested() || (weight_opt.has_value() && weight_opt.value().is_nested())){
return std::make_tuple(upcasted_result, rqrst_input);
}

if(input.suggest_memory_format() == c10::MemoryFormat::ChannelsLast || input.suggest_memory_format() == c10::MemoryFormat::ChannelsLast3d){
return std::make_tuple(upcasted_result, rqrst_input);
}

return std::make_tuple(upcasted_result.contiguous(), rqrst_input.contiguous());
});

return result.type_as(input);

return std::make_tuple(
std::get<0>(result).type_as(input), // Cast normalized result to original input type
std::get<1>(result) // rsqrt_val
);
}


Tensor rms_norm_symint(
const Tensor& input,
c10::SymIntArrayRef normalized_shape,
const std::optional<Tensor>& weight_opt /* optional */,
const std::optional<double> eps) {

c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
_check_rms_norm_inputs_symint(input, normalized_shape, weight);

// composite fallback for channels last
if(input.suggest_memory_format() == c10::MemoryFormat::ChannelsLast || input.suggest_memory_format() == c10::MemoryFormat::ChannelsLast3d){
return std::get<0>(rms_norm_composite(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps));
}

// composite fallback for complex datatypes
if(input.is_complex()){
return std::get<0>(rms_norm_composite(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps));
}

if (weight_opt.has_value() && weight_opt.value().defined() && weight_opt.value().dtype() != input.dtype()) {
TORCH_WARN_ONCE(
"Mismatch dtype between input and module: input dtype = ", input.dtype(),
", module dtype = ", weight_opt.value().dtype(), ", Can not dispatch to fused implementation"
);
return std::get<0>(rms_norm_composite(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps));
}

#ifdef USE_MPS
if (input.device().type() == DeviceType::MPS && weight_opt.has_value()) {
const Tensor weight = weight_opt.value();
const bool any_inputs_require_grad = input.requires_grad() || weight.requires_grad();

if (!(GradMode::is_enabled() && any_inputs_require_grad)) {
return std::get<0>(at::_fused_rms_norm(input.contiguous(), IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps));
}
}

if (input.device().type() == DeviceType::MPS){
return std::get<0>(rms_norm_composite(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps));
}
#endif
return std::get<0>(at::_fused_rms_norm(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps));
}

} // namespace at::native
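From Python, the path rewritten above is reached through `torch.nn.functional.rms_norm`; `rms_norm_symint` now either dispatches to the fused `_fused_rms_norm` kernel or routes through the `rms_norm_composite` fallback (nested or channels-last inputs, complex dtypes, or a weight whose dtype differs from the input). A small sketch, assuming a build that includes this change:

```python
import torch
import torch.nn.functional as F

x = torch.randn(2, 8, 16)
w = torch.ones(16)

# Typical case: eligible for the fused kernel on supporting backends.
y = F.rms_norm(x, normalized_shape=(16,), weight=w, eps=1e-6)

# Channels-last input: per the hunk above, this takes the composite fallback.
x_cl = torch.randn(2, 3, 4, 5).to(memory_format=torch.channels_last)
y_cl = F.rms_norm(x_cl, normalized_shape=(5,), weight=torch.ones(5), eps=1e-6)

print(y.shape, y_cl.shape)  # torch.Size([2, 8, 16]) torch.Size([2, 3, 4, 5])
```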
@@ -106,6 +106,12 @@ void layer_norm_cpu_out(
int64_t M,
int64_t N);

std::tuple<Tensor, Tensor> rms_norm_composite(
const Tensor& input,
IntArrayRef normalized_shape,
const std::optional<Tensor>& weight_opt /* optional */,
std::optional<double> eps);

Tensor rms_norm_symint(
const Tensor& input,
c10::SymIntArrayRef normalized_shape,
@ -145,8 +145,6 @@ MPSGraphTensorData* getMPSGraphTensorData(MPSGraph* mpsGraph, MPSStream* mpsStre
|
||||
MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar);
|
||||
|
||||
MPSGraph* make_mps_graph();
|
||||
void printTensorNDArray(const TensorBase& t);
|
||||
MPSNDArray* ndArrayFromTensor(const TensorBase& tensor, MPSShape* shape, MPSDataType mpsType);
|
||||
|
||||
MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph* mpsGraph, MPSDataType dataType);
|
||||
MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph* mpsGraph, MPSDataType dataType, MPSShape* mpsShape);
|
||||
|
||||
@ -377,36 +377,6 @@ MPSShape* getMPSShape(IntArrayRef sizes, c10::MemoryFormat memory_format) {
|
||||
return [NSArray arrayWithObjects:numbers.data() count:numbers.size()];
|
||||
}
|
||||
|
||||
void printTensorNDArray(const TensorBase& t) {
|
||||
if (!t.is_mps())
|
||||
return;
|
||||
if (t.numel() == 0)
|
||||
return;
|
||||
// Get shape and data type
|
||||
auto selfShape = getMPSShape(t);
|
||||
auto selfDType = getMPSDataType(t.scalar_type());
|
||||
|
||||
// Initialize data
|
||||
id<MTLBuffer> selfBuf = getMTLBufferStorage(t);
|
||||
MPSGraphTensorData* tdata = [[[MPSGraphTensorData alloc] initWithMTLBuffer:selfBuf shape:selfShape
|
||||
dataType:selfDType] autorelease];
|
||||
C10_CLANG_DIAGNOSTIC_PUSH()
|
||||
#if C10_CLANG_HAS_WARNING("-Wobjc-method-access")
|
||||
C10_CLANG_DIAGNOSTIC_IGNORE("-Wobjc-method-access")
|
||||
#endif
|
||||
[tdata printNDArray];
|
||||
C10_CLANG_DIAGNOSTIC_POP()
|
||||
}
|
||||
|
||||
MPSNDArray* ndArrayFromTensor(const TensorBase& tensor, MPSShape* shape, MPSDataType mpsType) {
|
||||
id<MTLBuffer> buffer = getMTLBufferStorage(tensor);
|
||||
MPSGraphTensorData* tmpGraphTensorData = [[[MPSGraphTensorData alloc] initWithMTLBuffer:buffer
|
||||
shape:shape
|
||||
dataType:mpsType] autorelease];
|
||||
|
||||
return [tmpGraphTensorData mpsndarray];
|
||||
}
|
||||
|
||||
static std::vector<int64_t> getSortedStrides(const IntArrayRef& s) {
|
||||
std::vector<int64_t> idx(s.size());
|
||||
iota(idx.begin(), idx.end(), 0);
|
||||
@ -457,12 +427,22 @@ static MPSNDArray* permuteNDArray(MPSNDArray* inArray, const std::vector<int64_t
|
||||
return result;
|
||||
}
|
||||
|
||||
// Should be called before initWithBuffer to prevent hard crashes with
|
||||
// '[MPSNDArray initWithDevice:descriptor:isTextureBacked:] Error: NDArray dimension length > INT_MAX'
|
||||
static void check_mps_shape(MPSShape* shape) {
|
||||
for (NSNumber* elem in shape) {
|
||||
const auto val = [elem longValue];
|
||||
TORCH_CHECK(val <= std::numeric_limits<int32_t>::max(), "MPSGaph does not support tensor dims larger than INT_MAX");
|
||||
}
|
||||
}
|
||||
|
||||
MPSNDArray* getMPSNDArray(const TensorBase& t, MPSShape* sizes, MPSShape* strides) {
|
||||
id<MTLBuffer> srcBuf = getMTLBufferStorage(t);
|
||||
|
||||
MPSDataType mpsDataType = getMPSDataType(t.scalar_type());
|
||||
MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:mpsDataType shape:sizes];
|
||||
srcTensorDesc.preferPackedRows = YES;
|
||||
check_mps_shape(sizes);
|
||||
MPSNDArray* srcNDArray = [[[MPSNDArray alloc] initWithBuffer:srcBuf
|
||||
offset:t.storage_offset() * t.element_size()
|
||||
descriptor:srcTensorDesc] autorelease];
|
||||
@ -572,9 +552,9 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
|
||||
// Tensor is contiguous and has no storage offset.
|
||||
// Wrap it directly inside MPSGraphTensorData
|
||||
if ((_tensor.is_contiguous() && !_tensor.storage_offset()) || !useMPSStridedAPI || !is_macOS_15_0_or_newer) {
|
||||
_value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf
|
||||
shape:mpsShape_ ? mpsShape_ : getMPSShape(_tensor)
|
||||
dataType:dataType] autorelease];
|
||||
auto shape = mpsShape_ ? mpsShape_ : getMPSShape(_tensor);
|
||||
check_mps_shape(shape);
|
||||
_value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf shape:shape dataType:dataType] autorelease];
|
||||
} else {
|
||||
IntArrayRef view_shape;
|
||||
if (mpsShape_) {
|
||||
@ -583,8 +563,11 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
|
||||
|
||||
MPSShape* mpsShape = getMPSShape(_tensor);
|
||||
MPSShape* mpsStrides = getMPSShape(_tensor.strides());
|
||||
check_mps_shape(mpsShape);
|
||||
|
||||
auto storage_numel = src.storage().nbytes() / src.element_size();
|
||||
TORCH_CHECK(storage_numel <= std::numeric_limits<int32_t>::max(),
|
||||
"MPSGaph does not support tensor dims larger than INT_MAX");
|
||||
MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:dataType
|
||||
shape:@[ @(storage_numel) ]];
|
||||
srcTensorDesc.preferPackedRows = YES;
|
||||
|
||||
@ -62,15 +62,12 @@ static Tensor& fill_scalar_mps_impl(Tensor& self, const Scalar& value) {
|
||||
return self;
|
||||
}
|
||||
|
||||
// returns false if tensor cannot be filled with fillBuffer()
|
||||
static bool fill_mps_tensor_(Tensor& self, uint8_t value) {
|
||||
if (self.is_contiguous()) {
|
||||
MPSStream* stream = getCurrentMPSStream();
|
||||
auto storage_byte_offset = self.storage_offset() * self.itemsize();
|
||||
stream->fill(mps::getMTLBufferStorage(self), value, self.nbytes(), storage_byte_offset);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
static Tensor& fill_mps_tensor_(Tensor& self, uint8_t value) {
|
||||
TORCH_INTERNAL_ASSERT(self.is_contiguous());
|
||||
const auto stream = getCurrentMPSStream();
|
||||
auto storage_byte_offset = self.storage_offset() * self.itemsize();
|
||||
stream->fill(mps::getMTLBufferStorage(self), value, self.nbytes(), storage_byte_offset);
|
||||
return self;
|
||||
}
|
||||
|
||||
Tensor& fill_scalar_mps(Tensor& self, const Scalar& value) {
|
||||
@ -89,8 +86,20 @@ Tensor& fill_scalar_mps(Tensor& self, const Scalar& value) {
|
||||
return self;
|
||||
}
|
||||
// check if it's possible to use fillBuffer() to fill the Tensor's storage
|
||||
if (value.toDouble() == 0.0 && fill_mps_tensor_(self, 0) == true)
|
||||
return self;
|
||||
if (self.is_contiguous()) {
|
||||
if (value.toDouble() == 0.0) {
|
||||
return fill_mps_tensor_(self, 0);
|
||||
}
|
||||
if (self.scalar_type() == kBool) {
|
||||
return fill_mps_tensor_(self, value.toBool());
|
||||
}
|
||||
if (self.scalar_type() == kByte) {
|
||||
return fill_mps_tensor_(self, value.toByte());
|
||||
}
|
||||
if (self.scalar_type() == kChar) {
|
||||
return fill_mps_tensor_(self, value.toChar());
|
||||
}
|
||||
}
|
||||
|
||||
return fill_scalar_mps_impl(self, value);
|
||||
}
|
||||
@ -101,8 +110,6 @@ Tensor& fill_tensor_mps_(Tensor& self, const Tensor& value) {
|
||||
value.dim(),
|
||||
" dimensions.");
|
||||
Scalar scalar_value = value.item();
|
||||
if (scalar_value.toDouble() == 0.0 && fill_mps_tensor_(self, 0) == true)
|
||||
return self;
|
||||
return fill_scalar_mps(self, scalar_value);
|
||||
}
|
||||
|
||||
|
||||
@ -19,7 +19,14 @@ static auto& lib = MetalShaderLibrary::getBundledLibrary();
|
||||
#include <ATen/native/mps/RMSNorm_metallib.h>
|
||||
#endif
|
||||
|
||||
Tensor _fused_rms_norm_mps(const Tensor& input, const int64_t normalized_ndim, const Tensor& weight, const double eps) {
|
||||
std::tuple<Tensor, Tensor> _fused_rms_norm_mps(const Tensor& input,
|
||||
IntArrayRef normalized_shape,
|
||||
const std::optional<Tensor>& weight_opt,
|
||||
const std::optional<double> eps) {
|
||||
const Tensor weight = weight_opt.value().contiguous();
|
||||
const int64_t normalized_ndim = normalized_shape.size();
|
||||
auto eps_val = eps.value_or(std::numeric_limits<double>::epsilon());
|
||||
|
||||
TORCH_CHECK(input.is_contiguous() && weight.is_contiguous(), "Expected contiguous input and weight tensors");
|
||||
auto output = at::empty_like(input);
|
||||
const auto input_shape = input.sizes();
|
||||
@ -41,7 +48,7 @@ Tensor _fused_rms_norm_mps(const Tensor& input, const int64_t normalized_ndim, c
|
||||
const std::string kernel = fmt::format("{}_{}", name, scalarToMetalTypeString(output));
|
||||
id<MTLComputePipelineState> rms_norm_pso = lib.getPipelineStateForFunc(kernel);
|
||||
[computeEncoder setComputePipelineState:rms_norm_pso];
|
||||
mtl_setArgs(computeEncoder, input, weight, output, eps, N, 1);
|
||||
mtl_setArgs(computeEncoder, input, weight, output, eps_val, N, 1);
|
||||
|
||||
const auto maxThreadsPerGroup = static_cast<size_t>([rms_norm_pso maxTotalThreadsPerThreadgroup]);
|
||||
size_t threadgroup_size = maxThreadsPerGroup;
|
||||
@ -58,7 +65,7 @@ Tensor _fused_rms_norm_mps(const Tensor& input, const int64_t normalized_ndim, c
|
||||
}
|
||||
});
|
||||
|
||||
return output;
|
||||
return std::make_tuple(output, Tensor());
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
@ -1067,6 +1067,7 @@
|
||||
CUDA: baddbmm_out_cuda
|
||||
MPS: baddbmm_out_mps
|
||||
XPU: baddbmm_out_xpu
|
||||
MTIA: baddbmm_out_mtia
|
||||
SparseCsrCUDA: baddbmm_out_sparse_csr_cuda
|
||||
|
||||
- func: baddbmm.dtype(Tensor self, Tensor batch1, Tensor batch2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1) -> Tensor
|
||||
@ -1376,6 +1377,7 @@
|
||||
CUDA: bmm_out_cuda
|
||||
MPS: bmm_out_mps
|
||||
XPU: bmm_out_xpu
|
||||
MTIA: bmm_out_mtia
|
||||
SparseCPU: bmm_out_sparse_cpu
|
||||
SparseCUDA: bmm_out_sparse_cuda
|
||||
SparseCsrCUDA: bmm_out_sparse_csr_cuda
|
||||
@ -3314,9 +3316,15 @@
|
||||
dispatch:
|
||||
CompositeImplicitAutograd: rms_norm_symint
|
||||
|
||||
- func: _fused_rms_norm(Tensor input, int normalized_shape_ndim, Tensor weight, float eps) -> Tensor
|
||||
- func: _fused_rms_norm(Tensor input, int[] normalized_shape, Tensor? weight, float? eps) -> (Tensor, Tensor)
|
||||
dispatch:
|
||||
CUDA: _fused_rms_norm_cuda
|
||||
MPS: _fused_rms_norm_mps
|
||||
CompositeImplicitAutograd: rms_norm_composite
|
||||
|
||||
- func: _fused_rms_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor rstd, Tensor? weight, bool[2] output_mask) -> (Tensor, Tensor)
|
||||
dispatch:
|
||||
CUDA: _fused_rms_norm_backward_cuda
|
||||
|
||||
- func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
|
||||
variants: function, method
|
||||
@ -3432,7 +3440,7 @@
|
||||
|
||||
- func: _wrapped_quantized_linear_prepacked(Tensor input, Tensor input_scale, Tensor input_zero_point, Tensor packed_weight, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor
|
||||
|
||||
- func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
|
||||
- func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor? bias) -> Tensor
|
||||
|
||||
- func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
|
||||
|
||||
@ -7059,6 +7067,7 @@
|
||||
CUDA: addmm_out_cuda
|
||||
MPS: addmm_out_mps
|
||||
XPU: addmm_out_xpu
|
||||
MTIA: addmm_out_mtia
|
||||
SparseCPU: addmm_out_sparse_dense_cpu
|
||||
SparseCUDA: addmm_out_sparse_dense_cuda
|
||||
SparseCsrCPU: addmm_out_sparse_compressed_cpu
|
||||
@ -8962,7 +8971,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA: eq_Scalar_out
|
||||
CPU, CUDA, MTIA: eq_Scalar_out
|
||||
MPS: eq_scalar_out_mps
|
||||
QuantizedCPU: eq_out_quantized_cpu
|
||||
tags: pointwise
|
||||
@ -8981,7 +8990,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA: eq_Tensor_out
|
||||
CPU, CUDA, MTIA: eq_Tensor_out
|
||||
MPS: eq_tensor_out_mps
|
||||
QuantizedCPU: eq_out_quantized_cpu
|
||||
tags: pointwise
|
||||
@ -9374,7 +9383,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA: addcmul_out
|
||||
CPU, CUDA, MTIA: addcmul_out
|
||||
MPS: addcmul_out_mps
|
||||
tags: pointwise
|
||||
|
||||
@ -9395,7 +9404,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA: addcdiv_out
|
||||
CPU, CUDA, MTIA: addcdiv_out
|
||||
MPS: addcdiv_out_mps
|
||||
tags: pointwise
|
||||
|
||||
|
||||
@ -7,11 +7,13 @@
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
|
||||
#include <fbgemm/Fbgemm.h>
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Winconsistent-missing-destructor-override")
|
||||
#include <fbgemm/FbgemmFP16.h>
|
||||
C10_DIAGNOSTIC_POP()
|
||||
#include <fbgemm/QuantUtils.h>
|
||||
C10_DIAGNOSTIC_POP()
|
||||
|
||||
// The struct for the packed weight matrix (PackBMatrix) and the corresponding
|
||||
// column offsets used for the fully connect layer, which are both prepared in
|
||||
|
||||
@ -888,7 +888,7 @@ class QLinearUnpackedDynamicFp16 final {
|
||||
static at::Tensor run(
|
||||
at::Tensor input,
|
||||
const at::Tensor& weight,
|
||||
const at::Tensor& bias) {
|
||||
const std::optional<at::Tensor>& bias) {
|
||||
// We make a strong guarantee that models using these operators will have
|
||||
// the same numerics across different machines. Therefore, we do not provide
|
||||
// a fallback path and rather fail loudly if we cannot run FBGEMM.
|
||||
@ -908,7 +908,7 @@ class QLinearUnpackedDynamicFp16 final {
|
||||
static at::Tensor meta(
|
||||
at::Tensor input,
|
||||
const at::Tensor& weight,
|
||||
const at::Tensor& bias) {
|
||||
const std::optional<at::Tensor>& bias) {
|
||||
// We make a strong guarantee that models using these operators will have
|
||||
// the same numerics across different machines. Therefore, we do not provide
|
||||
// a fallback path and rather fail loudly if we cannot run FBGEMM.
|
||||
@ -929,7 +929,7 @@ class QLinearUnpackedDynamicFp16 final {
|
||||
static at::Tensor run(
|
||||
at::Tensor /* input */,
|
||||
const at::Tensor& weight,
|
||||
const at::Tensor& bias) {
|
||||
const std::optional<at::Tensor>& bias) {
|
||||
// We make a strong guarantee that models using these operators will have
|
||||
// the same numerics across different machines. Therefore, we do not provide
|
||||
// a fallback path and rather fail loudly if we cannot run FBGEMM.
|
||||
@ -940,7 +940,7 @@ class QLinearUnpackedDynamicFp16 final {
|
||||
static at::Tensor meta(
|
||||
at::Tensor /* input */,
|
||||
const at::Tensor& weight,
|
||||
const at::Tensor& bias) {
|
||||
const std::optional<at::Tensor>& bias) {
|
||||
TORCH_CHECK(
|
||||
false, "This PyTorch installation was not built with FBGEMM operators");
|
||||
}
|
||||
|
||||
@ -142,7 +142,7 @@ TORCH_LIBRARY(quantized, m) {
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16_unpacked_weight(Tensor X, Tensor weight, Tensor bias) -> Tensor Y"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16_unpacked_weight(Tensor X, Tensor weight, Tensor? bias) -> Tensor Y"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_leaky_relu(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i, float negative_slope) -> Tensor Y"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_tanh(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y"), {at::Tag::pt2_compliant_tag});
|
||||
|
||||
@ -242,7 +242,11 @@ __global__ void coalesceValuesKernel(
|
||||
// `if constexpr` when CUDA codes will be compiled under C++-17, see
|
||||
// gh-56055 for blockers.
|
||||
template<typename Dtype>
|
||||
#ifdef USE_ROCM
|
||||
C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE_STATIC*4)
|
||||
#else
|
||||
C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE*4)
|
||||
#endif
|
||||
__global__ void coalesceValuesKernel(
|
||||
int64_t *segment_offsets, int64_t *value_indices,
|
||||
bool *values, bool *newValues,
|
||||
|
||||
@ -32,7 +32,9 @@
|
||||
#endif
|
||||
|
||||
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
|
||||
#include <cutlass/numeric_types.h>
|
||||
C10_DIAGNOSTIC_POP()
|
||||
|
||||
|
||||
#include <flash.h>
|
||||
|
||||
@ -75,7 +75,7 @@ Tensor TensorMaker::make_tensor() {
|
||||
}
|
||||
auto storage_size = size * itemsize;
|
||||
if (storage_offset_) {
|
||||
storage_size += storage_offset_.value();
|
||||
storage_size += storage_offset_.value() * itemsize;
|
||||
}
|
||||
return storage_size;
|
||||
}
|
||||
|
||||
@ -519,6 +519,15 @@ TEST(BasicTest, BasicStdTestCPU) {
|
||||
}
|
||||
|
||||
TEST(BasicTest, TestForBlobResizeCPU) {
|
||||
// Checks that for_blob can correctly create tensors with non-empty offset and resize them
|
||||
std::array<int32_t, 6> storage;
|
||||
std::iota(storage.begin(), storage.end(), 1);
|
||||
auto t = at::for_blob(storage.data(), {3,}).storage_offset(3).options(c10::TensorOptions(kInt)).make_tensor();
|
||||
auto te = *at::expand_size(t, {3, 3});
|
||||
ASSERT_EQ(te[1][1].item<int32_t>(), 5);
|
||||
}
|
||||
|
||||
TEST(BasicTest, TestForBlobStridesResizeCPU) {
|
||||
// Checks that for_blob can correctly create tensors with non-empty offset and resize them
|
||||
std::array<int32_t, 6> storage;
|
||||
std::iota(storage.begin(), storage.end(), 1);
|
||||
|
||||
@ -20,4 +20,8 @@
|
||||
#error "CAFFE2_STATIC_LINK_CUDA should not be visible in public headers"
|
||||
#endif
|
||||
|
||||
auto main() -> int {}
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
TEST(VerifyApiVisibility, Test) {
|
||||
ASSERT_EQ(1, 1);
|
||||
}
|
||||
|
||||
@ -3264,6 +3264,12 @@ def parse_args(args=None):
|
||||
instead of deleting it and creating a new one.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--caching-precompile",
|
||||
action="store_true",
|
||||
help="Enables caching precompile, serializing artifacts to DynamoCache between runs",
|
||||
)
|
||||
|
||||
group_latency = parser.add_mutually_exclusive_group()
|
||||
group_latency.add_argument(
|
||||
"--cold-start-latency",
|
||||
@ -3414,6 +3420,29 @@ def parse_args(args=None):
|
||||
return parser.parse_args(args)
|
||||
|
||||
|
||||
def process_caching_precompile():
    """
    After every process_entry, save precompile artifacts to DynamoCache
    """
    assert torch._dynamo.config.caching_precompile, (
        "Caching precompile should be enabled with --caching-precompile"
    )
    from torch._dynamo.precompile_context import PrecompileContext

    # Serialize all callables, clear PrecompileContext
    # TODO: put this under torch.compiler API once ready
    serialized = PrecompileContext.serialize()
    PrecompileContext.clear()
    if serialized is not None:
        artifacts, info = serialized
        print(
            f"Saving {len(info.precompile_dynamo_artifacts)} Precompile Artifact(s)..."
        )
        results = PrecompileContext.deserialize(artifacts)
        assert results is not None
        PrecompileContext.populate_caches(results)
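Outside the benchmark harness, the same machinery can be exercised directly. The sketch below uses only the knobs and `PrecompileContext` calls that appear in this diff; it is a private API, so treat it as an illustration rather than a stable recipe:

```python
import os
import torch

# Mirror what the harness does when --caching-precompile is passed.
os.environ["TORCH_CACHING_PRECOMPILE"] = "1"
torch._dynamo.config.caching_precompile = True

compiled = torch.compile(lambda t: (t * 2).sin())
compiled(torch.randn(8))  # produce precompile artifacts

from torch._dynamo.precompile_context import PrecompileContext

serialized = PrecompileContext.serialize()
PrecompileContext.clear()
if serialized is not None:
    artifacts, info = serialized
    results = PrecompileContext.deserialize(artifacts)
    if results is not None:
        PrecompileContext.populate_caches(results)
```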
|
||||
|
||||
|
||||
def process_entry(rank, runner, original_dir, args):
|
||||
args.rank = rank
|
||||
with maybe_init_distributed(
|
||||
@ -3422,7 +3451,10 @@ def process_entry(rank, runner, original_dir, args):
|
||||
world_size=args.world_size,
|
||||
port=args.distributed_master_port,
|
||||
):
|
||||
return run(runner, args, original_dir)
|
||||
result = run(runner, args, original_dir)
|
||||
if args.caching_precompile:
|
||||
process_caching_precompile()
|
||||
return result
|
||||
|
||||
|
||||
def maybe_fresh_cache(args):
|
||||
@ -3458,6 +3490,10 @@ def main(runner, original_dir=None, args=None):
|
||||
)
|
||||
|
||||
with maybe_fresh_cache(args):
|
||||
if args.caching_precompile:
|
||||
os.environ["TORCH_CACHING_PRECOMPILE"] = "1"
|
||||
torch._dynamo.config.caching_precompile = True
|
||||
|
||||
args.init_distributed = args.only and args.multiprocess
|
||||
if args.init_distributed:
|
||||
# NB: Do NOT query device count before CUDA initialization; we're
|
||||
|
||||
@ -56,7 +56,11 @@ def list_benchmarks():
|
||||
print(f"Available benchmarks: {list(BENCHMARK_REGISTRY.keys())}")
|
||||
|
||||
|
||||
def run_benchmark(benchmark_name: str, should_visualize: bool = False):
|
||||
def run_benchmark(
|
||||
benchmark_name: str,
|
||||
should_visualize: bool = False,
|
||||
compile_mode: str = "max-autotune-no-cudagraphs",
|
||||
):
|
||||
"""Run a specific benchmark."""
|
||||
if benchmark_name not in BENCHMARK_REGISTRY:
|
||||
print(f"Error: Unknown benchmark '{benchmark_name}'")
|
||||
@ -64,10 +68,11 @@ def run_benchmark(benchmark_name: str, should_visualize: bool = False):
|
||||
return False
|
||||
|
||||
print(f"Running benchmark: {benchmark_name}")
|
||||
print(f"Torch compile mode: {compile_mode}")
|
||||
print("=" * 60)
|
||||
|
||||
benchmark_class = BENCHMARK_REGISTRY[benchmark_name]
|
||||
benchmark = benchmark_class()
|
||||
benchmark = benchmark_class(compile_mode)
|
||||
benchmark.benchmark()
|
||||
if should_visualize:
|
||||
benchmark.visualize()
|
||||
@ -75,14 +80,15 @@ def run_benchmark(benchmark_name: str, should_visualize: bool = False):
|
||||
return True
|
||||
|
||||
|
||||
def run_all_benchmarks(should_visualize: bool = False):
|
||||
def run_all_benchmarks(should_visualize: bool = False, compile_mode: str = "default"):
|
||||
"""Run all available benchmarks."""
|
||||
print("Running all benchmarks...")
|
||||
print(f"Torch compile mode: {compile_mode}")
|
||||
print("=" * 60)
|
||||
|
||||
for name, cls in BENCHMARK_REGISTRY.items():
|
||||
print(f"\n{'=' * 20} {name.upper()} {'=' * 20}")
|
||||
benchmark = cls()
|
||||
benchmark = cls(compile_mode)
|
||||
benchmark.benchmark()
|
||||
if should_visualize:
|
||||
benchmark.visualize()
|
||||
@ -124,6 +130,13 @@ Examples:
|
||||
help="Visualize results after running benchmarks",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--compile-mode",
|
||||
choices=["default", "max-autotune-no-cudagraphs"],
|
||||
default="max-autotune-no-cudagraphs",
|
||||
help="Torch compile mode to use (default: default)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Handle list option
|
||||
@ -133,7 +146,7 @@ Examples:
|
||||
|
||||
# Handle all option
|
||||
if args.all:
|
||||
run_all_benchmarks(args.visualize)
|
||||
run_all_benchmarks(args.visualize, args.compile_mode)
|
||||
return
|
||||
|
||||
# Handle specific benchmarks
|
||||
@ -144,7 +157,7 @@ Examples:
|
||||
sys.exit(1)
|
||||
|
||||
for benchmark_name in args.benchmarks:
|
||||
run_benchmark(benchmark_name, args.visualize)
|
||||
run_benchmark(benchmark_name, args.visualize, args.compile_mode)
|
||||
print() # Add spacing between benchmarks
|
||||
|
||||
|
||||
|
||||
@ -9,8 +9,8 @@ import torch.nn.functional as F
|
||||
|
||||
|
||||
class CrossEntropyForward(BenchmarkKernel):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"):
|
||||
super().__init__(compile_mode)
|
||||
self.available_backends = ["eager", "compiled", "quack", "liger"]
|
||||
|
||||
def get_shapes(self) -> tuple[tuple[int, ...], ...]:
|
||||
@ -52,7 +52,8 @@ class CrossEntropyForward(BenchmarkKernel):
|
||||
# More discussion: https://github.com/pytorch/pytorch/issues/158455
|
||||
compiled_cross_entropy = torch.compile(
|
||||
lambda x, target: F.cross_entropy(x, target, reduction="none"),
|
||||
mode="max-autotune-no-cudagraphs",
|
||||
mode=self.compile_mode,
|
||||
fullgraph=True,
|
||||
)
|
||||
return lambda: compiled_cross_entropy(x, target)
|
||||
|
||||
@ -105,8 +106,8 @@ class CrossEntropyForward(BenchmarkKernel):
|
||||
|
||||
|
||||
class CrossEntropyBackward(BenchmarkKernel):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"):
|
||||
super().__init__(compile_mode)
|
||||
self.available_backends = ["eager", "compiled", "quack", "liger"]
|
||||
|
||||
def get_shapes(self) -> tuple[tuple[int, ...], ...]:
|
||||
@ -149,7 +150,8 @@ class CrossEntropyBackward(BenchmarkKernel):
|
||||
|
||||
compiled_cross_entropy = torch.compile(
|
||||
lambda x, target: F.cross_entropy(x, target, reduction="none"),
|
||||
mode="max-autotune-no-cudagraphs",
|
||||
mode=self.compile_mode,
|
||||
fullgraph=True,
|
||||
)
|
||||
loss = compiled_cross_entropy(x, target)
|
||||
return lambda: torch.autograd.grad(
|
||||
@ -192,8 +194,8 @@ class CrossEntropyBackward(BenchmarkKernel):
|
||||
|
||||
|
||||
class SoftmaxForward(BenchmarkKernel):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"):
|
||||
super().__init__(compile_mode)
|
||||
self.available_backends = ["eager", "compiled", "quack", "liger"]
|
||||
|
||||
def get_shapes(self) -> tuple[tuple[int, ...], ...]:
|
||||
@ -229,7 +231,7 @@ class SoftmaxForward(BenchmarkKernel):
|
||||
torch._dynamo.mark_dynamic(x, 0)
|
||||
|
||||
compiled_softmax = torch.compile(
|
||||
lambda x: F.softmax(x, dim=-1), mode="max-autotune-no-cudagraphs"
|
||||
lambda x: F.softmax(x, dim=-1), mode=self.compile_mode, fullgraph=True
|
||||
)
|
||||
return lambda: compiled_softmax(x)
|
||||
|
||||
@ -257,8 +259,8 @@ class SoftmaxForward(BenchmarkKernel):
|
||||
|
||||
|
||||
class SoftmaxBackward(BenchmarkKernel):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"):
|
||||
super().__init__(compile_mode)
|
||||
self.available_backends = ["eager", "compiled", "quack", "liger"]
|
||||
|
||||
def get_shapes(self) -> tuple[tuple[int, ...], ...]:
|
||||
@ -292,7 +294,7 @@ class SoftmaxBackward(BenchmarkKernel):
|
||||
assert kwargs is None
|
||||
x, dy = args
|
||||
compiled_softmax = torch.compile(
|
||||
lambda x: F.softmax(x, dim=-1), mode="max-autotune-no-cudagraphs"
|
||||
lambda x: F.softmax(x, dim=-1), mode=self.compile_mode, fullgraph=True
|
||||
)
|
||||
y = compiled_softmax(x)
|
||||
return lambda: torch.autograd.grad(y, x, grad_outputs=dy, retain_graph=True)
|
||||
@ -327,8 +329,8 @@ class SoftmaxBackward(BenchmarkKernel):
|
||||
|
||||
|
||||
class RMSNormForward(BenchmarkKernel):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"):
|
||||
super().__init__(compile_mode)
|
||||
self.available_backends = ["eager", "compiled", "quack", "liger"]
|
||||
|
||||
def get_shapes(self) -> tuple[tuple[int, ...], ...]:
|
||||
@ -372,7 +374,7 @@ class RMSNormForward(BenchmarkKernel):
|
||||
torch._dynamo.mark_dynamic(x, 0)
|
||||
|
||||
compiled_rms_norm = torch.compile(
|
||||
self.rms_norm_ref, mode="max-autotune-no-cudagraphs"
|
||||
self.rms_norm_ref, mode=self.compile_mode, fullgraph=True
|
||||
)
|
||||
return lambda: compiled_rms_norm(x, w)
|
||||
|
||||
@ -402,8 +404,8 @@ class RMSNormForward(BenchmarkKernel):
|
||||
|
||||
|
||||
class RMSNormBackward(BenchmarkKernel):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"):
|
||||
super().__init__(compile_mode)
|
||||
self.available_backends = ["eager", "compiled", "quack", "liger"]
|
||||
|
||||
def get_shapes(self) -> tuple[tuple[int, ...], ...]:
|
||||
@ -445,7 +447,9 @@ class RMSNormBackward(BenchmarkKernel):
|
||||
def compiled(self, args, kwargs=None) -> Any:
|
||||
assert kwargs is None
|
||||
x, w, dy = args
|
||||
y = torch.compile(self.rms_norm_ref, mode="max-autotune-no-cudagraphs")(x, w)
|
||||
y = torch.compile(self.rms_norm_ref, mode=self.compile_mode, fullgraph=True)(
|
||||
x, w
|
||||
)
|
||||
return lambda: torch.autograd.grad(
|
||||
y, [x, w], grad_outputs=dy, retain_graph=True
|
||||
)
|
||||
@ -485,8 +489,8 @@ class RMSNormBackward(BenchmarkKernel):
|
||||
|
||||
|
||||
class LayerNormForward(BenchmarkKernel):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"):
|
||||
super().__init__(compile_mode)
|
||||
self.available_backends = ["eager", "compiled", "quack", "liger"]
|
||||
|
||||
def get_shapes(self) -> tuple[tuple[int, ...], ...]:
|
||||
@ -526,7 +530,7 @@ class LayerNormForward(BenchmarkKernel):
|
||||
torch._dynamo.mark_dynamic(x, 0)
|
||||
|
||||
compiled_layernorm = torch.compile(
|
||||
self.layernorm_ref, mode="max-autotune-no-cudagraphs"
|
||||
self.layernorm_ref, mode=self.compile_mode, fullgraph=True
|
||||
)
|
||||
return lambda: compiled_layernorm(x, w, eps=1e-6)
|
||||
|
||||
@ -559,8 +563,8 @@ class LayerNormForward(BenchmarkKernel):
|
||||
|
||||
|
||||
class LayerNormBackward(BenchmarkKernel):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"):
|
||||
super().__init__(compile_mode)
|
||||
self.available_backends = ["eager", "compiled", "liger"]
|
||||
|
||||
def get_shapes(self) -> tuple[tuple[int, ...], ...]:
|
||||
@ -603,7 +607,7 @@ class LayerNormBackward(BenchmarkKernel):
|
||||
assert kwargs is None
|
||||
x, w, dy = args
|
||||
compiled_layernorm = torch.compile(
|
||||
self.layernorm_ref, mode="max-autotune-no-cudagraphs"
|
||||
self.layernorm_ref, mode=self.compile_mode, fullgraph=True
|
||||
)
|
||||
y = compiled_layernorm(x, w)
|
||||
return lambda: torch.autograd.grad(
|
||||
|
||||
@ -13,7 +13,8 @@ def benchmark_kernel_in_milliseconds(func: Callable, *args, **kwargs) -> float:
|
||||
# warmup
|
||||
for _ in range(5):
|
||||
func(*args, **kwargs)
|
||||
return benchmarker.benchmark_gpu(lambda: func(*args, **kwargs))
|
||||
with torch.compiler.set_stance("fail_on_recompile"):
|
||||
return benchmarker.benchmark_gpu(lambda: func(*args, **kwargs))
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -41,9 +42,10 @@ class Performance:
|
||||
|
||||
|
||||
class BenchmarkKernel:
|
||||
def __init__(self):
|
||||
def __init__(self, compile_mode: str = "max-autotune-no-cudagraphs"):
|
||||
self.name = self.__class__.__name__
|
||||
self.available_backends: list[str] = []
|
||||
self.compile_mode: str = compile_mode
|
||||
|
||||
# mapping from backend to list of performance results
|
||||
self.profiling_results: defaultdict[str, list[Performance]] = defaultdict(list)
|
||||
|
||||
@ -864,7 +864,6 @@ libtorch_python_core_sources = [
|
||||
"torch/csrc/QScheme.cpp",
|
||||
"torch/csrc/Module.cpp",
|
||||
"torch/csrc/PyInterpreter.cpp",
|
||||
"torch/csrc/PyInterpreterHooks.cpp",
|
||||
"torch/csrc/python_dimname.cpp",
|
||||
"torch/csrc/Size.cpp",
|
||||
"torch/csrc/Storage.cpp",
|
||||
|
||||
@ -1,10 +0,0 @@
|
||||
#include <c10/core/CachingDeviceAllocator.h>
|
||||
|
||||
namespace c10 {
|
||||
|
||||
// Ensures proper DLL export of this pure virtual base class on Windows,
|
||||
// since it's mainly used in other DLLs outside c10.dll.
|
||||
DeviceAllocator::DeviceAllocator() = default;
|
||||
DeviceAllocator::~DeviceAllocator() = default;
|
||||
|
||||
} // namespace c10
|
||||
@ -1,7 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/core/Stream.h>
|
||||
|
||||
namespace c10::CachingDeviceAllocator {
|
||||
|
||||
@ -60,55 +59,3 @@ struct DeviceStats {
|
||||
};
|
||||
|
||||
} // namespace c10::CachingDeviceAllocator
|
||||
|
||||
namespace c10 {
|
||||
|
||||
using CaptureId_t = unsigned long long;
|
||||
|
||||
// first is set if the instance is created by Graph mode capture_begin.
|
||||
// second is set if the instance is created by Graph mode graph_pool_handle.
|
||||
using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
|
||||
|
||||
struct C10_API DeviceAllocator : public c10::Allocator {
|
||||
DeviceAllocator();
|
||||
~DeviceAllocator() override;
|
||||
|
||||
// Returns true if the allocator has been properly initialized and is ready
|
||||
// for use
|
||||
virtual bool initialized() = 0;
|
||||
|
||||
// Releases all cached device memory from the specified memory pool back to
|
||||
// the system
|
||||
virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0;
|
||||
|
||||
// Associates a memory allocation with a stream to establish dependency
|
||||
// tracking. Prevents memory reuse until all operations on the specified
|
||||
// stream complete
|
||||
virtual void recordStream(const DataPtr& ptr, c10::Stream stream) = 0;
|
||||
|
||||
// Retrieves comprehensive memory statistics for the specified device,
|
||||
// including allocation patterns, usage metrics
|
||||
virtual CachingDeviceAllocator::DeviceStats getDeviceStats(
|
||||
c10::DeviceIndex device) = 0;
|
||||
|
||||
// Resets cumulative allocation statistics for the specified device to zero
|
||||
virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
|
||||
|
||||
// Resets peak memory usage statistics for the specified device
|
||||
virtual void resetPeakStats(c10::DeviceIndex device) = 0;
|
||||
};
|
||||
|
||||
// This function is used to get the DeviceAllocator for a specific device type
|
||||
// and keep backward compatibility with c10::GetAllocator.
|
||||
C10_API inline DeviceAllocator* getDeviceAllocator(const DeviceType& t) {
|
||||
TORCH_CHECK(
|
||||
t != DeviceType::CPU,
|
||||
"getDeviceAllocator is not supported for CPU device type.");
|
||||
auto* allocator = c10::GetAllocator(t);
|
||||
auto* device_allocator = dynamic_cast<DeviceAllocator*>(allocator);
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
device_allocator, "Allocator for ", t, " is not a DeviceAllocator.");
|
||||
return device_allocator;
|
||||
}
|
||||
|
||||
} // namespace c10
|
||||
|
||||
@ -240,4 +240,24 @@ struct C10_API PyInterpreter {
|
||||
void disarm() noexcept;
|
||||
};
|
||||
|
||||
// PyInterpreterStatus describes what the state of its interpreter tag
|
||||
// is, relative to the thread currently holding the GIL.
|
||||
enum class PyInterpreterStatus {
|
||||
// We just allocated the Tensor, it hasn't escaped to other threads,
|
||||
// we know that it definitely hasn't been tagged to be associated
|
||||
// with an interpreter.
|
||||
DEFINITELY_UNINITIALIZED,
|
||||
// We queried the interpreter field and it looked uninitialized. But
|
||||
// another thread may have raced with us to tag it with some other
|
||||
// interpreter id. So we will have to do a CEX to make sure we can
|
||||
// actually nab it.
|
||||
MAYBE_UNINITIALIZED,
|
||||
// We queried the interpreter field and it was tagged to belong to us.
|
||||
// This means we have sole write access (as we hold the GIL for this
|
||||
// interpreter)
|
||||
TAGGED_BY_US,
|
||||
// Someone else tagged this. We can't use this TensorImpl from Python.
|
||||
TAGGED_BY_OTHER,
|
||||
};
|
||||
|
||||
} // namespace c10::impl
|
||||
|
||||
@ -1,32 +0,0 @@
|
||||
#include <c10/core/impl/PyInterpreterHooks.h>
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
// Define the registry
|
||||
C10_DEFINE_REGISTRY(
|
||||
PyInterpreterHooksRegistry,
|
||||
PyInterpreterHooksInterface,
|
||||
PyInterpreterHooksArgs)
|
||||
|
||||
const PyInterpreterHooksInterface& getPyInterpreterHooks() {
|
||||
auto create_impl = [] {
|
||||
#if !defined C10_MOBILE
|
||||
auto hooks = PyInterpreterHooksRegistry()->Create(
|
||||
"PyInterpreterHooks", PyInterpreterHooksArgs{});
|
||||
if (hooks) {
|
||||
return hooks;
|
||||
}
|
||||
#endif
|
||||
// Return stub implementation that will throw errors when methods are called
|
||||
return std::make_unique<PyInterpreterHooksInterface>();
|
||||
};
|
||||
static auto hooks = create_impl();
|
||||
return *hooks;
|
||||
}
|
||||
|
||||
// Main function to get global PyInterpreter
|
||||
PyInterpreter* getGlobalPyInterpreter() {
|
||||
return getPyInterpreterHooks().getPyInterpreter();
|
||||
}
|
||||
|
||||
} // namespace c10::impl
|
||||
@ -1,39 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/impl/PyInterpreter.h>
|
||||
#include <c10/macros/Export.h>
|
||||
#include <c10/util/Registry.h>
|
||||
#include <memory>
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
// Minimal interface for PyInterpreter hooks
|
||||
struct C10_API PyInterpreterHooksInterface {
|
||||
virtual ~PyInterpreterHooksInterface() = default;
|
||||
|
||||
// Get the PyInterpreter instance
|
||||
// Stub implementation throws error when Python is not available
|
||||
virtual PyInterpreter* getPyInterpreter() const {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"PyTorch was compiled without Python support. "
|
||||
"Cannot access Python interpreter from C++.");
|
||||
}
|
||||
};
|
||||
|
||||
struct C10_API PyInterpreterHooksArgs{};
|
||||
|
||||
C10_DECLARE_REGISTRY(
|
||||
PyInterpreterHooksRegistry,
|
||||
PyInterpreterHooksInterface,
|
||||
PyInterpreterHooksArgs);
|
||||
|
||||
#define REGISTER_PYTHON_HOOKS(clsname) \
|
||||
C10_REGISTER_CLASS(PyInterpreterHooksRegistry, clsname, clsname)
|
||||
|
||||
// Get the global PyInterpreter hooks instance
|
||||
C10_API const PyInterpreterHooksInterface& getPyInterpreterHooks();
|
||||
|
||||
C10_API PyInterpreter* getGlobalPyInterpreter();
|
||||
|
||||
} // namespace c10::impl
|
||||
@ -34,12 +34,29 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
|
||||
reinterpret_cast<uintptr_t>(pyobj_) & ~0x1ULL);
|
||||
}
|
||||
|
||||
void PyObjectSlot::unchecked_clear_pyobj(PyInterpreter* interpreter) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(interpreter == pyobj_interpreter_.load());
|
||||
pyobj_ = nullptr;
|
||||
}
|
||||
|
||||
PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
|
||||
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter) {
|
||||
return *interpreter;
|
||||
}
|
||||
TORCH_CHECK(false, "cannot access PyObject for Tensor - no interpreter set");
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"cannot access PyObject for Tensor on interpreter ",
|
||||
(*pyobj_interpreter_.load())->name());
|
||||
}
|
||||
|
||||
bool PyObjectSlot::check_interpreter(PyInterpreter* interpreter) {
|
||||
return interpreter == pyobj_interpreter();
|
||||
}
|
||||
|
||||
bool PyObjectSlot::has_pyobj_nonhermetic() {
|
||||
return check_pyobj(pyobj_interpreter(), /*ignore_hermetic_tls=*/true)
|
||||
.has_value();
|
||||
}
|
||||
|
||||
bool PyObjectSlot::owns_pyobj() {
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
|
||||
#include <c10/core/impl/HermeticPyObjectTLS.h>
|
||||
#include <c10/core/impl/PyInterpreter.h>
|
||||
#include <c10/core/impl/PyInterpreterHooks.h>
|
||||
#include <c10/util/python_stub.h>
|
||||
#include <optional>
|
||||
|
||||
@ -25,9 +24,52 @@ struct C10_API PyObjectSlot {
|
||||
//
|
||||
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
|
||||
// PyObject if necessary!
|
||||
void init_pyobj(PyObject* pyobj) {
|
||||
pyobj_interpreter_.store(
|
||||
getGlobalPyInterpreter(), std::memory_order_relaxed);
|
||||
void init_pyobj(
|
||||
PyInterpreter* self_interpreter,
|
||||
PyObject* pyobj,
|
||||
PyInterpreterStatus status) {
|
||||
impl::PyInterpreter* expected = nullptr;
|
||||
switch (status) {
|
||||
case impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED:
|
||||
// caller guarantees there is no multithreaded access; if there is
|
||||
// no data race OK to do a relaxed store
|
||||
pyobj_interpreter_.store(self_interpreter, std::memory_order_relaxed);
|
||||
break;
|
||||
case impl::PyInterpreterStatus::TAGGED_BY_US:
|
||||
// no tagging is necessary, the tag is already correct
|
||||
break;
|
||||
case impl::PyInterpreterStatus::MAYBE_UNINITIALIZED:
|
||||
// attempt to claim this TensorImpl with the specified interpreter
|
||||
// tag
|
||||
if (pyobj_interpreter_.compare_exchange_strong(
|
||||
expected, self_interpreter, std::memory_order_acq_rel)) {
|
||||
break;
|
||||
}
|
||||
// test if, actually, it was already tagged by us! this situation can't
|
||||
// be caused by a race, but it could be caused by a situation
|
||||
// where someone conservatively tagged the tensor as MAYBE_UNINITIALIZED
|
||||
// (because they didn't pre-check the tag) when actually it was
|
||||
// owned by the interpreter
|
||||
if (expected == self_interpreter) {
|
||||
break;
|
||||
}
|
||||
// fallthrough, we lost the race. We are guaranteed not to lose the
|
||||
// race with ourself, as calls to init_pyobj with the same interpreter
|
||||
// ID must be sequentialized by the GIL
|
||||
[[fallthrough]];
|
||||
case impl::PyInterpreterStatus::TAGGED_BY_OTHER:
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"cannot allocate PyObject for Tensor on interpreter ",
|
||||
self_interpreter,
|
||||
" that has already been used by another torch deploy interpreter ",
|
||||
pyobj_interpreter_.load());
|
||||
}
|
||||
|
||||
// we are the ONLY thread that can have gotten to this point. It is not
|
||||
// possible to conflict with another zero interpreter as access is protected
|
||||
// by GIL
|
||||
// NB: owns_pyobj tag is initially false
|
||||
pyobj_ = pyobj;
|
||||
}
|
||||
|
||||
@ -52,25 +94,49 @@ struct C10_API PyObjectSlot {
|
||||
//
|
||||
// NB: this lives in header so that we can avoid actually creating the
|
||||
// std::optional
|
||||
|
||||
// @todo alban: I'm not too sure what's going on here, we can probably delete
|
||||
// it but it's worthwhile making sure
|
||||
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
|
||||
std::optional<PyObject*> check_pyobj(
|
||||
PyInterpreter* self_interpreter,
|
||||
bool ignore_hermetic_tls = false) const {
|
||||
// Note [Memory ordering on Python interpreter tag]
|
||||
impl::PyInterpreter* interpreter =
|
||||
pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter == nullptr) {
|
||||
// NB: This never returns DEFINITELY_UNINITIALIZED because there is
|
||||
// always the possibility that another thread races to initialize
|
||||
// after we query here. The only time when we can conclude a tensor
|
||||
// is definitely uninitialized is when we have just allocated it and
|
||||
// it cannot have escaped to other threads yet
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
|
||||
return std::nullopt;
|
||||
} else if (interpreter == self_interpreter) {
|
||||
// NB: pyobj_ could still be null!
|
||||
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
|
||||
return std::nullopt;
|
||||
} else {
|
||||
return _unchecked_untagged_pyobj();
|
||||
}
|
||||
} else {
|
||||
return _unchecked_untagged_pyobj();
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"cannot access PyObject for Tensor on interpreter ",
|
||||
(*self_interpreter)->name(),
|
||||
" that has already been used by another torch deploy interpreter ",
|
||||
(*pyobj_interpreter_.load())->name());
|
||||
}
|
||||
}
|
||||
|
||||
// Clear the PyObject field for an interpreter, in situations where we
|
||||
// statically know the tensor is tagged with our interpreter.
|
||||
void unchecked_clear_pyobj(PyInterpreter* interpreter);
|
||||
|
||||
PyInterpreter& load_pyobj_interpreter() const;
|
||||
|
||||
// Check if the PyObjectSlot's interpreter is the same as the specified
|
||||
// interpreter
|
||||
bool check_interpreter(PyInterpreter* interpreter);
|
||||
|
||||
// Check if the PyObjectSlot is holding a PyObject, owned or non-owned
|
||||
bool has_pyobj_nonhermetic();
|
||||
|
||||
bool owns_pyobj();
|
||||
|
||||
void set_owns_pyobj(bool b);
|
||||
|
||||
@ -4179,7 +4179,6 @@ struct BackendStaticInitializer {
|
||||
|
||||
BackendStaticInitializer() {
|
||||
auto r = parseEnvForBackend();
|
||||
at::SetAllocator(kCUDA, r, 0);
|
||||
allocator.store(r);
|
||||
}
|
||||
};
|
||||
|
||||
@ -202,24 +202,25 @@ struct ShareableHandle {
|
||||
std::string handle;
|
||||
};
|
||||
|
||||
class CUDAAllocator : public DeviceAllocator {
|
||||
class CUDAAllocator : public Allocator {
|
||||
public:
|
||||
virtual void* raw_alloc(size_t nbytes) = 0;
|
||||
virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
|
||||
virtual void raw_delete(void* ptr) = 0;
|
||||
virtual void init(int device_count) = 0;
|
||||
virtual bool initialized() = 0;
|
||||
virtual double getMemoryFraction(c10::DeviceIndex device) = 0;
|
||||
virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
|
||||
virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0;
|
||||
virtual void enable(bool value) = 0;
|
||||
virtual bool isEnabled() const = 0;
|
||||
virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
|
||||
virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
|
||||
// Keep for BC only
|
||||
virtual void recordStream(const DataPtr& ptr, CUDAStream stream) = 0;
|
||||
void recordStream(const DataPtr& ptr, c10::Stream stream) override {
|
||||
CUDAStream cuda_stream = CUDAStream(stream);
|
||||
recordStream(ptr, cuda_stream);
|
||||
}
|
||||
virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
|
||||
virtual c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
|
||||
c10::DeviceIndex device) = 0;
|
||||
virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
|
||||
virtual void resetPeakStats(c10::DeviceIndex device) = 0;
|
||||
virtual SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) = 0;
|
||||
virtual void beginAllocateToPool(
|
||||
c10::DeviceIndex device,
|
||||
@ -524,10 +525,6 @@ inline void enablePeerAccess(
|
||||
|
||||
namespace c10::cuda {
|
||||
|
||||
// Keep BC only
|
||||
using c10::CaptureId_t;
|
||||
using c10::MempoolId_t;
|
||||
|
||||
// MemPool represents a pool of memory in a caching allocator. Currently,
|
||||
// it's just the ID of the pool object maintained in the CUDACachingAllocator.
|
||||
//
|
||||
|
||||
@ -30,7 +30,7 @@ void c10_cuda_check_implementation(
|
||||
check_message.append("CUDA error: ");
|
||||
const char* error_string = cudaGetErrorString(cuda_error);
|
||||
check_message.append(error_string);
|
||||
check_message.append(c10::cuda::get_cuda_error_help(error_string));
|
||||
check_message.append(c10::cuda::get_cuda_error_help(cuda_error));
|
||||
check_message.append(c10::cuda::get_cuda_check_suffix());
|
||||
check_message.append("\n");
|
||||
if (include_device_assertions) {
|
||||
|
||||
@ -9,6 +9,12 @@
|
||||
|
||||
namespace c10::cuda {
|
||||
|
||||
using CaptureId_t = unsigned long long;
|
||||
|
||||
// first is set if the instance is created by CUDAGraph::capture_begin.
|
||||
// second is set if the instance is created by at::cuda::graph_pool_handle.
|
||||
using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
|
||||
|
||||
// RAII guard for "cudaStreamCaptureMode", a thread-local value
|
||||
// that controls the error-checking strictness of a capture.
|
||||
struct C10_CUDA_API CUDAStreamCaptureModeGuard {
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
#include <c10/cuda/CUDAMiscFunctions.h>
|
||||
#include <c10/util/env.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
|
||||
@ -7,11 +8,19 @@ namespace c10::cuda {
|
||||
|
||||
// Explain common CUDA errors
|
||||
// NOLINTNEXTLINE(bugprone-exception-escape,-warnings-as-errors)
|
||||
std::string get_cuda_error_help(const char* error_string) noexcept {
|
||||
std::string get_cuda_error_help(cudaError_t error) noexcept {
|
||||
std::string help_text;
|
||||
if (strstr(error_string, "invalid device ordinal")) {
|
||||
help_text.append(
|
||||
"\nGPU device may be out of range, do you have enough GPUs?");
|
||||
switch (error) {
|
||||
case cudaErrorInvalidDevice:
|
||||
help_text.append(
|
||||
"\nGPU device may be out of range, do you have enough GPUs?");
|
||||
break;
|
||||
default:
|
||||
help_text.append("\nSearch for `")
|
||||
.append(cudaGetErrorName(error))
|
||||
.append(
|
||||
"' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.");
|
||||
break;
|
||||
}
|
||||
return help_text;
|
||||
}
|
||||
|
||||
@ -3,12 +3,13 @@
|
||||
// CUDAExceptions.h
|
||||
|
||||
#include <c10/cuda/CUDAMacros.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
|
||||
namespace c10::cuda {
|
||||
C10_CUDA_API std::string get_cuda_error_help(const char*) noexcept;
|
||||
C10_CUDA_API std::string get_cuda_error_help(cudaError_t) noexcept;
|
||||
C10_CUDA_API const char* get_cuda_check_suffix() noexcept;
|
||||
C10_CUDA_API std::mutex* getFreeMutex();
|
||||
} // namespace c10::cuda
|
||||
|
||||
@ -540,7 +540,7 @@ class DeviceCachingAllocator {
|
||||
|
||||
static void local_raw_delete(void* ptr);
|
||||
|
||||
class XPUAllocator : public DeviceAllocator {
|
||||
class XPUAllocator : public Allocator {
|
||||
private:
|
||||
std::mutex mutex;
|
||||
ska::flat_hash_map<void*, Block*> allocated_blocks;
|
||||
@ -576,10 +576,6 @@ class XPUAllocator : public DeviceAllocator {
|
||||
}
|
||||
}
|
||||
|
||||
bool initialized() override {
|
||||
return !device_allocators.empty();
|
||||
}
|
||||
|
||||
void malloc(
|
||||
void** devPtr,
|
||||
DeviceIndex device,
|
||||
@ -614,13 +610,13 @@ class XPUAllocator : public DeviceAllocator {
|
||||
}
|
||||
}
|
||||
|
||||
void emptyCache(MempoolId_t mempool_id [[maybe_unused]] = {0, 0}) override {
|
||||
void emptyCache() {
|
||||
for (auto& da : device_allocators) {
|
||||
da->emptyCache();
|
||||
}
|
||||
}
|
||||
|
||||
void recordStream(const DataPtr& ptr, c10::Stream stream) override {
|
||||
void recordStream(const DataPtr& ptr, XPUStream stream) {
|
||||
if (!ptr.get()) {
|
||||
return;
|
||||
}
|
||||
@ -630,8 +626,7 @@ class XPUAllocator : public DeviceAllocator {
|
||||
|
||||
Block* block = get_allocated_block(ptr.get());
|
||||
TORCH_CHECK(block, "No allocated block can be found.");
|
||||
c10::xpu::XPUStream xpu_stream{stream};
|
||||
device_allocators[block->device]->recordStream(block, xpu_stream);
|
||||
device_allocators[block->device]->recordStream(block, stream);
|
||||
}
|
||||
|
||||
DataPtr allocate(size_t size) override {
|
||||
@ -684,17 +679,17 @@ class XPUAllocator : public DeviceAllocator {
|
||||
": did you call init?");
|
||||
}
|
||||
|
||||
DeviceStats getDeviceStats(DeviceIndex device) override {
|
||||
DeviceStats getDeviceStats(DeviceIndex device) {
|
||||
assertValidDevice(device);
|
||||
return device_allocators[device]->getStats();
|
||||
}
|
||||
|
||||
void resetPeakStats(DeviceIndex device) override {
|
||||
void resetPeakStats(DeviceIndex device) {
|
||||
assertValidDevice(device);
|
||||
device_allocators[device]->resetPeakStats();
|
||||
}
|
||||
|
||||
void resetAccumulatedStats(DeviceIndex device) override {
|
||||
void resetAccumulatedStats(DeviceIndex device) {
|
||||
assertValidDevice(device);
|
||||
device_allocators[device]->resetAccumulatedStats();
|
||||
}
|
||||
|
||||
@ -394,7 +394,7 @@ function(torch_compile_options libname)
|
||||
list(APPEND private_compile_options -Wredundant-move)
|
||||
endif()
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
list(APPEND private_compile_options -Wextra-semi -Wno-error=extra-semi -Wmove)
|
||||
list(APPEND private_compile_options -Wextra-semi -Wmove)
|
||||
else()
|
||||
list(APPEND private_compile_options
|
||||
# Considered to be flaky. See the discussion at
|
||||
|
||||
@ -25,26 +25,3 @@
|
||||
synchronize
|
||||
device_index
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: torch.accelerator.memory
|
||||
```
|
||||
```{eval-rst}
|
||||
.. currentmodule:: torch.accelerator.memory
|
||||
```
|
||||
|
||||
## Memory management
|
||||
```{eval-rst}
|
||||
.. autosummary::
|
||||
:toctree: generated
|
||||
:nosignatures:
|
||||
|
||||
empty_cache
|
||||
max_memory_allocated
|
||||
max_memory_reserved
|
||||
memory_allocated
|
||||
memory_reserved
|
||||
memory_stats
|
||||
reset_accumulated_memory_stats
|
||||
reset_peak_memory_stats
|
||||
```
|
||||
|
||||
@ -1086,6 +1086,7 @@ coverage_ignore_functions = [
|
||||
"z3op",
|
||||
"z3str",
|
||||
# torch.fx.graph_module
|
||||
"reduce_deploy_graph_module",
|
||||
"reduce_graph_module",
|
||||
"reduce_package_graph_module",
|
||||
# torch.fx.node
|
||||
|
||||
8
docs/source/deploy.md
Normal file
8
docs/source/deploy.md
Normal file
@ -0,0 +1,8 @@
|
||||
---
|
||||
orphan: true
|
||||
---
|
||||
|
||||
# torch::deploy has been moved to pytorch/multipy <!-- codespell:ignore -->
|
||||
|
||||
|
||||
``torch::deploy`` has been moved to its new home at [https://github.com/pytorch/multipy](https://github.com/pytorch/multipy). <!-- codespell:ignore -->
|
||||
@@ -20,39 +20,41 @@ for a brief introduction to all features related to distributed training.

## Backends

`torch.distributed` supports three built-in backends, each with
`torch.distributed` supports four built-in backends, each with
different capabilities. The table below shows which functions are available
for use with CPU / CUDA tensors.
for use with a CPU or GPU for each backend. For NCCL, GPU refers to CUDA GPU
while for XCCL to XPU GPU.

MPI supports CUDA only if the implementation used to build PyTorch supports it.

```{eval-rst}
+----------------+-----------+-----------+-----------+
| Backend        | ``gloo``  | ``mpi``   | ``nccl``  |
+----------------+-----+-----+-----+-----+-----+-----+
| Device         | CPU | GPU | CPU | GPU | CPU | GPU |
+================+=====+=====+=====+=====+=====+=====+
| send           | ✓   | ✘   | ✓   | ?   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+
| recv           | ✓   | ✘   | ✓   | ?   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+
| broadcast      | ✓   | ✓   | ✓   | ?   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+
| all_reduce     | ✓   | ✓   | ✓   | ?   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+
| reduce         | ✓   | ✓   | ✓   | ?   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+
| all_gather     | ✓   | ✓   | ✓   | ?   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+
| gather         | ✓   | ✓   | ✓   | ?   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+
| scatter        | ✓   | ✓   | ✓   | ?   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+
| reduce_scatter | ✓   | ✓   | ✘   | ✘   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+
| all_to_all     | ✓   | ✓   | ✓   | ?   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+
| barrier        | ✓   | ✘   | ✓   | ?   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+
+----------------+-----------+-----------+-----------+-----------+
| Backend        | ``gloo``  | ``mpi``   | ``nccl``  | ``xccl``  |
+----------------+-----+-----+-----+-----+-----+-----+-----+-----+
| Device         | CPU | GPU | CPU | GPU | CPU | GPU | CPU | GPU |
+================+=====+=====+=====+=====+=====+=====+=====+=====+
| send           | ✓   | ✘   | ✓   | ?   | ✘   | ✓   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+-----+-----+
| recv           | ✓   | ✘   | ✓   | ?   | ✘   | ✓   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+-----+-----+
| broadcast      | ✓   | ✓   | ✓   | ?   | ✘   | ✓   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+-----+-----+
| all_reduce     | ✓   | ✓   | ✓   | ?   | ✘   | ✓   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+-----+-----+
| reduce         | ✓   | ✓   | ✓   | ?   | ✘   | ✓   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+-----+-----+
| all_gather     | ✓   | ✓   | ✓   | ?   | ✘   | ✓   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+-----+-----+
| gather         | ✓   | ✓   | ✓   | ?   | ✘   | ✓   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+-----+-----+
| scatter        | ✓   | ✓   | ✓   | ?   | ✘   | ✓   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+-----+-----+
| reduce_scatter | ✓   | ✓   | ✘   | ✘   | ✘   | ✓   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+-----+-----+
| all_to_all     | ✓   | ✓   | ✓   | ?   | ✘   | ✓   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+-----+-----+
| barrier        | ✓   | ✘   | ✓   | ?   | ✘   | ✓   | ✘   | ✓   |
+----------------+-----+-----+-----+-----+-----+-----+-----+-----+
```

### Backends that come with PyTorch
@@ -81,8 +83,9 @@ In the past, we were often asked: "which backend should I use?".

- Rule of thumb

  - Use the NCCL backend for distributed **GPU** training
  - Use the Gloo backend for distributed **CPU** training.
  - Use the NCCL backend for distributed training with CUDA **GPU**.
  - Use the XCCL backend for distributed training with XPU **GPU**.
  - Use the Gloo backend for distributed training with **CPU**.

- GPU hosts with InfiniBand interconnect

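Following the rule of thumb above, backend selection typically looks like this sketch (illustrative only; assumes environment-variable initialization and that an XPU build exposes `torch.xpu`):

```python
import torch
import torch.distributed as dist

# Pick the backend from whichever accelerator is present.
if torch.cuda.is_available():
    backend = "nccl"   # CUDA GPUs
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    backend = "xccl"   # XPU GPUs (assumes an XPU-enabled build)
else:
    backend = "gloo"   # CPU-only training

# Assumes MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE are set in the environment.
dist.init_process_group(backend=backend)
```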
File diff suppressed because it is too large
@@ -2,12 +2,13 @@

[build-system]
requires = [
    # 70.1.0: min version for integrated bdist_wheel command from wheel package
    # 77.0.0: min version for SPDX expression support for project.license
    "setuptools>=77.0.0,<80.0",
    "setuptools>=70.1.0,<80.0",
    "cmake>=3.27",
    "ninja",
    "numpy",
    "packaging>=24.2",
    "packaging",
    "pyyaml",
    "requests",
    "six", # dependency chain: NNPACK -> PeachPy -> six
@@ -19,8 +20,12 @@ build-backend = "setuptools.build_meta"
name = "torch"
description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
readme = "README.md"
requires-python = ">=3.9,<3.14"
license = "BSD-3-Clause"
requires-python = ">=3.9"
# TODO: change to `license = "BSD-3-Clause"` and enable PEP 639 after pinning setuptools>=77
# FIXME: As of 2025.06.20, it is hard to ensure the minimum version of setuptools in our CI environment.
# TOML-table-based license deprecated in setuptools>=77, and the deprecation warning will be changed
# to an error on 2026.02.18. See also: https://github.com/pypa/setuptools/issues/4903
license = { text = "BSD-3-Clause" }
authors = [{ name = "PyTorch Team", email = "packages@pytorch.org" }]
keywords = ["pytorch", "machine learning"]
classifiers = [

@@ -1,9 +1,9 @@
# Build System requirements
setuptools>=77.0.0,<80.0 # setuptools develop deprecated on 80.0
setuptools>=70.1.0,<80.0 # setuptools develop deprecated on 80.0
cmake>=3.27
ninja
numpy
packaging>=24.2
packaging
pyyaml
requests
six # dependency chain: NNPACK -> PeachPy -> six

@@ -1,40 +1 @@
This directory contains the useful tools.


## build_android.sh
This script is to build PyTorch/Caffe2 library for Android. Take the following steps to start the build:

- set ANDROID_NDK to the location of ndk

```bash
export ANDROID_NDK=YOUR_NDK_PATH
```

- run build_android.sh
```bash
#in your PyTorch root directory
bash scripts/build_android.sh
```
If succeeded, the libraries and headers would be generated to build_android/install directory. You can then copy these files from build_android/install to your Android project for further usage.

You can also override the cmake flags via command line, e.g., following command will also compile the executable binary files:
```bash
bash scripts/build_android.sh -DBUILD_BINARY=ON
```

## build_ios.sh
This script is to build PyTorch/Caffe2 library for iOS, and can only be performed on macOS. Take the following steps to start the build:

- Install Xcode from App Store, and configure "Command Line Tools" properly on Xcode.
- Install the dependencies:

```bash
brew install cmake automake libtool
```

- run build_ios.sh
```bash
#in your PyTorch root directory
bash scripts/build_ios.sh
```
If succeeded, the libraries and headers would be generated to build_ios/install directory. You can then copy these files to your Xcode project for further usage.

@@ -1 +0,0 @@
cat apache_header.txt $1 > _add_apache_header.txt && mv _add_apache_header.txt $1
@@ -1,15 +0,0 @@
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
@@ -1,14 +0,0 @@
# Copyright (c) 2016-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
@@ -1,189 +0,0 @@
#!/bin/bash
##############################################################################
# Example command to build the android target.
##############################################################################
#
# This script shows how one can build a Caffe2 binary for the Android platform
# using android-cmake. A few notes:
#
# (1) This build also does a host build for protobuf. You will need autoconf
#     to carry out this. If autoconf is not possible, you will need to provide
#     a pre-built protoc binary that is the same version as the protobuf
#     version under third_party.
#     If you are building on Mac, you might need to install autotool and
#     libtool. The easiest way is via homebrew:
#         brew install automake
#         brew install libtool
# (2) You will need to have android ndk installed. The current script assumes
#     that you set ANDROID_NDK to the location of ndk.
# (3) The toolchain and the build target platform can be specified with the
#     cmake arguments below. For more details, check out android-cmake's doc.

set -e

# Android specific flags
if [ -z "$ANDROID_ABI" ]; then
  ANDROID_ABI="armeabi-v7a with NEON"
fi
ANDROID_NATIVE_API_LEVEL="21"
echo "Build with ANDROID_ABI[$ANDROID_ABI], ANDROID_NATIVE_API_LEVEL[$ANDROID_NATIVE_API_LEVEL]"

CAFFE2_ROOT="$( cd "$(dirname "$0")"/.. ; pwd -P)"
if [ -z "$ANDROID_NDK" ]; then
  echo "ANDROID_NDK not set; please set it to the Android NDK directory"
  exit 1
fi

if [ ! -d "$ANDROID_NDK" ]; then
  echo "ANDROID_NDK not a directory; did you install it under $ANDROID_NDK?"
  exit 1
fi

if [ -z "$PYTHON" ]; then
  PYTHON=python
  PYTHON_VERSION_MAJOR=$($PYTHON -c 'import sys; print(sys.version_info[0])')
  if [ "${PYTHON_VERSION_MAJOR}" -le 2 ]; then
    echo "Default python executable is Python-2, trying to use python3 alias"
    PYTHON=python3
  fi
fi

ANDROID_NDK_PROPERTIES="$ANDROID_NDK/source.properties"
[ -f "$ANDROID_NDK_PROPERTIES" ] && ANDROID_NDK_VERSION=$(sed -n 's/^Pkg.Revision[^=]*= *\([0-9]*\)\..*$/\1/p' "$ANDROID_NDK_PROPERTIES")

echo "Bash: $(/bin/bash --version | head -1)"
echo "Python: $($PYTHON -c 'import sys; print(sys.version)')"
echo "Caffe2 path: $CAFFE2_ROOT"
echo "Using Android NDK at $ANDROID_NDK"
echo "Android NDK version: $ANDROID_NDK_VERSION"

CMAKE_ARGS=()

# Build PyTorch mobile
CMAKE_ARGS+=("-DCMAKE_PREFIX_PATH=$($PYTHON -c 'import sysconfig; print(sysconfig.get_path("purelib"))')")
CMAKE_ARGS+=("-DPython_EXECUTABLE=$($PYTHON -c 'import sys; print(sys.executable)')")
CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=OFF")

# custom build with selected ops
if [ -n "${SELECTED_OP_LIST}" ]; then
  SELECTED_OP_LIST="$(cd $(dirname $SELECTED_OP_LIST); pwd -P)/$(basename $SELECTED_OP_LIST)"
  echo "Choose SELECTED_OP_LIST file: $SELECTED_OP_LIST"
  if [ ! -r ${SELECTED_OP_LIST} ]; then
    echo "Error: SELECTED_OP_LIST file ${SELECTED_OP_LIST} not found."
    exit 1
  fi
  CMAKE_ARGS+=("-DSELECTED_OP_LIST=${SELECTED_OP_LIST}")
fi

# If Ninja is installed, prefer it to Make
if [ -x "$(command -v ninja)" ]; then
  CMAKE_ARGS+=("-GNinja")
fi

# Use android-cmake to build Android project from CMake.
CMAKE_ARGS+=("-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake")

if [ -z "$BUILD_MOBILE_BENCHMARK" ]; then
  BUILD_MOBILE_BENCHMARK=0
fi

if [ -z "$BUILD_MOBILE_TEST" ]; then
  BUILD_MOBILE_TEST=0
fi
# Don't build artifacts we don't need
CMAKE_ARGS+=("-DBUILD_TEST=OFF")
CMAKE_ARGS+=("-DBUILD_BINARY=OFF")

# If there exists env variable and it equals to 0, build full jit interpreter.
# Default behavior is to build lite interpreter
# cmd: BUILD_LITE_INTERPRETER=0 ./scripts/build_android.sh
if [ "${BUILD_LITE_INTERPRETER}" == 0 ]; then
  CMAKE_ARGS+=("-DBUILD_LITE_INTERPRETER=OFF")
else
  CMAKE_ARGS+=("-DBUILD_LITE_INTERPRETER=ON")
fi
if [ "${TRACING_BASED}" == 1 ]; then
  CMAKE_ARGS+=("-DTRACING_BASED=ON")
else
  CMAKE_ARGS+=("-DTRACING_BASED=OFF")
fi
if [ "${USE_LIGHTWEIGHT_DISPATCH}" == 1 ]; then
  CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=ON")
  CMAKE_ARGS+=("-DSTATIC_DISPATCH_BACKEND=CPU")
else
  CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=OFF")
fi

CMAKE_ARGS+=("-DBUILD_MOBILE_BENCHMARK=$BUILD_MOBILE_BENCHMARK")
CMAKE_ARGS+=("-DBUILD_MOBILE_TEST=$BUILD_MOBILE_TEST")
CMAKE_ARGS+=("-DBUILD_PYTHON=OFF")
CMAKE_ARGS+=("-DBUILD_SHARED_LIBS=OFF")
if (( "${ANDROID_NDK_VERSION:-0}" < 18 )); then
  CMAKE_ARGS+=("-DANDROID_TOOLCHAIN=gcc")
else
  CMAKE_ARGS+=("-DANDROID_TOOLCHAIN=clang")
fi
# Disable unused dependencies
CMAKE_ARGS+=("-DUSE_CUDA=OFF")
CMAKE_ARGS+=("-DUSE_ITT=OFF")
CMAKE_ARGS+=("-DUSE_GFLAGS=OFF")
CMAKE_ARGS+=("-DUSE_OPENCV=OFF")
CMAKE_ARGS+=("-DUSE_MPI=OFF")
CMAKE_ARGS+=("-DUSE_OPENMP=OFF")
# Only toggle if VERBOSE=1
if [ "${VERBOSE:-}" == '1' ]; then
  CMAKE_ARGS+=("-DCMAKE_VERBOSE_MAKEFILE=1")
fi

# Android specific flags
CMAKE_ARGS+=("-DANDROID_NDK=$ANDROID_NDK")
CMAKE_ARGS+=("-DANDROID_ABI=$ANDROID_ABI")
CMAKE_ARGS+=("-DANDROID_NATIVE_API_LEVEL=$ANDROID_NATIVE_API_LEVEL")
CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=rtti exceptions")
if [ "${ANDROID_STL_SHARED:-}" == '1' ]; then
  CMAKE_ARGS+=("-DANDROID_STL=c++_shared")
fi
if [ "${ANDROID_DEBUG_SYMBOLS:-}" == '1' ]; then
  CMAKE_ARGS+=("-DANDROID_DEBUG_SYMBOLS=1")
fi

if [ -n "${USE_VULKAN}" ]; then
  CMAKE_ARGS+=("-DUSE_VULKAN=ON")
  if [ -n "${USE_VULKAN_FP16_INFERENCE}" ]; then
    CMAKE_ARGS+=("-DUSE_VULKAN_FP16_INFERENCE=ON")
  fi
  if [ -n "${USE_VULKAN_RELAXED_PRECISION}" ]; then
    CMAKE_ARGS+=("-DUSE_VULKAN_RELAXED_PRECISION=ON")
  fi
fi

# Use-specified CMake arguments go last to allow overriding defaults
CMAKE_ARGS+=($@)

# Patch pocketfft (as Android does not have aligned_alloc even if compiled with c++17
if [ -f third_party/pocketfft/pocketfft_hdronly.h ]; then
  sed -i -e "s/__cplusplus >= 201703L/0/" third_party/pocketfft/pocketfft_hdronly.h
fi

# Now, actually build the Android target.
BUILD_ROOT=${BUILD_ROOT:-"$CAFFE2_ROOT/build_android"}
INSTALL_PREFIX=${BUILD_ROOT}/install
mkdir -p $BUILD_ROOT
cd $BUILD_ROOT
cmake "$CAFFE2_ROOT" \
    -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX \
    -DCMAKE_BUILD_TYPE=Release \
    "${CMAKE_ARGS[@]}"

# Cross-platform parallel build
if [ -z "$MAX_JOBS" ]; then
  if [ "$(uname)" == 'Darwin' ]; then
    MAX_JOBS=$(sysctl -n hw.ncpu)
  else
    MAX_JOBS=$(nproc)
  fi
fi

echo "Will install headers and libs to $INSTALL_PREFIX for further Android project usage."
cmake --build . --target install -- "-j${MAX_JOBS}"
echo "Installation completed, now you can copy the headers/libs from $INSTALL_PREFIX to your Android project directory."
@@ -1,102 +0,0 @@
#!/usr/bin/env bash
set -eux -o pipefail

env
echo "BUILD_ENVIRONMENT:$BUILD_ENVIRONMENT"

export ANDROID_NDK_HOME=/opt/ndk
export ANDROID_NDK=/opt/ndk
export ANDROID_HOME=/opt/android/sdk

# Must be in sync with GRADLE_VERSION in docker image for android
# https://github.com/pietern/pytorch-dockerfiles/blob/master/build.sh#L155
export GRADLE_VERSION=6.8.3
export GRADLE_HOME=/opt/gradle/gradle-$GRADLE_VERSION
export GRADLE_PATH=$GRADLE_HOME/bin/gradle

# touch gradle cache files to prevent expiration
while IFS= read -r -d '' file
do
  touch "$file" || true
done < <(find /var/lib/jenkins/.gradle -type f -print0)

# Patch pocketfft (as Android does not have aligned_alloc even if compiled with c++17
if [ -f ~/workspace/third_party/pocketfft/pocketfft_hdronly.h ]; then
  sed -i -e "s/__cplusplus >= 201703L/0/" ~/workspace/third_party/pocketfft/pocketfft_hdronly.h
fi

export GRADLE_LOCAL_PROPERTIES=~/workspace/android/local.properties
rm -f $GRADLE_LOCAL_PROPERTIES
echo "sdk.dir=/opt/android/sdk" >> $GRADLE_LOCAL_PROPERTIES
echo "ndk.dir=/opt/ndk" >> $GRADLE_LOCAL_PROPERTIES
echo "cmake.dir=/usr/local" >> $GRADLE_LOCAL_PROPERTIES

retry () {
  $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}

# Run custom build script
if [[ "${BUILD_ENVIRONMENT}" == *-gradle-custom-build* ]]; then
  # Install torch & torchvision - used to download & dump used ops from test model.
  retry pip install torch torchvision --progress-bar off

  exec "$(dirname "${BASH_SOURCE[0]}")/../android/build_test_app_custom.sh" armeabi-v7a
fi

# Run default build
BUILD_ANDROID_INCLUDE_DIR_x86=~/workspace/build_android/install/include
BUILD_ANDROID_LIB_DIR_x86=~/workspace/build_android/install/lib

BUILD_ANDROID_INCLUDE_DIR_x86_64=~/workspace/build_android_install_x86_64/install/include
BUILD_ANDROID_LIB_DIR_x86_64=~/workspace/build_android_install_x86_64/install/lib

BUILD_ANDROID_INCLUDE_DIR_arm_v7a=~/workspace/build_android_install_arm_v7a/install/include
BUILD_ANDROID_LIB_DIR_arm_v7a=~/workspace/build_android_install_arm_v7a/install/lib

BUILD_ANDROID_INCLUDE_DIR_arm_v8a=~/workspace/build_android_install_arm_v8a/install/include
BUILD_ANDROID_LIB_DIR_arm_v8a=~/workspace/build_android_install_arm_v8a/install/lib

PYTORCH_ANDROID_SRC_MAIN_DIR=~/workspace/android/pytorch_android/src/main

JNI_INCLUDE_DIR=${PYTORCH_ANDROID_SRC_MAIN_DIR}/cpp/libtorch_include
mkdir -p $JNI_INCLUDE_DIR

JNI_LIBS_DIR=${PYTORCH_ANDROID_SRC_MAIN_DIR}/jniLibs
mkdir -p $JNI_LIBS_DIR

ln -s ${BUILD_ANDROID_INCLUDE_DIR_x86} ${JNI_INCLUDE_DIR}/x86
ln -s ${BUILD_ANDROID_LIB_DIR_x86} ${JNI_LIBS_DIR}/x86

if [[ "${BUILD_ENVIRONMENT}" != *-gradle-build-only-x86_32* ]]; then
  ln -s ${BUILD_ANDROID_INCLUDE_DIR_x86_64} ${JNI_INCLUDE_DIR}/x86_64
  ln -s ${BUILD_ANDROID_LIB_DIR_x86_64} ${JNI_LIBS_DIR}/x86_64

  ln -s ${BUILD_ANDROID_INCLUDE_DIR_arm_v7a} ${JNI_INCLUDE_DIR}/armeabi-v7a
  ln -s ${BUILD_ANDROID_LIB_DIR_arm_v7a} ${JNI_LIBS_DIR}/armeabi-v7a

  ln -s ${BUILD_ANDROID_INCLUDE_DIR_arm_v8a} ${JNI_INCLUDE_DIR}/arm64-v8a
  ln -s ${BUILD_ANDROID_LIB_DIR_arm_v8a} ${JNI_LIBS_DIR}/arm64-v8a
fi

GRADLE_PARAMS="-p android assembleRelease --debug --stacktrace"
if [[ "${BUILD_ENVIRONMENT}" == *-gradle-build-only-x86_32* ]]; then
  GRADLE_PARAMS+=" -PABI_FILTERS=x86"
fi

if [ -n "${GRADLE_OFFLINE:-}" ]; then
  GRADLE_PARAMS+=" --offline"
fi

$GRADLE_PATH $GRADLE_PARAMS

find . -type f -name "*.a" -exec ls -lh {} \;

while IFS= read -r -d '' file
do
  echo
  echo "$file"
  ls -lah "$file"
  zipinfo -l "$file"
done < <(find . -type f -name '*.aar' -print0)

find . -type f -name *aar -print | xargs tar cfvz ~/workspace/android/artifacts.tgz
@@ -1,59 +0,0 @@
#!/bin/bash
##############################################################################
# Build script to build the protoc compiler for the host platform.
##############################################################################
# This script builds the protoc compiler for the host platform, which is needed
# for any cross-compilation as we will need to convert the protobuf source
# files to cc files.
#
# --other-flags accepts flags that should be passed to cmake. Optional.
#
# After the execution of the file, one should be able to find the host protoc
# binary at build_host_protoc/bin/protoc.

set -e

CAFFE2_ROOT="$( cd "$(dirname -- "$0")"/.. ; pwd -P)"
BUILD_ROOT=${BUILD_ROOT:-"$CAFFE2_ROOT/build_host_protoc"}
mkdir -p $BUILD_ROOT/build
cd $BUILD_ROOT/build

CMAKE_ARGS=()
CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=$BUILD_ROOT")
CMAKE_ARGS+=("-Dprotobuf_BUILD_TESTS=OFF")

# If Ninja is installed, prefer it to Make
if [ -x "$(command -v ninja)" ]; then
  CMAKE_ARGS+=("-GNinja")
fi

while true; do
  case "$1" in
    --other-flags)
      shift;
      CMAKE_ARGS+=("$@")
      break ;;
    "")
      break ;;
    *)
      echo "Unknown option passed as argument: $1"
      break ;;
  esac
done

# Use ccache if available (this path is where Homebrew installs ccache symlinks)
if [ "$(uname)" == 'Darwin' ] && [ -d /usr/local/opt/ccache/libexec ]; then
  CMAKE_ARGS+=("-DCMAKE_C_COMPILER=/usr/local/opt/ccache/libexec/gcc")
  CMAKE_ARGS+=("-DCMAKE_CXX_COMPILER=/usr/local/opt/ccache/libexec/g++")
fi

cmake "$CAFFE2_ROOT/third_party/protobuf/cmake" ${CMAKE_ARGS[@]}

if [ -z "$MAX_JOBS" ]; then
  if [ "$(uname)" == 'Darwin' ]; then
    MAX_JOBS=$(sysctl -n hw.ncpu)
  else
    MAX_JOBS=$(nproc)
  fi
fi
cmake --build . -- "-j${MAX_JOBS}" install
Some files were not shown because too many files have changed in this diff.