Compare commits: `test-vec-m...lintbuildd` (1 commit, e44ca7305f)
@ -36,105 +36,3 @@ See `build.sh` for valid build environments (it's the giant switch).
# Set flags (see build.sh) and build image
sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest'
```

## [Guidance] Adding a New Base Docker Image

### Background

The base Docker images in directory `.ci/docker/` are built by the `docker-builds.yml` workflow. Those images are used throughout the PyTorch CI/CD pipeline. You should only create or modify a base Docker image if you need specific environment changes or dependencies before building PyTorch on CI.

1. **Automatic Rebuilding**:
   - The Docker image building process is triggered automatically when changes are made to files in the `.ci/docker/*` directory
   - This ensures all images stay up-to-date with the latest dependencies and configurations

2. **Image Reuse in PyTorch Build Workflows** (example: linux-build):
   - The images generated by `docker-builds.yml` are reused in `_linux-build.yml` through the `calculate-docker-image` step
   - The `_linux-build.yml` workflow:
     - Pulls the Docker image determined by the `calculate-docker-image` step
     - Runs a Docker container with that image
     - Executes `.ci/pytorch/build.sh` inside the container to build PyTorch

3. **Usage in Test Workflows** (example: linux-test):
   - The same Docker images are also used in `_linux-test.yml` for running tests
   - The `_linux-test.yml` workflow follows a similar pattern:
     - It uses the `calculate-docker-image` step to determine which Docker image to use
     - It pulls the Docker image and runs a container with that image
     - It installs the wheels from the artifacts generated by PyTorch build jobs
     - It executes test scripts (like `.ci/pytorch/test.sh` or `.ci/pytorch/multigpu-test.sh`) inside the container

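Both workflows follow the same pull-then-run pattern described above. A minimal sketch of that pattern, with placeholder registry, tag, and mount values (the real image reference is produced by the `calculate-docker-image` step):

```bash
# Sketch only: illustrates the pull/run/execute pattern of _linux-build.yml
# and _linux-test.yml with placeholder values.
DOCKER_IMAGE="<registry>/pytorch-linux-jammy-py3.9-gcc11:<tag>"   # from calculate-docker-image

docker pull "${DOCKER_IMAGE}"

# Build job: run the PyTorch build script inside the container.
docker run --rm \
  -v "$(pwd):/workspace" -w /workspace \
  "${DOCKER_IMAGE}" bash .ci/pytorch/build.sh

# Test job: same pattern, but it first installs the wheels produced by the
# build job and then runs a test script such as .ci/pytorch/test.sh.
```
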
### Understanding File Purposes

#### `.ci/docker/build.sh` vs `.ci/pytorch/build.sh`

- **`.ci/docker/build.sh`**:
  - Used for building base Docker images
  - Executed by the `docker-builds.yml` workflow to pre-build Docker images for CI
  - Contains configurations for different Docker build environments

- **`.ci/pytorch/build.sh`**:
  - Used for building PyTorch inside a Docker container
  - Called by workflows like `_linux-build.yml` after the Docker container is started
  - Builds PyTorch wheels and other artifacts

#### `.ci/docker/ci_commit_pins/` vs `.github/ci_commit_pins`

- **`.ci/docker/ci_commit_pins/`**:
  - Used for pinning dependency versions during base Docker image building
  - Ensures consistent environments for building PyTorch
  - Changes here trigger base Docker image rebuilds

- **`.github/ci_commit_pins`**:
  - Used for pinning dependency versions during PyTorch building and tests
  - Ensures consistent dependencies for PyTorch across different builds
  - Used by build scripts running inside Docker containers

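Both pin directories hold plain text files whose only content is a commit SHA. A minimal sketch of how such a pin is consumed, modeled on the `install_torchbench`/`install_timm` helpers later in this diff (`examplelib` and its URL are hypothetical; `get_pinned_commit` is the helper those scripts use):

```bash
# .ci/docker/ci_commit_pins/examplelib.txt contains a single commit SHA,
# e.g. 0123456789abcdef0123456789abcdef01234567 (placeholder).
commit=$(get_pinned_commit examplelib)           # reads the pin file
git clone https://github.com/example/examplelib  # hypothetical repository
pushd examplelib
git checkout "$commit"                           # build exactly the pinned commit
# ... install / build steps ...
popd
```
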
### Step-by-Step Guide for Adding a New Base Docker Image

#### 1. Add Pinned Commits (If Applicable)

We use pinned commits for build stability. The `nightly.yml` workflow checks and updates the pinned commits for certain repository dependencies daily.

If your new Docker image needs a library installed from a specific pinned commit or built from source:

1. Add the repository you want to track in `nightly.yml` and `merge-rules.yml`
2. Add the initial pinned commit in `.ci/docker/ci_commit_pins/`. The name of the text file should match the one defined in step 1

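A minimal sketch of step 2, assuming a hypothetical dependency called `examplelib` (the file name must match the entry added to `nightly.yml` in step 1):

```bash
# Seed the pin with the commit you want CI to build against (placeholder SHA).
echo "0123456789abcdef0123456789abcdef01234567" > .ci/docker/ci_commit_pins/examplelib.txt
```
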
#### 2. Configure the Base Docker Image

1. **Add new Base Docker image configuration** (if applicable):

   Add the configuration in `.ci/docker/build.sh`. For example:
   ```bash
   pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1)
     CUDA_VERSION=12.8.1
     CUDNN_VERSION=9
     ANACONDA_PYTHON_VERSION=3.12
     GCC_VERSION=11
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     NEW_ARG_1=yes
     ;;
   ```

2. **Add build arguments to Docker build command**:

   If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:
   ```bash
   docker build \
     ....
     --build-arg "NEW_ARG_1=${NEW_ARG_1}"
   ```

3. **Update Dockerfile logic**:

   Update the Dockerfile to use the new argument. For example, in `ubuntu/Dockerfile`:
   ```dockerfile
   ARG NEW_ARG_1
   # Set up environment for NEW_ARG_1
   RUN if [ -n "${NEW_ARG_1}" ]; then bash ./do_something.sh; fi
   ```

4. **Add the Docker configuration** in `.github/workflows/docker-builds.yml`:

   The `docker-builds.yml` workflow pre-builds the Docker images whenever changes occur in the `.ci/docker/` directory. This includes the pinned commit updates.

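Once the configuration, build arguments, and Dockerfile changes are in place, the new image can be exercised locally with the same entry point shown at the top of this document before relying on the CI rebuild (the tag `myimage:latest` is just a local name):

```bash
# Build the new configuration locally via .ci/docker/build.sh.
sudo bash -c './build.sh pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1 -t myimage:latest'
```
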
@ -78,331 +78,45 @@ elif [[ "$image" == *linter* ]]; then
|
||||
DOCKERFILE="linter/Dockerfile"
|
||||
fi
|
||||
|
||||
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
|
||||
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
|
||||
if [[ "$image" == *rocm* ]]; then
|
||||
_UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
|
||||
_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
|
||||
fi
|
||||
PY_HARDCODED_CONFIG_SCRIPT=$(python3 get_config.py --image "$image")
|
||||
|
||||
tag=$(echo $image | awk -F':' '{print $2}')
|
||||
|
||||
# It's annoying to rename jobs every time you want to rewrite a
|
||||
# configuration, so we hardcode everything here rather than do it
|
||||
# from scratch
|
||||
case "$tag" in
|
||||
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11)
|
||||
CUDA_VERSION=12.4
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11)
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.13
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
|
||||
CUDA_VERSION=12.6.3
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.6
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.6
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.6
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.13
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3-clang12-onnx)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
CLANG_VERSION=12
|
||||
VISION=yes
|
||||
ONNX=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.9-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
CLANG_VERSION=12
|
||||
VISION=yes
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.11-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.11
|
||||
CLANG_VERSION=12
|
||||
VISION=yes
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.9-gcc9)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
|
||||
if [[ $tag =~ "jammy" ]]; then
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
else
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
fi
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
ROCM_VERSION=6.4
|
||||
if [[ $? -eq 0 ]]; then
|
||||
eval "$PY_HARDCODED_CONFIG_SCRIPT"
|
||||
else
|
||||
echo "[Fallback] Python script failed or no match — fallback to hardcoded shell case"
|
||||
# Catch-all for builds that are not hardcoded.
|
||||
VISION=yes
|
||||
echo "image '$image' did not match an existing build configuration"
|
||||
if [[ "$image" == *py* ]]; then
|
||||
extract_version_from_image_name py ANACONDA_PYTHON_VERSION
|
||||
fi
|
||||
if [[ "$image" == *cuda* ]]; then
|
||||
extract_version_from_image_name cuda CUDA_VERSION
|
||||
extract_version_from_image_name cudnn CUDNN_VERSION
|
||||
fi
|
||||
if [[ "$image" == *rocm* ]]; then
|
||||
extract_version_from_image_name rocm ROCM_VERSION
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-noble-rocm-alpha-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
ROCM_VERSION=7.0
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
|
||||
;;
|
||||
pytorch-linux-jammy-xpu-2025.0-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
XPU_VERSION=2025.0
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-xpu-2025.1-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
XPU_VERSION=2025.1
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
TRITON=yes
|
||||
DOCS=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
CLANG_VERSION=12
|
||||
VISION=yes
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3-clang18-asan)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
CLANG_VERSION=18
|
||||
VISION=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.9-gcc11)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
TRITON=yes
|
||||
DOCS=yes
|
||||
UNINSTALL_DILL=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3-clang12-executorch)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
CLANG_VERSION=12
|
||||
EXECUTORCH=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.12-halide)
|
||||
CUDA_VERSION=12.6
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=11
|
||||
HALIDE=yes
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.12-triton-cpu)
|
||||
CUDA_VERSION=12.6
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=11
|
||||
TRITON_CPU=yes
|
||||
;;
|
||||
pytorch-linux-jammy-linter)
|
||||
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
|
||||
# We will need to update mypy version eventually, but that's for another day. The task
|
||||
# would be to upgrade mypy to 1.0.0 with Python 3.11
|
||||
PYTHON_VERSION=3.9
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
|
||||
PYTHON_VERSION=3.9
|
||||
CUDA_VERSION=12.8.1
|
||||
;;
|
||||
pytorch-linux-jammy-aarch64-py3.10-gcc11)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
ACL=yes
|
||||
VISION=yes
|
||||
CONDA_CMAKE=yes
|
||||
OPENBLAS=yes
|
||||
# snadampal: skipping llvm src build install because the current version
|
||||
# from pytorch/llvm:9.0.1 is x86 specific
|
||||
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
||||
;;
|
||||
pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
ACL=yes
|
||||
VISION=yes
|
||||
CONDA_CMAKE=yes
|
||||
OPENBLAS=yes
|
||||
# snadampal: skipping llvm src build install because the current version
|
||||
# from pytorch/llvm:9.0.1 is x86 specific
|
||||
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
*)
|
||||
# Catch-all for builds that are not hardcoded.
|
||||
VISION=yes
|
||||
echo "image '$image' did not match an existing build configuration"
|
||||
if [[ "$image" == *py* ]]; then
|
||||
extract_version_from_image_name py ANACONDA_PYTHON_VERSION
|
||||
# To ensure that any ROCm config will build using conda cmake
|
||||
# and thus have LAPACK/MKL enabled
|
||||
fi
|
||||
if [[ "$image" == *cuda* ]]; then
|
||||
extract_version_from_image_name cuda CUDA_VERSION
|
||||
extract_version_from_image_name cudnn CUDNN_VERSION
|
||||
fi
|
||||
if [[ "$image" == *rocm* ]]; then
|
||||
extract_version_from_image_name rocm ROCM_VERSION
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
# To ensure that any ROCm config will build using conda cmake
|
||||
# and thus have LAPACK/MKL enabled
|
||||
fi
|
||||
if [[ "$image" == *centos7* ]]; then
|
||||
NINJA_VERSION=1.10.2
|
||||
fi
|
||||
if [[ "$image" == *gcc* ]]; then
|
||||
extract_version_from_image_name gcc GCC_VERSION
|
||||
fi
|
||||
if [[ "$image" == *clang* ]]; then
|
||||
extract_version_from_image_name clang CLANG_VERSION
|
||||
fi
|
||||
if [[ "$image" == *devtoolset* ]]; then
|
||||
extract_version_from_image_name devtoolset DEVTOOLSET_VERSION
|
||||
fi
|
||||
if [[ "$image" == *glibc* ]]; then
|
||||
extract_version_from_image_name glibc GLIBC_VERSION
|
||||
fi
|
||||
;;
|
||||
if [[ "$image" == *centos7* ]]; then
|
||||
NINJA_VERSION=1.10.2
|
||||
fi
|
||||
if [[ "$image" == *gcc* ]]; then
|
||||
extract_version_from_image_name gcc GCC_VERSION
|
||||
fi
|
||||
if [[ "$image" == *clang* ]]; then
|
||||
extract_version_from_image_name clang CLANG_VERSION
|
||||
fi
|
||||
if [[ "$image" == *devtoolset* ]]; then
|
||||
extract_version_from_image_name devtoolset DEVTOOLSET_VERSION
|
||||
fi
|
||||
if [[ "$image" == *glibc* ]]; then
|
||||
extract_version_from_image_name glibc GLIBC_VERSION
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
|
||||
|
||||
@ -15,11 +15,35 @@ function install_timm() {
  commit=$(get_pinned_commit timm)

  pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
  # Clean up
  conda_run pip uninstall -y torch torchvision triton
}

function install_torchbench() {
  local commit
  commit=$(get_pinned_commit torchbench)
  git clone https://github.com/pytorch/benchmark torchbench
  pushd torchbench
  git checkout "$commit"

  python install.py --continue_on_fail

  # TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
  # is regressing speedup metric. This needs to be investigated further
  pip install transformers==4.38.1

  echo "Print all dependencies after TorchBench is installed"
  python -mpip freeze
  popd
}

# Pango is needed for weasyprint which is needed for doctr
conda_install pango

# Stable packages are ok here, just to satisfy TorchBench check
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

install_torchbench
install_huggingface
install_timm

# Clean up
conda_run pip uninstall -y torch torchvision torchaudio triton

@ -30,7 +30,7 @@ EOF
|
||||
|
||||
# we want the patch version of 6.4 instead
|
||||
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
|
||||
ROCM_VERSION="${ROCM_VERSION}.2"
|
||||
ROCM_VERSION="${ROCM_VERSION}.1"
|
||||
fi
|
||||
|
||||
# Default url values
|
||||
@ -85,19 +85,16 @@ EOF
|
||||
# CI no longer builds for ROCm 6.3, but
|
||||
# ROCm 6.4 did not yet fix the regression; also, HIP branch names are different
|
||||
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.4) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then
|
||||
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.2) ]]; then
|
||||
HIP_TAG=rocm-6.4.2
|
||||
CLR_HASH=74d78ba3ac4bac235d02bcb48511c30b5cfdd457 # branch release/rocm-rel-6.4.2-statco-hotfix
|
||||
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
|
||||
HIP_TAG=rocm-6.4.1
|
||||
CLR_HASH=efe6c35790b9206923bfeed1209902feff37f386 # branch release/rocm-rel-6.4.1-statco-hotfix
|
||||
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
|
||||
HIP_BRANCH=release/rocm-rel-6.4
|
||||
CLR_HASH=606bc820b4b1f315d135da02a1f0b176ca50a92c # branch release/rocm-rel-6.4.1-statco-hotfix
|
||||
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
|
||||
HIP_TAG=rocm-6.4.0
|
||||
HIP_BRANCH=release/rocm-rel-6.4
|
||||
CLR_HASH=600f5b0d2baed94d5121e2174a9de0851b040b0c # branch release/rocm-rel-6.4-statco-hotfix
|
||||
fi
|
||||
# clr build needs CppHeaderParser but can only find it using conda's python
|
||||
python -m pip install CppHeaderParser
|
||||
git clone https://github.com/ROCm/HIP -b $HIP_TAG
|
||||
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
|
||||
HIP_COMMON_DIR=$(readlink -f HIP)
|
||||
git clone https://github.com/jeffdaily/clr
|
||||
pushd clr
|
||||
|
||||
.ci/docker/get_config.py (new file, 350 lines)
@ -0,0 +1,350 @@
|
||||
import argparse
|
||||
import sys
|
||||
from enum import Enum
|
||||
import shlex
|
||||
|
||||
class HardwareType(Enum):
|
||||
DEFAULT = "default"
|
||||
ROCM = "rocm"
|
||||
|
||||
@staticmethod
|
||||
def from_image_name(image_name: str) -> "HardwareType":
|
||||
if "rocm" in image_name:
|
||||
return HardwareType.ROCM
|
||||
return HardwareType.DEFAULT
|
||||
|
||||
class HardcodedBaseConfig:
|
||||
_UCX_UCC_CONFIGS: dict[HardwareType, dict[str, str]] = {
|
||||
HardwareType.DEFAULT: {
|
||||
"UCX_COMMIT": "7bb2722ff2187a0cad557ae4a6afa090569f83fb",
|
||||
"UCC_COMMIT": "20eae37090a4ce1b32bcce6144ccad0b49943e0b",
|
||||
},
|
||||
HardwareType.ROCM: {
|
||||
"UCX_COMMIT": "cc312eaa4655c0cc5c2bcd796db938f90563bcf6",
|
||||
"UCC_COMMIT": "0c0fc21559835044ab107199e334f7157d6a0d3d",
|
||||
},
|
||||
}
|
||||
|
||||
def __init__(self, hardwareType: HardwareType) -> None:
|
||||
commits = self.get_ucx_ucc_commits(hardwareType)
|
||||
self.ucx_commit = commits["UCX_COMMIT"]
|
||||
self.ucc_commit = commits["UCC_COMMIT"]
|
||||
|
||||
def _get_tag(self, image: str):
|
||||
if ":" not in image:
|
||||
print(f"echo 'Invalid image format (missing :): {image}'", file=sys.stderr)
|
||||
return
|
||||
tag = image.split(":")[1]
|
||||
return tag
|
||||
|
||||
def get_all_configs(self):
|
||||
_TAG_CONFIGS = {
|
||||
"pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11": {
|
||||
"CUDA_VERSION": "12.4",
|
||||
"CUDNN_VERSION": "9",
|
||||
"ANACONDA_PYTHON_VERSION": "3.10",
|
||||
"GCC_VERSION": "11",
|
||||
"VISION": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"TRITON": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11": {
|
||||
"CUDA_VERSION": "12.8.1",
|
||||
"CUDNN_VERSION": "9",
|
||||
"ANACONDA_PYTHON_VERSION": "3.10",
|
||||
"GCC_VERSION": "11",
|
||||
"VISION": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"TRITON": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks": {
|
||||
"CUDA_VERSION": "12.8.1",
|
||||
"CUDNN_VERSION": "9",
|
||||
"ANACONDA_PYTHON_VERSION": "3.10",
|
||||
"GCC_VERSION": "9",
|
||||
"VISION": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"TRITON": "yes",
|
||||
"INDUCTOR_BENCHMARKS": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks": {
|
||||
"CUDA_VERSION": "12.8.1",
|
||||
"CUDNN_VERSION": "9",
|
||||
"ANACONDA_PYTHON_VERSION": "3.12",
|
||||
"GCC_VERSION": "9",
|
||||
"VISION": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"TRITON": "yes",
|
||||
"INDUCTOR_BENCHMARKS": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks": {
|
||||
"CUDA_VERSION": "12.8.1",
|
||||
"CUDNN_VERSION": "9",
|
||||
"ANACONDA_PYTHON_VERSION": "3.13",
|
||||
"GCC_VERSION": "9",
|
||||
"VISION": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"TRITON": "yes",
|
||||
"INDUCTOR_BENCHMARKS": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9": {
|
||||
"CUDA_VERSION": "12.6.3",
|
||||
"CUDNN_VERSION": "9",
|
||||
"ANACONDA_PYTHON_VERSION": "3.10",
|
||||
"GCC_VERSION": "9",
|
||||
"VISION": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"TRITON": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks": {
|
||||
"CUDA_VERSION": "12.6",
|
||||
"CUDNN_VERSION": "9",
|
||||
"ANACONDA_PYTHON_VERSION": "3.10",
|
||||
"GCC_VERSION": "9",
|
||||
"VISION": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"TRITON": "yes",
|
||||
"INDUCTOR_BENCHMARKS": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks": {
|
||||
"CUDA_VERSION": "12.6",
|
||||
"CUDNN_VERSION": "9",
|
||||
"ANACONDA_PYTHON_VERSION": "3.12",
|
||||
"GCC_VERSION": "9",
|
||||
"VISION": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"TRITON": "yes",
|
||||
"INDUCTOR_BENCHMARKS": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks": {
|
||||
"CUDA_VERSION": "12.6",
|
||||
"CUDNN_VERSION": "9",
|
||||
"ANACONDA_PYTHON_VERSION": "3.13",
|
||||
"GCC_VERSION": "9",
|
||||
"VISION": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"TRITON": "yes",
|
||||
"INDUCTOR_BENCHMARKS": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9": {
|
||||
"CUDA_VERSION": "12.8.1",
|
||||
"CUDNN_VERSION": "9",
|
||||
"ANACONDA_PYTHON_VERSION": "3.10",
|
||||
"GCC_VERSION": "9",
|
||||
"VISION": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"TRITON": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-py3-clang12-onnx": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.9",
|
||||
"CLANG_VERSION": "12",
|
||||
"VISION": "yes",
|
||||
"ONNX": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-py3.9-clang12": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.9",
|
||||
"CLANG_VERSION": "12",
|
||||
"VISION": "yes",
|
||||
"TRITON": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-py3.11-clang12": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.11",
|
||||
"CLANG_VERSION": "12",
|
||||
"VISION": "yes",
|
||||
"TRITON": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-py3.9-gcc9": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.9",
|
||||
"GCC_VERSION": "9",
|
||||
"VISION": "yes",
|
||||
"TRITON": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-rocm-n-py3": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.10",
|
||||
"GCC_VERSION": "11",
|
||||
"VISION": "yes",
|
||||
"ROCM_VERSION": "6.4",
|
||||
"NINJA_VERSION": "1.9.0",
|
||||
"TRITON": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"INDUCTOR_BENCHMARKS": "yes",
|
||||
},
|
||||
"pytorch-linux-noble-rocm-n-py3": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.12",
|
||||
"GCC_VERSION": "11",
|
||||
"VISION": "yes",
|
||||
"ROCM_VERSION": "6.4",
|
||||
"NINJA_VERSION": "1.9.0",
|
||||
"TRITON": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"INDUCTOR_BENCHMARKS": "yes",
|
||||
},
|
||||
"pytorch-linux-noble-rocm-alpha-py3": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.12",
|
||||
"GCC_VERSION": "11",
|
||||
"VISION": "yes",
|
||||
"ROCM_VERSION": "7.0",
|
||||
"NINJA_VERSION": "1.9.0",
|
||||
"TRITON": "yes",
|
||||
"KATEX": "yes",
|
||||
"UCX_COMMIT": self.ucx_commit,
|
||||
"UCC_COMMIT": self.ucc_commit,
|
||||
"INDUCTOR_BENCHMARKS": "yes",
|
||||
"PYTORCH_ROCM_ARCH": "gfx90a;gfx942;gfx950",
|
||||
},
|
||||
"pytorch-linux-jammy-xpu-2025.0-py3": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.9",
|
||||
"GCC_VERSION": "11",
|
||||
"VISION": "yes",
|
||||
"XPU_VERSION": "2025.0",
|
||||
"NINJA_VERSION": "1.9.0",
|
||||
"TRITON": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-xpu-2025.1-py3": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.9",
|
||||
"GCC_VERSION": "11",
|
||||
"VISION": "yes",
|
||||
"XPU_VERSION": "2025.1",
|
||||
"NINJA_VERSION": "1.9.0",
|
||||
"TRITON": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.9",
|
||||
"GCC_VERSION": "11",
|
||||
"VISION": "yes",
|
||||
"KATEX": "yes",
|
||||
"TRITON": "yes",
|
||||
"DOCS": "yes",
|
||||
"INDUCTOR_BENCHMARKS": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.9",
|
||||
"CUDA_VERSION": "12.8.1",
|
||||
"CUDNN_VERSION": "9",
|
||||
"CLANG_VERSION": "12",
|
||||
"VISION": "yes",
|
||||
"TRITON": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-py3-clang18-asan": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.10",
|
||||
"CLANG_VERSION": "18",
|
||||
"VISION": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-py3.9-gcc11": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.9",
|
||||
"GCC_VERSION": "11",
|
||||
"VISION": "yes",
|
||||
"KATEX": "yes",
|
||||
"TRITON": "yes",
|
||||
"DOCS": "yes",
|
||||
"UNINSTALL_DILL": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-py3-clang12-executorch": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.10",
|
||||
"CLANG_VERSION": "12",
|
||||
"EXECUTORCH": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-py3.12-halide": {
|
||||
"CUDA_VERSION": "12.6",
|
||||
"ANACONDA_PYTHON_VERSION": "3.12",
|
||||
"GCC_VERSION": "11",
|
||||
"HALIDE": "yes",
|
||||
"TRITON": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-py3.12-triton-cpu": {
|
||||
"CUDA_VERSION": "12.6",
|
||||
"ANACONDA_PYTHON_VERSION": "3.12",
|
||||
"GCC_VERSION": "11",
|
||||
"TRITON_CPU": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-linter": {
|
||||
"PYTHON_VERSION": "3.9",
|
||||
},
|
||||
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter": {
|
||||
"PYTHON_VERSION": "3.9",
|
||||
"CUDA_VERSION": "12.8.1",
|
||||
},
|
||||
"pytorch-linux-jammy-aarch64-py3.10-gcc11": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.10",
|
||||
"GCC_VERSION": "11",
|
||||
"ACL": "yes",
|
||||
"VISION": "yes",
|
||||
"CONDA_CMAKE": "yes",
|
||||
"OPENBLAS": "yes",
|
||||
"SKIP_LLVM_SRC_BUILD_INSTALL": "yes",
|
||||
},
|
||||
"pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks": {
|
||||
"ANACONDA_PYTHON_VERSION": "3.10",
|
||||
"GCC_VERSION": "11",
|
||||
"ACL": "yes",
|
||||
"VISION": "yes",
|
||||
"CONDA_CMAKE": "yes",
|
||||
"OPENBLAS": "yes",
|
||||
"SKIP_LLVM_SRC_BUILD_INSTALL": "yes",
|
||||
"INDUCTOR_BENCHMARKS": "yes",
|
||||
},
|
||||
}
|
||||
return _TAG_CONFIGS
|
||||
def get_config(self, image_name: str) -> dict:
|
||||
tag = self._get_tag(image_name)
|
||||
|
||||
config_dict = self.get_all_configs()
|
||||
if tag not in config_dict:
|
||||
raise ValueError(f"Unknown tag: {tag}")
|
||||
return config_dict[tag]
|
||||
|
||||
def get_ucx_ucc_commits(self, hw_type: HardwareType) -> dict[str, str]:
|
||||
if hw_type not in self._UCX_UCC_CONFIGS:
|
||||
raise ValueError(f"Unsupported hardware type: {hw_type}")
|
||||
return self._UCX_UCC_CONFIGS[hw_type]
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Return for a given image tag."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image", required=True, help="Full image string (e.g., repo/name:tag)"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
|
||||
image_name = args.image
|
||||
hw_type = HardwareType.from_image_name(image_name)
|
||||
|
||||
config_runner = HardcodedBaseConfig(hw_type)
|
||||
config = config_runner.get_config(args.image)
|
||||
for key, val in config.items():
|
||||
print(f'export {key}={shlex.quote(val)}')
|
||||
except Exception as e:
|
||||
# Any error will signal fallback
|
||||
print(f"# Fallback due to error: {e}", file=sys.stderr)
|
||||
sys.exit(42)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
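A sketch of how this script is consumed by `.ci/docker/build.sh`, based on the `PY_HARDCODED_CONFIG_SCRIPT` snippet earlier in this diff (a non-zero exit, such as the script's exit code 42, signals that the hardcoded shell `case` fallback should be used):

```bash
# In .ci/docker/build.sh (sketch; mirrors the hunk earlier in this diff).
PY_HARDCODED_CONFIG_SCRIPT=$(python3 get_config.py --image "$image")
if [[ $? -eq 0 ]]; then
  # get_config.py printed "export KEY=VALUE" lines for a known tag.
  eval "$PY_HARDCODED_CONFIG_SCRIPT"
else
  # Unknown tag or error: fall back to the hardcoded shell case.
  echo "Falling back to the hardcoded shell configuration for image '$image'"
fi
```
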
@ -41,7 +41,7 @@ case ${DOCKER_TAG_PREFIX} in
|
||||
rocm*)
|
||||
# we want the patch version of 6.4 instead
|
||||
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
|
||||
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
|
||||
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
|
||||
fi
|
||||
BASE_TARGET=rocm
|
||||
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
|
||||
|
||||
@ -77,7 +77,7 @@ case ${image} in
|
||||
manylinux2_28-builder:rocm*)
|
||||
# we want the patch version of 6.4 instead
|
||||
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
|
||||
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
|
||||
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
|
||||
fi
|
||||
TARGET=rocm_final
|
||||
MANY_LINUX_VERSION="2_28"
|
||||
|
||||
@ -50,7 +50,7 @@ flatbuffers==24.12.23
|
||||
hypothesis==5.35.1
|
||||
# Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
|
||||
#Description: advanced library for generating parametrized tests
|
||||
#Pinned versions: 5.35.1
|
||||
#Pinned versions: 3.44.6, 4.53.2
|
||||
#test that import: test_xnnpack_integration.py, test_pruning_op.py, test_nn.py
|
||||
|
||||
junitparser==2.1.1
|
||||
@ -221,9 +221,9 @@ pygments==2.15.0
|
||||
#Pinned versions: 2.12.0
|
||||
#test that import: the doctests
|
||||
|
||||
#pyyaml
|
||||
#PyYAML
|
||||
#Description: data serialization format
|
||||
#Pinned versions: 6.0.2
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
|
||||
#requests
|
||||
@ -233,7 +233,7 @@ pygments==2.15.0
|
||||
|
||||
#rich
|
||||
#Description: rich text and beautiful formatting in the terminal
|
||||
#Pinned versions: 14.1.0
|
||||
#Pinned versions: 10.9.0
|
||||
#test that import:
|
||||
|
||||
scikit-image==0.19.3 ; python_version < "3.10"
|
||||
@ -307,7 +307,7 @@ pytest-cpp==2.3.0
|
||||
#Pinned versions: 2.3.0
|
||||
#test that import:
|
||||
|
||||
z3-solver==4.15.1.0
|
||||
z3-solver==4.12.6.0
|
||||
#Description: The Z3 Theorem Prover Project
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
@ -361,7 +361,6 @@ pwlf==2.2.1
|
||||
#Pinned versions: 2.2.1
|
||||
#test that import: test_sac_estimator.py
|
||||
|
||||
|
||||
# To build PyTorch itself
|
||||
pyyaml
|
||||
pyzstd
|
||||
|
||||
@ -98,8 +98,9 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
|
||||
COPY ./common/common_utils.sh common_utils.sh
|
||||
COPY ci_commit_pins/huggingface.txt huggingface.txt
|
||||
COPY ci_commit_pins/timm.txt timm.txt
|
||||
COPY ci_commit_pins/torchbench.txt torchbench.txt
|
||||
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
|
||||
|
||||
# (optional) Install non-default Ninja version
|
||||
ARG NINJA_VERSION
|
||||
|
||||
@ -98,8 +98,9 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
|
||||
COPY ./common/common_utils.sh common_utils.sh
|
||||
COPY ci_commit_pins/huggingface.txt huggingface.txt
|
||||
COPY ci_commit_pins/timm.txt timm.txt
|
||||
COPY ci_commit_pins/torchbench.txt torchbench.txt
|
||||
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
|
||||
|
||||
ARG TRITON
|
||||
ARG TRITON_CPU
|
||||
|
||||
.ci/pytorch/build-mobile.sh (new executable file, 34 lines)
@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env bash
|
||||
# DO NOT ADD 'set -x', so as not to reveal CircleCI secret context environment variables
|
||||
set -eu -o pipefail
|
||||
|
||||
# This script uses linux host toolchain + mobile build options in order to
|
||||
# build & test mobile libtorch without having to setup Android/iOS
|
||||
# toolchain/simulator.
|
||||
|
||||
# shellcheck source=./common.sh
|
||||
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
|
||||
# shellcheck source=./common-build.sh
|
||||
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
|
||||
|
||||
# Install torch & torchvision - used to download & trace test model.
|
||||
# Ideally we should use the libtorch built on the PR so that backward
|
||||
# incompatible changes won't break this script - but it will significantly slow
|
||||
# down mobile CI jobs.
|
||||
# Here we install nightly instead of stable so that we have an option to
|
||||
# temporarily skip mobile CI jobs on BC-breaking PRs until they are in nightly.
|
||||
retry pip install --pre torch torchvision \
|
||||
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html \
|
||||
--progress-bar off
|
||||
|
||||
# Run end-to-end process of building mobile library, linking into the predictor
|
||||
# binary, and running forward pass with a real model.
|
||||
if [[ "$BUILD_ENVIRONMENT" == *-mobile-custom-build-static* ]]; then
|
||||
TEST_CUSTOM_BUILD_STATIC=1 test/mobile/custom_build/build.sh
|
||||
elif [[ "$BUILD_ENVIRONMENT" == *-mobile-lightweight-dispatch* ]]; then
|
||||
test/mobile/lightweight_dispatch/build.sh
|
||||
else
|
||||
TEST_DEFAULT_BUILD=1 test/mobile/custom_build/build.sh
|
||||
fi
|
||||
|
||||
print_sccache_stats
|
||||
@ -11,6 +11,10 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
|
||||
# shellcheck source=./common-build.sh
|
||||
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *-mobile-*build* ]]; then
|
||||
exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile.sh" "$@"
|
||||
fi
|
||||
|
||||
echo "Python version:"
|
||||
python --version
|
||||
|
||||
@ -50,6 +54,9 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
|
||||
export ATEN_THREADING=NATIVE
|
||||
fi
|
||||
|
||||
# Enable LLVM dependency for TensorExpr testing
|
||||
export USE_LLVM=/opt/llvm
|
||||
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
|
||||
|
||||
if ! which conda; then
|
||||
# In ROCm CIs, we are doing cross compilation on build machines with
|
||||
@ -117,8 +124,26 @@ if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then
|
||||
fi
|
||||
|
||||
# Use special scripts for Android builds
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
|
||||
export ANDROID_NDK=/opt/ndk
|
||||
build_args=()
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *-arm-v7a* ]]; then
|
||||
build_args+=("-DANDROID_ABI=armeabi-v7a")
|
||||
elif [[ "${BUILD_ENVIRONMENT}" == *-arm-v8a* ]]; then
|
||||
build_args+=("-DANDROID_ABI=arm64-v8a")
|
||||
elif [[ "${BUILD_ENVIRONMENT}" == *-x86_32* ]]; then
|
||||
build_args+=("-DANDROID_ABI=x86")
|
||||
elif [[ "${BUILD_ENVIRONMENT}" == *-x86_64* ]]; then
|
||||
build_args+=("-DANDROID_ABI=x86_64")
|
||||
fi
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
|
||||
build_args+=("-DUSE_VULKAN=ON")
|
||||
fi
|
||||
build_args+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")
|
||||
exec ./scripts/build_android.sh "${build_args[@]}" "$@"
|
||||
fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" != *android* && "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
|
||||
export USE_VULKAN=1
|
||||
# shellcheck disable=SC1091
|
||||
source /var/lib/jenkins/vulkansdk/setup-env.sh
|
||||
@ -189,6 +214,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
|
||||
export USE_ASAN=1
|
||||
export REL_WITH_DEB_INFO=1
|
||||
export UBSAN_FLAGS="-fno-sanitize-recover=all"
|
||||
unset USE_LLVM
|
||||
fi
|
||||
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then
|
||||
@ -199,7 +225,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then
|
||||
export USE_PRECOMPILED_HEADERS=1
|
||||
fi
|
||||
|
||||
if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
|
||||
if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
|
||||
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
|
||||
fi
|
||||
|
||||
|
||||
@ -204,32 +204,8 @@ function install_torchrec_and_fbgemm() {
|
||||
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
|
||||
pip_uninstall fbgemm-gpu-nightly
|
||||
|
||||
# If ROCM_HOME isn't set, use ROCM_PATH if set, otherwise /opt/rocm
|
||||
ROCM_HOME="${ROCM_HOME:-${ROCM_PATH:-/opt/rocm}}"
|
||||
|
||||
# Find rocm_version.h header file for ROCm version extract
|
||||
rocm_version_h="${ROCM_HOME}/include/rocm-core/rocm_version.h"
|
||||
if [ ! -f "$rocm_version_h" ]; then
|
||||
rocm_version_h="${ROCM_HOME}/include/rocm_version.h"
|
||||
fi
|
||||
|
||||
# Error out if rocm_version.h not found
|
||||
if [ ! -f "$rocm_version_h" ]; then
|
||||
echo "Error: rocm_version.h not found in expected locations." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Extract major, minor and patch ROCm version numbers
|
||||
MAJOR_VERSION=$(grep 'ROCM_VERSION_MAJOR' "$rocm_version_h" | awk '{print $3}')
|
||||
MINOR_VERSION=$(grep 'ROCM_VERSION_MINOR' "$rocm_version_h" | awk '{print $3}')
|
||||
PATCH_VERSION=$(grep 'ROCM_VERSION_PATCH' "$rocm_version_h" | awk '{print $3}')
|
||||
ROCM_INT=$((MAJOR_VERSION * 10000 + MINOR_VERSION * 100 + PATCH_VERSION))
|
||||
echo "ROCm version: $ROCM_INT"
|
||||
export BUILD_ROCM_VERSION="$MAJOR_VERSION.$MINOR_VERSION"
|
||||
|
||||
pip_install tabulate # needed for newer fbgemm
|
||||
pip_install patchelf # needed for rocm fbgemm
|
||||
pushd /tmp
|
||||
|
||||
local wheel_dir=dist/fbgemm_gpu
|
||||
local found_whl=0
|
||||
@ -247,7 +223,7 @@ function install_torchrec_and_fbgemm() {
|
||||
pushd fbgemm/fbgemm_gpu
|
||||
git checkout "${fbgemm_commit}"
|
||||
python setup.py bdist_wheel \
|
||||
--build-variant=rocm \
|
||||
--package_variant=rocm \
|
||||
-DHIP_ROOT_DIR="${ROCM_PATH}" \
|
||||
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
|
||||
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
|
||||
@ -264,7 +240,6 @@ function install_torchrec_and_fbgemm() {
|
||||
done
|
||||
|
||||
rm -rf fbgemm
|
||||
popd
|
||||
else
|
||||
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
|
||||
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
|
||||
@ -283,30 +258,6 @@ function clone_pytorch_xla() {
|
||||
fi
|
||||
}
|
||||
|
||||
function checkout_install_torchbench() {
|
||||
local commit
|
||||
commit=$(get_pinned_commit torchbench)
|
||||
git clone https://github.com/pytorch/benchmark torchbench
|
||||
pushd torchbench
|
||||
git checkout "$commit"
|
||||
|
||||
if [ "$1" ]; then
|
||||
python install.py --continue_on_fail models "$@"
|
||||
else
|
||||
# Occasionally the installation may fail on one model but it is ok to continue
|
||||
# to install and test other models
|
||||
python install.py --continue_on_fail
|
||||
fi
|
||||
|
||||
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
|
||||
# is regressing speedup metric. This needs to be investigated further
|
||||
pip install transformers==4.38.1
|
||||
|
||||
echo "Print all dependencies after TorchBench is installed"
|
||||
python -mpip freeze
|
||||
popd
|
||||
}
|
||||
|
||||
function install_torchao() {
|
||||
local commit
|
||||
commit=$(get_pinned_commit torchao)
|
||||
|
||||
.ci/pytorch/create_test_cert.py (new file, 123 lines)
@ -0,0 +1,123 @@
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from tempfile import mkdtemp
|
||||
|
||||
from cryptography import x509
|
||||
from cryptography.hazmat.primitives import hashes, serialization
|
||||
from cryptography.hazmat.primitives.asymmetric import rsa
|
||||
from cryptography.x509.oid import NameOID
|
||||
|
||||
|
||||
temp_dir = mkdtemp()
|
||||
print(temp_dir)
|
||||
|
||||
|
||||
def genrsa(path):
|
||||
key = rsa.generate_private_key(
|
||||
public_exponent=65537,
|
||||
key_size=2048,
|
||||
)
|
||||
with open(path, "wb") as f:
|
||||
f.write(
|
||||
key.private_bytes(
|
||||
encoding=serialization.Encoding.PEM,
|
||||
format=serialization.PrivateFormat.TraditionalOpenSSL,
|
||||
encryption_algorithm=serialization.NoEncryption(),
|
||||
)
|
||||
)
|
||||
return key
|
||||
|
||||
|
||||
def create_cert(path, C, ST, L, O, key):
|
||||
subject = issuer = x509.Name(
|
||||
[
|
||||
x509.NameAttribute(NameOID.COUNTRY_NAME, C),
|
||||
x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST),
|
||||
x509.NameAttribute(NameOID.LOCALITY_NAME, L),
|
||||
x509.NameAttribute(NameOID.ORGANIZATION_NAME, O),
|
||||
]
|
||||
)
|
||||
cert = (
|
||||
x509.CertificateBuilder()
|
||||
.subject_name(subject)
|
||||
.issuer_name(issuer)
|
||||
.public_key(key.public_key())
|
||||
.serial_number(x509.random_serial_number())
|
||||
.not_valid_before(datetime.now(timezone.utc))
|
||||
.not_valid_after(
|
||||
# Our certificate will be valid for 10 days
|
||||
datetime.now(timezone.utc) + timedelta(days=10)
|
||||
)
|
||||
.add_extension(
|
||||
x509.BasicConstraints(ca=True, path_length=None),
|
||||
critical=True,
|
||||
)
|
||||
.sign(key, hashes.SHA256())
|
||||
)
|
||||
# Write our certificate out to disk.
|
||||
with open(path, "wb") as f:
|
||||
f.write(cert.public_bytes(serialization.Encoding.PEM))
|
||||
return cert
|
||||
|
||||
|
||||
def create_req(path, C, ST, L, O, key):
|
||||
csr = (
|
||||
x509.CertificateSigningRequestBuilder()
|
||||
.subject_name(
|
||||
x509.Name(
|
||||
[
|
||||
# Provide various details about who we are.
|
||||
x509.NameAttribute(NameOID.COUNTRY_NAME, C),
|
||||
x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST),
|
||||
x509.NameAttribute(NameOID.LOCALITY_NAME, L),
|
||||
x509.NameAttribute(NameOID.ORGANIZATION_NAME, O),
|
||||
]
|
||||
)
|
||||
)
|
||||
.sign(key, hashes.SHA256())
|
||||
)
|
||||
with open(path, "wb") as f:
|
||||
f.write(csr.public_bytes(serialization.Encoding.PEM))
|
||||
return csr
|
||||
|
||||
|
||||
def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key):
|
||||
cert = (
|
||||
x509.CertificateBuilder()
|
||||
.subject_name(csr_cert.subject)
|
||||
.issuer_name(ca_cert.subject)
|
||||
.public_key(csr_cert.public_key())
|
||||
.serial_number(x509.random_serial_number())
|
||||
.not_valid_before(datetime.now(timezone.utc))
|
||||
.not_valid_after(
|
||||
# Our certificate will be valid for 10 days
|
||||
datetime.now(timezone.utc) + timedelta(days=10)
|
||||
# Sign our certificate with our private key
|
||||
)
|
||||
.sign(private_ca_key, hashes.SHA256())
|
||||
)
|
||||
with open(path, "wb") as f:
|
||||
f.write(cert.public_bytes(serialization.Encoding.PEM))
|
||||
return cert
|
||||
|
||||
|
||||
ca_key = genrsa(temp_dir + "/ca.key")
|
||||
ca_cert = create_cert(
|
||||
temp_dir + "/ca.pem",
|
||||
"US",
|
||||
"New York",
|
||||
"New York",
|
||||
"Gloo Certificate Authority",
|
||||
ca_key,
|
||||
)
|
||||
|
||||
pkey = genrsa(temp_dir + "/pkey.key")
|
||||
csr = create_req(
|
||||
temp_dir + "/csr.csr",
|
||||
"US",
|
||||
"California",
|
||||
"San Francisco",
|
||||
"Gloo Testing Company",
|
||||
pkey,
|
||||
)
|
||||
|
||||
cert = sign_certificate_request(temp_dir + "/cert.pem", csr, ca_cert, ca_key)
|
||||
.ci/pytorch/run_glootls_test.sh (new executable file, 18 lines)
@ -0,0 +1,18 @@
|
||||
#!/bin/bash
|
||||
|
||||
CREATE_TEST_CERT="$(dirname "${BASH_SOURCE[0]}")/create_test_cert.py"
|
||||
TMP_CERT_DIR=$(python "$CREATE_TEST_CERT")
|
||||
|
||||
openssl verify -CAfile "${TMP_CERT_DIR}/ca.pem" "${TMP_CERT_DIR}/cert.pem"
|
||||
|
||||
export GLOO_DEVICE_TRANSPORT=TCP_TLS
|
||||
export GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY=${TMP_CERT_DIR}/pkey.key
|
||||
export GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT=${TMP_CERT_DIR}/cert.pem
|
||||
export GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE=${TMP_CERT_DIR}/ca.pem
|
||||
|
||||
time python test/run_test.py --include distributed/test_c10d_gloo --verbose -- ProcessGroupGlooTest
|
||||
|
||||
unset GLOO_DEVICE_TRANSPORT
|
||||
unset GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY
|
||||
unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT
|
||||
unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE
|
||||
@ -385,29 +385,6 @@ def smoke_test_compile(device: str = "cpu") -> None:
|
||||
x_pt2 = torch.compile(model, mode="max-autotune")(x)
|
||||
|
||||
|
||||
def smoke_test_nvshmem() -> None:
|
||||
if not torch.cuda.is_available():
|
||||
print("CUDA is not available, skipping NVSHMEM test")
|
||||
return
|
||||
|
||||
# Check if NVSHMEM is compiled in current build
|
||||
try:
|
||||
from torch._C._distributed_c10d import _is_nvshmem_available
|
||||
except ImportError:
|
||||
# Not built with NVSHMEM support.
|
||||
# torch is not compiled with NVSHMEM prior to 2.9
|
||||
if torch.__version__ < "2.9":
|
||||
return
|
||||
else:
|
||||
# After 2.9: NVSHMEM is expected to be compiled in current build
|
||||
raise RuntimeError("torch not compiled with NVSHMEM") from None
|
||||
|
||||
print("torch compiled with NVSHMEM")
|
||||
|
||||
# Check if NVSHMEM is available on current system.
|
||||
print(f"NVSHMEM available at run time: {_is_nvshmem_available()}")
|
||||
|
||||
|
||||
def smoke_test_modules():
|
||||
cwd = os.getcwd()
|
||||
for module in MODULES:
|
||||
@ -502,8 +479,6 @@ def main() -> None:
|
||||
options.pypi_pkg_check,
|
||||
)
|
||||
|
||||
smoke_test_nvshmem()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -345,12 +345,6 @@ test_h100_symm_mem() {
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
test_h100_cutlass_backend() {
|
||||
# cutlass backend tests for H100
|
||||
TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_backend -k "not addmm" $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_evt $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
}
|
||||
|
||||
test_lazy_tensor_meta_reference_disabled() {
|
||||
export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
|
||||
echo "Testing lazy tensor operations without meta reference"
|
||||
@ -365,6 +359,7 @@ test_dynamo_wrapped_shard() {
|
||||
exit 1
|
||||
fi
|
||||
python tools/dynamo/verify_dynamo.py
|
||||
python tools/dynamo/gb_id_mapping.py verify
|
||||
# PLEASE DO NOT ADD ADDITIONAL EXCLUDES HERE.
|
||||
# Instead, use @skipIfTorchDynamo on your tests.
|
||||
time python test/run_test.py --dynamo \
|
||||
@ -462,7 +457,7 @@ test_inductor_aoti() {
|
||||
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
|
||||
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
|
||||
|
||||
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
|
||||
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
|
||||
}
|
||||
|
||||
test_inductor_cpp_wrapper_shard() {
|
||||
@ -928,6 +923,12 @@ test_torchbench_gcp_smoketest(){
|
||||
popd
|
||||
}
|
||||
|
||||
test_python_gloo_with_tls() {
|
||||
source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh"
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
|
||||
test_aten() {
|
||||
# Test ATen
|
||||
# The following test(s) of ATen have already been skipped by caffe2 in rocm environment:
|
||||
@ -974,8 +975,6 @@ test_without_numpy() {
|
||||
if [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
|
||||
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')"
|
||||
fi
|
||||
# Regression test for https://github.com/pytorch/pytorch/pull/157734 (torch.onnx should be importable without numpy)
|
||||
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch; import torch.onnx"
|
||||
popd
|
||||
}
|
||||
|
||||
@ -1039,10 +1038,20 @@ test_libtorch_api() {
|
||||
mkdir -p $TEST_REPORTS_DIR
|
||||
|
||||
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
|
||||
"$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
|
||||
else
|
||||
# Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
|
||||
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
|
||||
|
||||
# On s390x, pytorch is built without llvm.
|
||||
# Even if it would be built with llvm, llvm currently doesn't support used features on s390x and
|
||||
# test fails with errors like:
|
||||
# JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
|
||||
# unknown file: Failure
|
||||
# C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
|
||||
if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
|
||||
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
|
||||
fi
|
||||
fi
|
||||
|
||||
# quantization is not fully supported on s390x yet
|
||||
@ -1310,13 +1319,10 @@ EOF
|
||||
|
||||
# Step 2. Make sure that the public API test "test_correct_module_names" fails when an existing
|
||||
# file is modified to introduce an invalid public API function.
|
||||
# The filepath here must not have __all__ defined in it, otherwise the test will pass.
|
||||
# If your PR introduces __all__ to torch/cuda/streams.py please point this to another file
|
||||
# that does not have __all__ defined.
|
||||
EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/cuda/streams.py"
|
||||
EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/nn/parameter.py"
|
||||
cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig"
|
||||
echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}"
|
||||
invalid_api="torch.cuda.streams.new_public_func"
|
||||
invalid_api="torch.nn.parameter.new_public_func"
|
||||
echo "Appended an invalid public API function to existing file ${EXISTING_FILEPATH}..."
|
||||
|
||||
check_public_api_test_fails \
|
||||
@ -1550,7 +1556,7 @@ test_executorch() {
|
||||
test_linux_aarch64() {
|
||||
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
|
||||
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
|
||||
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
|
||||
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops test_cpp_extensions_open_device_registration \
|
||||
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
|
||||
|
||||
# Dynamo tests
|
||||
@ -1662,13 +1668,11 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
|
||||
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
|
||||
install_torchaudio
|
||||
install_torchvision
|
||||
checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
|
||||
PYTHONPATH=$(pwd)/torchbench test_cachebench
|
||||
PYTHONPATH=/torchbench test_cachebench
|
||||
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
|
||||
install_torchaudio
|
||||
install_torchvision
|
||||
checkout_install_torchbench nanogpt
|
||||
PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
|
||||
PYTHONPATH=/torchbench test_verify_cachebench
|
||||
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
||||
install_torchaudio
|
||||
install_torchvision
|
||||
@ -1677,28 +1681,22 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
||||
# https://github.com/opencv/opencv-python/issues/885
|
||||
pip_install opencv-python==4.8.0.74
|
||||
if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
|
||||
checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
|
||||
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
|
||||
PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf
|
||||
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
|
||||
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \
|
||||
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
|
||||
functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0
|
||||
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
|
||||
PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf
|
||||
elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
|
||||
checkout_install_torchbench
|
||||
TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
|
||||
TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest
|
||||
else
|
||||
checkout_install_torchbench
|
||||
# Do this after checkout_install_torchbench to ensure we clobber any
|
||||
# nightlies that torchbench may pull in
|
||||
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
|
||||
install_torchrec_and_fbgemm
|
||||
fi
|
||||
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
|
||||
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
|
||||
fi
|
||||
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
|
||||
install_torchvision
|
||||
PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
|
||||
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
|
||||
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
|
||||
test_inductor_aoti
|
||||
fi
|
||||
@@ -1763,8 +1761,6 @@ elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
  test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
  test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
  test_h100_cutlass_backend
else
  install_torchvision
  install_monkeytype
@@ -1,34 +0,0 @@
# If you want to rebuild, run this with $env:REBUILD=1
# If you want to build with CUDA, run this with $env:USE_CUDA=1
# If you want to build without CUDA, run this with $env:USE_CUDA=0

# Check for setup.py in the current directory
if (-not (Test-Path "setup.py")) {
    Write-Host "ERROR: Please run this build script from PyTorch root directory."
    exit 1
}

# Get the script's parent directory
$ScriptParentDir = Split-Path -Parent $MyInvocation.MyCommand.Definition

# Set TMP_DIR and convert to Windows path
$env:TMP_DIR = Join-Path (Get-Location) "build\win_tmp"
$env:TMP_DIR_WIN = $env:TMP_DIR  # Already in Windows format, no cygpath needed

# Set final package directory with default fallback
if (-not $env:PYTORCH_FINAL_PACKAGE_DIR) {
    $env:PYTORCH_FINAL_PACKAGE_DIR = "C:\w\build-results"
}

# Create the final package directory if it doesn't exist
if (-not (Test-Path $env:PYTORCH_FINAL_PACKAGE_DIR)) {
    New-Item -Path $env:PYTORCH_FINAL_PACKAGE_DIR -ItemType Directory -Force | Out-Null
}

# Set script helpers directory
$env:SCRIPT_HELPERS_DIR = Join-Path $ScriptParentDir "win-test-helpers\arm64"

# Run the main build script
& "$env:SCRIPT_HELPERS_DIR\build_pytorch.ps1"

Write-Host "BUILD PASSED"
@@ -1,24 +0,0 @@
#!/bin/bash
set -ex -o pipefail

SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
# shellcheck source=./common.sh
source "$SCRIPT_PARENT_DIR/common.sh"

run_tests() {
  echo Running smoke_test.py...
  python ./.ci/pytorch/smoke_test/smoke_test.py --package torchonly

  echo Running test_autograd.py, test_nn.py, test_torch.py...
  cd test

  CORE_TEST_LIST=("test_autograd.py" "test_nn.py" "test_modules.py")

  for t in "${CORE_TEST_LIST[@]}"; do
    echo "Running test: $t"
    python "$t" --verbose --save-xml --use-pytest -vvvv -rfEsxXP -p no:xdist
  done
}

run_tests
echo "TEST PASSED"
@@ -1,98 +0,0 @@
# TODO: we may be able to reuse the existing build_pytorch.bat for arm64

if ($env:DEBUG -eq "1") {
    $env:BUILD_TYPE = "debug"
} else {
    $env:BUILD_TYPE = "release"
}

# This inflates our log size slightly, but it is REALLY useful to be
# able to see what our cl.exe commands are. (since you can actually
# just copy-paste them into a local Windows setup to just rebuild a
# single file.)
# log sizes are too long, but leaving this here in case someone wants to use it locally
# $env:CMAKE_VERBOSE_MAKEFILE = "1"

$env:INSTALLER_DIR = Join-Path $env:SCRIPT_HELPERS_DIR "installation-helpers"

cd ..

# Environment variables
$env:SCCACHE_IDLE_TIMEOUT = "0"
$env:SCCACHE_IGNORE_SERVER_IO_ERROR = "1"
$env:CMAKE_BUILD_TYPE = $env:BUILD_TYPE
$env:CMAKE_C_COMPILER_LAUNCHER = "sccache"
$env:CMAKE_CXX_COMPILER_LAUNCHER = "sccache"
$env:libuv_ROOT = Join-Path $env:DEPENDENCIES_DIR "libuv\install"
$env:MSSdk = "1"

if ($env:PYTORCH_BUILD_VERSION) {
    $env:PYTORCH_BUILD_VERSION = $env:PYTORCH_BUILD_VERSION
    $env:PYTORCH_BUILD_NUMBER = "1"
}

$env:CMAKE_POLICY_VERSION_MINIMUM = "3.5"

# Set BLAS type
if ($env:ENABLE_APL -eq "1") {
    $env:BLAS = "APL"
    $env:USE_LAPACK = "1"
} elseif ($env:ENABLE_OPENBLAS -eq "1") {
    $env:BLAS = "OpenBLAS"
    $env:OpenBLAS_HOME = Join-Path $env:DEPENDENCIES_DIR "OpenBLAS\install"
}

# Change to source directory
Set-Location $env:PYTORCH_ROOT

# Copy libuv.dll
Copy-Item -Path (Join-Path $env:libuv_ROOT "lib\Release\uv.dll") -Destination "torch\lib\uv.dll" -Force

# Create virtual environment
python -m venv .venv
.\.venv\Scripts\Activate.ps1
where.exe python

# Install Python dependencies
python -m pip install --upgrade pip
pip install setuptools pyyaml
pip install -r requirements.txt

# Set after installing psutil
$env:DISTUTILS_USE_SDK = "1"

# Print all environment variables
Get-ChildItem Env:

# Start and inspect sccache
sccache --start-server
sccache --zero-stats
sccache --show-stats

# Build the wheel
python setup.py bdist_wheel
if ($LASTEXITCODE -ne 0) { exit 1 }

# Install the wheel locally
$whl = Get-ChildItem -Path "dist\*.whl" | Select-Object -First 1
if ($whl) {
    python -mpip install --no-index --no-deps $whl.FullName
}

# Copy final wheel
robocopy "dist" "$env:PYTORCH_FINAL_PACKAGE_DIR" *.whl

# Export test times
python tools/stats/export_test_times.py

# Copy additional CI files
robocopy ".additional_ci_files" "$env:PYTORCH_FINAL_PACKAGE_DIR\.additional_ci_files" /E

# Save ninja log
Copy-Item -Path "build\.ninja_log" -Destination $env:PYTORCH_FINAL_PACKAGE_DIR -Force

# Final sccache stats and stop
sccache --show-stats
sccache --stop-server

exit 0
@@ -41,7 +41,7 @@ fi
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1

# Install Z3 optional dependency for Windows builds.
python -m pip install z3-solver==4.15.1.0
python -m pip install z3-solver==4.12.2.0

# Install tlparse for test\dynamo\test_structured_trace.py UTs.
python -m pip install tlparse==0.3.30
4 .flake8
@@ -7,12 +7,12 @@ max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
    E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
    E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,
    # shebang has extra meaning in fbcode lints, so I think it's not worth trying
    # to line this up with executable bit
    EXE001,
    # these ignores are from flake8-bugbear; please fix!
    B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910
    B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907
    # these ignores are from flake8-comprehensions; please fix!
    C407,
    # these ignores are from flake8-logging-format; please fix!
78 .github/actions/build-android/action.yml (new file)
@@ -0,0 +1,78 @@
name: build android

description: build android for a specific arch

inputs:
  arch:
    description: arch to build
    required: true
  arch-for-build-env:
    description: |
      arch to pass to build environment.
      This is currently different than the arch name we use elsewhere, which
      should be fixed.
    required: true
  github-secret:
    description: github token
    required: true
  build-environment:
    required: true
    description: Top-level label for what's being built/tested.
  docker-image:
    required: true
    description: Name of the base docker image to build with.
  branch:
    required: true
    description: What branch we are building on.
outputs:
  container_id:
    description: Docker container identifier used to build the artifacts
    value: ${{ steps.build.outputs.container_id }}

runs:
  using: composite
  steps:
    - name: Build-${{ inputs.arch }}
      id: build
      shell: bash
      env:
        BRANCH: ${{ inputs.branch }}
        BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-${{ inputs.arch-for-build-env }}-build"
        AWS_DEFAULT_REGION: us-east-1
        PR_NUMBER: ${{ github.event.pull_request.number }}
        SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
        SCCACHE_REGION: us-east-1
        DOCKER_IMAGE: ${{ inputs.docker-image }}
        MATRIX_ARCH: ${{ inputs.arch }}
      run: |
        # detached container should get cleaned up by teardown_ec2_linux
        set -exo pipefail
        export container_name
        container_name=$(docker run \
          -e BUILD_ENVIRONMENT \
          -e MAX_JOBS="$(nproc --ignore=2)" \
          -e AWS_DEFAULT_REGION \
          -e PR_NUMBER \
          -e SHA1 \
          -e BRANCH \
          -e SCCACHE_BUCKET \
          -e SCCACHE_REGION \
          -e SKIP_SCCACHE_INITIALIZATION=1 \
          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
          --security-opt seccomp=unconfined \
          --cap-add=SYS_PTRACE \
          --tty \
          --detach \
          --user jenkins \
          -w /var/lib/jenkins/workspace \
          "${DOCKER_IMAGE}"
        )
        git submodule sync && git submodule update -q --init --recursive --depth 1
        docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace"
        (echo "sudo chown -R jenkins . && .ci/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete" | docker exec -u jenkins -i "${container_name}" bash) 2>&1

        # Copy install binaries back
        mkdir -p "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}"
        docker cp "${container_name}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}"
        echo "container_id=${container_name}" >> "${GITHUB_OUTPUT}"
@@ -70,7 +70,7 @@ runs:
  set -eux
  # PyYAML 6.0 doesn't work with MacOS x86 anymore
  # This must run on Python-3.7 (AmazonLinux2) so can't use requests=2.32.2
  python3 -m pip install requests==2.27.1 pyyaml==6.0.2
  python3 -m pip install requests==2.27.1 pyyaml==6.0.1

- name: Parse ref
  id: parse-ref
2 .github/ci_commit_pins/audio.txt
@@ -1 +1 @@
f6dfe1231dcdd221a68416e49ab85c2575cbb824
00b0c91db92c51a11356249262577b9fa26c18c5

2 .github/ci_commit_pins/fbgemm_rocm.txt
@@ -1 +1 @@
7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8
5fb5024118e9bb9decf96c2b0b1a8f0010bf56be

2 .github/ci_commit_pins/vllm.txt
@@ -1 +1 @@
8f605ee30912541126c0fe46d0c8c413101b600a
29d1ffc5b4c763ef76aff9e3f617fa60dd292418

2 .github/ci_commit_pins/xla.txt
@@ -1 +1 @@
29ae4c76c026185f417a25e841d2cd5e65f087a3
1c00dea2c9adb2137903c86b4191e8c247f8fda9
28 .github/merge_rules.yaml
@@ -131,6 +131,21 @@
  - Lint
  - pull

- name: Mobile
  patterns:
  - ios/**
  - android/**
  - test/mobile/**
  approved_by:
  - linbinyu
  - IvanKobzarev
  - dreiss
  - raziel
  mandatory_checks_name:
  - EasyCLA
  - Lint
  - pull

- name: PrimTorch
  patterns:
  - torch/_meta_registrations.py
@@ -477,19 +492,6 @@
  - srossross
  - chillee
  - zou3519
  - guilhermeleobas
  mandatory_checks_name:
  - EasyCLA
  - Lint
  - pull

- name: Dynamo
  patterns:
  - torch/_dynamo/**
  - torch/csrc/dynamo/**
  - test/dynamo/**
  approved_by:
  - guilhermeleobas
  mandatory_checks_name:
  - EasyCLA
  - Lint
2 .github/pytorch-probot.yml
@@ -31,9 +31,7 @@ ciflow_push_tags:
- ciflow/pull
- ciflow/h100
- ciflow/h100-distributed
- ciflow/win-arm64
- ciflow/h100-symm-mem
- ciflow/h100-cutlass-backend
retryable_workflows:
- pull
- trunk
6 .github/requirements-gha-cache.txt
@@ -7,9 +7,9 @@
# .ci/docker/requirements-ci.txt
boto3==1.35.42
jinja2==3.1.6
lintrunner==0.12.7
lintrunner==0.10.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84
pyyaml==6.0.2
pyyaml==6.0
requests==2.32.4
rich==14.1.0
rich==10.9.0

@@ -2,7 +2,7 @@ boto3==1.35.42
cmake==3.27.*
expecttest==0.3.0
fbscribelogger==0.1.7
filelock==3.13.1
filelock==3.6.0
hypothesis==6.56.4
librosa>=0.6.2
mpmath==1.3.0
@@ -33,4 +33,4 @@ tensorboard==2.13.0
typing-extensions==4.12.2
unittest-xml-reporting<=3.2.0,>=2.0.0
xdoctest==1.1.0
z3-solver==4.15.1.0
z3-solver==4.12.2.0
2 .github/scripts/lintrunner.sh
@@ -2,7 +2,7 @@
set -ex

# Use uv to speed up lintrunner init
python3 -m pip install -U uv==0.8.* setuptools
python3 -m pip install uv==0.1.45 setuptools

CACHE_DIRECTORY="/tmp/.lintbin"
# Try to recover the cached binaries
4 .github/workflows/_get-changed-files.yml
@@ -27,7 +27,7 @@ jobs:
  PR_NUMBER="${{ github.event.number }}"

  # Use gh CLI to get changed files in the PR with explicit repo
  CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
  CHANGED_FILES=$(gh pr view "$PR_NUMBER" --repo "${{ github.repository }}" --json files --jq '.files[].path' | tr '\n' ' ' | sed 's/ $//')

  if [ -z "$CHANGED_FILES" ]; then
    echo "No changed files found, setting to '*'"
@@ -40,4 +40,4 @@
  else
    echo "Not in PR context, setting changed files to '*'"
    echo "changed-files=*" >> "$GITHUB_OUTPUT"
  fi
fi
4 .github/workflows/_rocm-test.yml
@@ -269,8 +269,8 @@ jobs:
    # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
    docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"

- name: Change permissions (only needed for MI300 and MI355 kubernetes runners for now)
  if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'mi300') || contains(matrix.runner, 'mi355')) }}
- name: Change permissions (only needed for MI300 runners for now)
  if: ${{ always() && steps.test.conclusion && contains(matrix.runner, 'mi300') }}
  run: |
    docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
@@ -56,7 +56,7 @@ jobs:
  cache: pip
  architecture: x64

- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
  shell: bash

- name: Verify mergeability
2 .github/workflows/cherry-pick.yml
@@ -26,7 +26,7 @@ jobs:
  cache: pip

# Not the direct dependencies but the script uses trymerge
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0

- name: Setup committer id
  run: |
1 .github/workflows/docker-builds.yml
@@ -50,7 +50,6 @@ jobs:
  runner: [linux.12xlarge]
  docker-image-name: [
    pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
    pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
    pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
    pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
    pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
2 .github/workflows/docker-release.yml
@@ -144,7 +144,7 @@ jobs:
  run: |
    make -f docker.Makefile "${BUILD_IMAGE_TYPE}-image"
- name: Push nightly tags
  if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.platform == 'linux/amd64' }}
  if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.build_platforms == 'linux/amd64' }}
  run: |
    PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime"
    CUDA_SUFFIX="-cu${CUDA_VERSION}"
58 .github/workflows/h100-cutlass-backend.yml
@@ -1,58 +0,0 @@
name: Limited CI for CUTLASS backend on H100

on:
  pull_request:
    paths:
      - .github/workflows/h100-cutlass-backend.yml
  workflow_dispatch:
  schedule:
    - cron: 22 9 * * *  # every 24 hours about 2:22am PDT
  push:
    tags:
      - ciflow/h100-cutlass-backend/*

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

permissions:
  id-token: write
  contents: read

jobs:

  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}

  linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend:
    name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '9.0'
      test-matrix: |
        { include: [
          { config: "h100_cutlass_backend", shard: 1, num_shards: 1, runner: "linux.aws.h100", owners: ["oncall:pt2"] },
        ]}
    secrets: inherit

  linux-jammy-cuda12_8-py3_10-gcc11-sm90-test:
    name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
    uses: ./.github/workflows/_linux-test.yml
    needs:
      - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend.outputs.test-matrix }}
    secrets: inherit
@@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100

on:
  schedule:
    - cron: 15 0,12 * * 1-6
    - cron: 15 0,4,8,12,16,20 * * 1-6
    - cron: 0 7 * * 0
  # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
  # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
@@ -126,7 +126,7 @@ jobs:
  name: cuda12.8-py3.10-gcc9-sm90
  uses: ./.github/workflows/_linux-test.yml
  needs: build
  if: github.event.schedule == '15 0,12 * * 1-6'
  if: github.event.schedule == '15 0,4,8,12,16,20 * * 1-6'
  with:
    build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
    dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
15 .github/workflows/pull.yml
@@ -315,6 +315,21 @@ jobs:
    test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
    secrets: inherit

  linux-jammy-py3-clang18-mobile-build:
    name: linux-jammy-py3-clang18-mobile-build
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3-clang12-mobile-build
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
      build-generates-artifacts: false
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 1 },
        ]}
    secrets: inherit

  linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build:
    name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
    uses: ./.github/workflows/_linux-build.yml
2 .github/workflows/revert.yml
@@ -26,7 +26,7 @@ jobs:
  architecture: x64
  check-latest: false
  cache: pip
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0

- name: Setup committer id
  run: |
68 .github/workflows/rocm-mi355.yml
@@ -1,68 +0,0 @@
name: rocm-mi355

on:
  workflow_dispatch:
  schedule:
    - cron: 30 9 * * *  # about 2:30am PDT

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

permissions: read-all

jobs:
  target-determination:
    if: github.repository_owner == 'pytorch'
    name: before-test
    uses: ./.github/workflows/target_determination.yml
    permissions:
      id-token: write
      contents: read

  get-label-type:
    name: get-label-type
    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}

  linux-noble-rocm-py3_12-build:
    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
    name: linux-noble-rocm-py3.12-mi355
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-noble-rocm-py3.12-mi355
      docker-image-name: ci-image:pytorch-linux-noble-rocm-alpha-py3
      sync-tag: rocm-build
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
          { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
          { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
          { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
          { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
          { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
        ]}
    secrets: inherit

  linux-noble-rocm-py3_12-test:
    permissions:
      id-token: write
      contents: read
    name: linux-noble-rocm-py3.12-mi355
    uses: ./.github/workflows/_rocm-test.yml
    needs:
      - linux-noble-rocm-py3_12-build
      - target-determination
    with:
      build-environment: linux-noble-rocm-py3.12-mi355
      docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
      tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
    secrets: inherit
2 .github/workflows/trymerge.yml
@@ -28,7 +28,7 @@ jobs:
  check-latest: false
  cache: pip
  architecture: x64
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0

- name: Setup committer id
  run: |

2 .github/workflows/tryrebase.yml
@@ -25,7 +25,7 @@ jobs:
  architecture: x64
  check-latest: false
  cache: pip
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0

- name: Setup committer id
  run: |
1 .github/workflows/upload-test-stats.yml
@@ -14,7 +14,6 @@ on:
- inductor-periodic
- rocm
- rocm-mi300
- rocm-mi355
- inductor-micro-benchmark
- inductor-micro-benchmark-x86
- inductor-cu124
187 .github/workflows/win-arm64-build-test.yml
@@ -1,187 +0,0 @@
name: windows-arm64-build-test

on:
  push:
    tags:
      - ciflow/win-arm64/*

env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
  PYTHON_VERSION: "3.12"
  PYTORCH_ROOT: ${{ github.workspace }}/pytorch
  DOWNLOADS_DIR: c:\temp\downloads
  DEPENDENCIES_DIR: c:\temp\dependencies
  ENABLE_APL: 1
  ENABLE_OPENBLAS: 0
  BUILD_TYPE: release

permissions:
  id-token: write
  contents: read

jobs:
  build:
    # Don't run on forked repos.
    if: github.repository_owner == 'pytorch'
    runs-on: "windows-11-arm64-preview"
    timeout-minutes: 240
    steps:
      - name: configure aws credentials
        id: aws_creds
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_sscache
          aws-region: us-east-1
          role-duration-seconds: 18000

      - name: Enable long paths
        shell: cmd
        run: |
          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
          git config --system core.longpaths true

      - name: Git checkout PyTorch
        uses: actions/checkout@v4
        with:
          path: pytorch
          submodules: recursive

      - name: Bootstrap Python
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"

      - name: Parse ref
        id: parse-ref
        shell: bash
        run: python pytorch/.github/scripts/parse_ref.py

      - name: Get workflow job id
        shell: bash
        id: get-job-id
        run: |
          set -eux
          python pytorch/.github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Bootstrap APL
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"

      - name: Bootstrap Rust
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"

      - name: Bootstrap sccache
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"

      - name: Bootstrap Libuv
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"

      - name: Build
        id: build
        shell: cmd
        env:
          PYTORCH_FINAL_PACKAGE_DIR: C:/${{ github.run_id }}/build-results/
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          BUILD_WHEEL: 1
          MAX_JOBS: 8
          PYTHON_VERSION: "3.12"
          SCCACHE_BUCKET: "ossci-compiler-cache"
          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          SCCACHE_REGION: us-east-1
          VC_PRODUCT: "BuildTools"
          VC_VERSION: ""
          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
          AWS_DEFAULT_REGION: us-east-1
          USE_CUDA: '0'
          USE_XPU: '0'
          OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
        run: |
          cd pytorch
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
          powershell -ExecutionPolicy Bypass -File ".ci/pytorch/win-arm64-build.ps1"

      - name: Upload artifacts
        uses: actions/upload-artifact@v4.4.0
        if: always()
        with:
          name: torch-wheel-win-arm64-py3-12
          retention-days: 14
          if-no-files-found: error
          path: C:\${{ github.run_id }}\build-results

  test:
    if: github.repository_owner == 'pytorch'
    strategy:
      fail-fast: false
    runs-on: "windows-11-arm64-preview"
    needs: build
    steps:
      - name: Enable long paths
        shell: cmd
        run: |
          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
          git config --system core.longpaths true

      - name: Git checkout PyTorch
        uses: actions/checkout@v4
        with:
          path: pytorch
          submodules: recursive

      - name: Bootstrap Python
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"

      - name: Bootstrap Rust
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"

      - name: Get workflow job id
        shell: bash
        id: get-job-id
        run: |
          set -eux
          python pytorch/.github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Download Build Artifacts
        uses: actions/download-artifact@v4.1.7
        with:
          name: torch-wheel-win-arm64-py3-12
          path: C:\${{ github.run_id }}\build-results

      - name: Test
        id: test
        shell: cmd
        env:
          USE_CUDA: '0'
          INSTALL_WINDOWS_SDK: 1
          PYTHON_VERSION: "3.12"
          VC_PRODUCT: "BuildTools"
          AWS_DEFAULT_REGION: us-east-1
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          PYTORCH_FINAL_PACKAGE_DIR: C:/${{ github.run_id }}/build-results/
        run: |
          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
          call pytorch/.ci/pytorch/windows/arm64/bootstrap_tests.bat
          set GIT_BASH=C:\Program Files\Git\usr\bin\bash.exe
          "%GIT_BASH%" -c "bash --noprofile --norc .ci/pytorch/win-arm64-test.sh"
@@ -39,16 +39,16 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
    'flake8==7.3.0',
    'flake8-bugbear==24.12.12',
    'flake8-comprehensions==3.16.0',
    'flake8==6.1.0',
    'flake8-bugbear==23.3.23',
    'flake8-comprehensions==3.15.0',
    'flake8-executable==2.1.3',
    'flake8-logging-format==2024.24.12',
    'flake8-pyi==25.5.0',
    'flake8-simplify==0.22.0',
    'flake8-logging-format==0.9.0',
    'flake8-pyi==23.3.1',
    'flake8-simplify==0.19.3',
    'mccabe==0.7.0',
    'pycodestyle==2.14.0',
    'pyflakes==3.4.0',
    'pycodestyle==2.11.1',
    'pyflakes==3.1.0',
    'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
]
@@ -158,7 +158,7 @@ init_command = [
    'mypy==1.16.0',
    'sympy==1.13.3',
    'types-requests==2.27.25',
    'types-pyyaml==6.0.2',
    'types-pyyaml==6.0.1',
    'types-tabulate==0.8.8',
    'types-protobuf==5.29.1.20250403',
    'types-setuptools==79.0.0.20250422',
@@ -166,8 +166,8 @@ init_command = [
    'types-colorama==0.4.6',
    'filelock==3.13.1',
    'junitparser==2.1.1',
    'rich==14.1.0',
    'pyyaml==6.0.2',
    'rich==10.9.0',
    'pyyaml==6.0.1',
    'optree==0.13.0',
    'dataclasses-json==0.6.7',
    'pandas==2.2.3',
@@ -1111,7 +1111,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
    'pyyaml==6.0.2',
    'PyYAML==6.0.1',
]

[[linter]]
@@ -1133,7 +1133,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
    'pyyaml==6.0.2',
    'PyYAML==6.0.1',
]

[[linter]]
@@ -1794,12 +1794,3 @@ include_patterns = [
    'torch/header_only_apis.txt',
]
is_formatter = false


[[linter]]
code = "GB_REGISTRY"
include_patterns = ["torch/_dynamo/**/*.py"]
command = [
    "python3",
    "tools/linter/adapters/gb_registry_linter.py",
]
15 Dockerfile
@@ -47,6 +47,18 @@ WORKDIR /opt/pytorch
COPY . .
RUN git submodule update --init --recursive

FROM conda as build
ARG CMAKE_VARS
WORKDIR /opt/pytorch
COPY --from=conda /opt/conda /opt/conda
COPY --from=submodule-update /opt/pytorch /opt/pytorch
RUN make triton
RUN --mount=type=cache,target=/opt/ccache \
    export eval ${CMAKE_VARS} && \
    TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
    CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
    python -m pip install --no-build-isolation -v .

FROM conda as conda-installs
ARG PYTHON_VERSION=3.11
ARG CUDA_PATH=cu121
@@ -97,5 +109,4 @@ WORKDIR /workspace

FROM official as dev
# Should override the already installed version from the official-image stage
COPY --from=conda /opt/conda /opt/conda
COPY --from=submodule-update /opt/pytorch /opt/pytorch
COPY --from=build /opt/conda /opt/conda
@@ -294,12 +294,14 @@ Install PyTorch

```bash
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
python -m pip install -r requirements-build.txt
python -m pip install --no-build-isolation -v -e .
```

**On macOS**

```bash
python -m pip install -r requirements-build.txt
python -m pip install --no-build-isolation -v -e .
```
@@ -586,10 +586,17 @@ if(USE_CUDA AND NOT USE_ROCM)
    CUDA::cufft_static_nocallback
  )
  if(NOT BUILD_LAZY_CUDA_LINALG)
    list(APPEND ATen_CUDA_DEPENDENCY_LIBS
      CUDA::cusolver_static
      ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static
    )
    if(CUDA_VERSION_MAJOR LESS_EQUAL 11)
      list(APPEND ATen_CUDA_DEPENDENCY_LIBS
        CUDA::cusolver_static
        ${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a # needed for libcusolver_static
      )
    elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12)
      list(APPEND ATen_CUDA_DEPENDENCY_LIBS
        CUDA::cusolver_static
        ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static
      )
    endif()
  endif()
else()
  list(APPEND ATen_CUDA_DEPENDENCY_LIBS
@@ -14,9 +14,7 @@
#include <ATen/cpu/FlushDenormal.h>

#ifdef USE_FBGEMM
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
#include <fbgemm/Fbgemm.h>
C10_DIAGNOSTIC_POP()
#endif // USE_FBGEMM
#if defined(__aarch64__) && !defined(C10_MOBILE)
#include <cpuinfo.h>
@@ -334,14 +332,6 @@ void Context::setBenchmarkLimitCuDNN(int b) {
  benchmark_limit_cudnn = b;
}

bool Context::immediateMiopen() const {
  return immediate_miopen;
}

void Context::setImmediateMiopen(bool b) {
  immediate_miopen = b;
}

bool Context::allowTF32CuBLAS() const {
#ifdef USE_ROCM
  const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
@@ -512,7 +502,7 @@ at::BlasBackend Context::blasPreferredBackend() {
  static const std::vector<std::string> archs = {
      "gfx90a", "gfx942",
#if ROCM_VERSION >= 60300
      "gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
      "gfx1100", "gfx1101", "gfx1200", "gfx1201",
#endif
#if ROCM_VERSION >= 60500
      "gfx950"
@@ -205,8 +205,6 @@ class TORCH_API Context {
  void setBenchmarkCuDNN(bool);
  int benchmarkLimitCuDNN() const;
  void setBenchmarkLimitCuDNN(int);
  bool immediateMiopen() const;
  void setImmediateMiopen(bool);
  bool deterministicCuDNN() const;
  void setDeterministicCuDNN(bool);
  bool deterministicMkldnn() const;
@@ -442,7 +440,6 @@ class TORCH_API Context {
  bool enabled_overrideable = true;
  bool allow_fp16_bf16_reduction_mathSDP = false;
  bool benchmark_cudnn = false;
  bool immediate_miopen = false;
  Float32MatmulPrecision float32_matmul_precision =
      c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true
          ? at::Float32MatmulPrecision::HIGH
@@ -132,9 +132,6 @@ DLDevice torchDeviceToDLDevice(at::Device device) {
    case DeviceType::PrivateUse1:
      ctx.device_type = DLDeviceType::kDLExtDev;
      break;
    case DeviceType::MPS:
      ctx.device_type = DLDeviceType::kDLMetal;
      break;
    default:
      TORCH_CHECK_BUFFER(false, "Cannot pack tensors on " + device.str());
  }
@@ -167,8 +164,6 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat
      return at::Device(DeviceType::MAIA, index);
    case DLDeviceType::kDLExtDev:
      return at::Device(DeviceType::PrivateUse1, index);
    case DLDeviceType::kDLMetal:
      return at::Device(DeviceType::MPS, index);
    default:
      TORCH_CHECK_BUFFER(
          false, "Unsupported device_type: ", std::to_string(type));
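The hunks above change which device types the DLPack bridge accepts (the MPS ↔ kDLMetal mapping). A minimal sketch of the Python-level round-trip that this C++ switch backs, shown on CPU only with the public `torch.utils.dlpack` API (device-specific branches are exactly what the switch above selects):

```python
import torch
from torch.utils.dlpack import to_dlpack, from_dlpack

x = torch.arange(4, dtype=torch.float32)  # CPU tensor; other devices follow the same path
capsule = to_dlpack(x)                    # torchDeviceToDLDevice picks the DLDeviceType
y = from_dlpack(capsule)                  # getATenDevice maps it back to an at::Device
assert y.data_ptr() == x.data_ptr()       # exchange is zero-copy
```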
@@ -1,5 +1,6 @@
#pragma once

#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>

@@ -72,6 +73,27 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
// original device index that was active before the change.
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);

TORCH_API inline void emptyCache() {
  const auto device_type = getAccelerator(true).value();
  at::getDeviceAllocator(device_type)->emptyCache();
}

TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
    c10::DeviceIndex device_index) {
  const auto device_type = getAccelerator(true).value();
  return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
}

TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
  const auto device_type = getAccelerator(true).value();
  at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
}

TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
  const auto device_type = getAccelerator(true).value();
  at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
}

} // namespace at::accelerator

namespace at {
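These inline helpers route cache and memory-stat operations through the generic device allocator of whatever accelerator is currently active. A rough Python analogue, hedged to the CUDA backend only (where the corresponding per-backend functions are exposed today); it is meant as an illustration of what the C++ helpers aggregate, not as the binding for them:

```python
import torch

if torch.cuda.is_available():
    torch.cuda.empty_cache()                      # ~ at::accelerator::emptyCache()
    stats = torch.cuda.memory_stats(device=0)     # ~ at::accelerator::getDeviceStats(0)
    torch.cuda.reset_peak_memory_stats(device=0)  # ~ at::accelerator::resetPeakStats(0)
    print(stats.get("allocated_bytes.all.current", 0))
```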
@@ -9,36 +9,7 @@

namespace at {

/*
 * Design:
 * 1. ZeroTensors are regular tensors with TensorOptions, a storage
 *    pointing to nullptr and a ZeroTensor dispatch key set.
 *
 * 2. ZeroTensors are immutable. This is done to prevent data race in the case of multithreading
 *    (when two threads try to read the same zero tensor and materialize it in-place).
 *
 * 3. ZeroTensor has a boxed fallback that will be dispatched to any ops that don't
 *    have special ZeroTensor handling. This fallback materializes each ZeroTensor to
 *    `at::zeros({}, tensor.options()).expand(tensor.sizes())`.
 *
 * 4. ZeroTensors are handled above autograd. This is necessary because fallback
 *    operations are not differentiable.
 *    - Example: Consider add in the case it was using the fallback: zerotensor_a + b.
 *      zerotensor_a would be materialized to c=torch.zeros_like(zerotensor_a) after
 *      passing through the fallback. If this happens above the autograd, then the
 *      gradients would be populated on c instead of zerotensor_a.
 *
 * 5. The grad field is always populated with an honest to goodness tensor. This
 *    materialization of ZeroTensors will happen in:
 *    - AccumulateGrad for Backward Mode AD.
 *    - will never be required for ForwardMode AD.
 *      - This is because if all the tangents were undefined (efficient ZeroTensors),
 *        no computation will be performed (this is ensured via an existing pre-check).
 *
 * Today ZeroTensors are primarily used to represent undefined gradients in forward AD,
 * it does not perfectly handle NaNs and Infs as we don't check the actual values
 * and assume that they are non-zero, non-inf, non-NaN etc.
 */
// TODO: add a note explaining the design decisions
// ZeroTensors are designed to be immutable. Thus, we error out when an in-place operation is performed on ZeroTensors
static void zeroTensorFallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
  const auto& arguments = op.schema().arguments();
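The design comment above says the boxed fallback materializes a ZeroTensor as `at::zeros({}, tensor.options()).expand(tensor.sizes())`. A small Python sketch of that one materialization expression (an illustration of the formula in the comment, not of the dispatcher mechanics around it):

```python
import torch

def materialize_like(t: torch.Tensor) -> torch.Tensor:
    # Equivalent of at::zeros({}, t.options()).expand(t.sizes()):
    # a 0-d zero scalar broadcast to t's shape without allocating t.numel() elements.
    return torch.zeros((), dtype=t.dtype, device=t.device).expand(t.shape)

x = torch.randn(3, 4)
z = materialize_like(x)
assert z.shape == x.shape and z.stride() == (0, 0)  # expanded view, stride 0
```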
@@ -1 +1,55 @@
#include <torch/headeronly/cpu/vec/intrinsics.h>
#pragma once
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC or clang-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#elif defined(__clang__) && (defined(__ARM_NEON__) || defined(__aarch64__))
/* Clang-compatible compiler, targeting arm neon */
#include <arm_neon.h>
#if defined(__ARM_FEATURE_SVE)
/* CLANG-compatible compiler, targeting ARM with SVE */
#include <arm_sve.h>
#endif
#elif defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#if _MSC_VER <= 1900
#define _mm256_extract_epi64(X, Y) \
  (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2))
#define _mm256_extract_epi32(X, Y) \
  (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4))
#define _mm256_extract_epi16(X, Y) \
  (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8))
#define _mm256_extract_epi8(X, Y) \
  (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16))
#endif
#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__))
/* GCC-compatible compiler, targeting ARM with NEON */
#include <arm_neon.h>
#if defined(__ARM_FEATURE_SVE)
/* GCC-compatible compiler, targeting ARM with SVE */
#include <arm_sve.h>
#endif
#if defined(MISSING_ARM_VLD1)
#include <ATen/cpu/vec/vec256/missing_vld1_neon.h>
#elif defined(MISSING_ARM_VST1)
#include <ATen/cpu/vec/vec256/missing_vst1_neon.h>
#endif
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif defined(__s390x__)
// targets Z/architecture
// we will include vecintrin later
#elif (defined(__GNUC__) || defined(__xlC__)) && \
    (defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
/* We need to undef those tokens defined by <altivec.h> to avoid conflicts
   with the C++ types. => Can still use __bool/__vector */
#undef bool
#undef vector
#undef pixel
#elif defined(__GNUC__) && defined(__SPE__)
/* GCC-compatible compiler, targeting PowerPC with SPE */
#include <spe.h>
#endif
@@ -5,7 +5,6 @@
#include <ATen/cpu/vec/sve/vec_common_sve.h>
#include <ATen/cpu/vec/sve/vec_float.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/bit_cast.h>
#include <cmath>
namespace at {
namespace vec {
@@ -37,7 +36,7 @@ class Vectorized<BFloat16> {
    return VECTOR_WIDTH / sizeof(BFloat16);
  }

  Vectorized();
  Vectorized() {}
  Vectorized(svbfloat16_t v) : values(v) {}
  Vectorized(int val);
  Vectorized(BFloat16 val);
@@ -307,11 +306,6 @@ Vectorized<c10::BFloat16> inline operator/(
  return binary_operator_via_float(std::divides<Vectorized<float>>(), a, b);
}

inline Vectorized<BFloat16>::Vectorized() {
  const short zero = 0;
  values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero));
}

inline Vectorized<BFloat16>::Vectorized(int val) {
  auto vals_f = svdup_n_f32(val);
  values = convert_float_bfloat16(vals_f, vals_f);
@@ -38,9 +38,7 @@ class Vectorized<double> {
  static constexpr size_type size() {
    return VECTOR_WIDTH / sizeof(double);
  }
  Vectorized() {
    values = svdup_n_f64(0);
  }
  Vectorized() {}
  Vectorized(svfloat64_t v) : values(v) {}
  Vectorized(double val) {
    values = svdup_n_f64(val);
@@ -587,30 +585,6 @@ Vectorized<double> inline fmadd(
  return svmad_f64_x(ptrue, a, b, c);
}

template <>
Vectorized<double> inline fnmadd(
    const Vectorized<double>& a,
    const Vectorized<double>& b,
    const Vectorized<double>& c) {
  return svmsb_f64_x(ptrue, a, b, c);
}

template <>
Vectorized<double> inline fmsub(
    const Vectorized<double>& a,
    const Vectorized<double>& b,
    const Vectorized<double>& c) {
  return svnmsb_f64_x(ptrue, a, b, c);
}

template <>
Vectorized<double> inline fnmsub(
    const Vectorized<double>& a,
    const Vectorized<double>& b,
    const Vectorized<double>& c) {
  return svnmad_f64_x(ptrue, a, b, c);
}

#endif // defined(CPU_CAPABILITY_SVE)

} // namespace CPU_CAPABILITY
@@ -38,9 +38,7 @@ class Vectorized<float> {
  static constexpr size_type size() {
    return VECTOR_WIDTH / sizeof(float);
  }
  Vectorized() {
    values = svdup_n_f32(0);
  }
  Vectorized() {}
  Vectorized(svfloat32_t v) : values(v) {}
  Vectorized(float val) {
    values = svdup_n_f32(val);
@@ -758,30 +756,6 @@ Vectorized<float> inline fmadd(
  return svmad_f32_x(ptrue, a, b, c);
}

template <>
Vectorized<float> inline fnmadd(
    const Vectorized<float>& a,
    const Vectorized<float>& b,
    const Vectorized<float>& c) {
  return svmsb_f32_x(ptrue, a, b, c);
}

template <>
Vectorized<float> inline fmsub(
    const Vectorized<float>& a,
    const Vectorized<float>& b,
    const Vectorized<float>& c) {
  return svnmsb_f32_x(ptrue, a, b, c);
}

template <>
Vectorized<float> inline fnmsub(
    const Vectorized<float>& a,
    const Vectorized<float>& b,
    const Vectorized<float>& c) {
  return svnmad_f32_x(ptrue, a, b, c);
}

#endif // defined(CPU_CAPABILITY_SVE)

} // namespace CPU_CAPABILITY
@@ -32,9 +32,7 @@ inline namespace CPU_CAPABILITY {
  static constexpr size_type size() { \
    return vl; \
  } \
  Vectorized() { \
    values = svdup_n_s##bit(0); \
  } \
  Vectorized() {} \
  Vectorized(svint##bit##_t v) : values(v) {} \
  Vectorized(int##bit##_t val) { \
    values = svdup_n_s##bit(val); \
@@ -552,15 +552,6 @@ Vectorized<c10::BFloat16> inline fmadd(
  return a * b + c;
}

template <>
Vectorized<c10::BFloat16> inline fnmadd(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
  // See NOTE [BF16 FMA] above.
  return -a * b + c;
}

template <>
Vectorized<c10::BFloat16> inline fmsub(
    const Vectorized<c10::BFloat16>& a,
@@ -570,15 +561,6 @@ Vectorized<c10::BFloat16> inline fmsub(
  return a * b - c;
}

template <>
Vectorized<c10::BFloat16> inline fnmsub(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
  // See NOTE [BF16 FMA] above.
  return -a * b - c;
}

#endif // !defined(C10_MOBILE) && defined(__aarch64__)

} // namespace CPU_CAPABILITY
@@ -83,9 +83,7 @@ class Vectorized<float> {
  static constexpr size_type size() {
    return 4;
  }
  Vectorized() {
    values = vmovq_n_f32(0);
  }
  Vectorized() {}
  Vectorized(float32x4_t v) : values(v) {}
  Vectorized(float val) : values{vdupq_n_f32(val)} {}
  Vectorized(float val0, float val1, float val2, float val3)
@@ -584,14 +582,6 @@ Vectorized<float> inline fmadd(
  return Vectorized<float>(vfmaq_f32(c, a, b));
}

template <>
Vectorized<float> inline fnmadd(
    const Vectorized<float>& a,
    const Vectorized<float>& b,
    const Vectorized<float>& c) {
  return Vectorized<float>(vfmsq_f32(c, a, b));
}

template <>
Vectorized<float> inline fmsub(
    const Vectorized<float>& a,
@@ -600,14 +590,6 @@ Vectorized<float> inline fmsub(
  return Vectorized<float>(vnegq_f32(vfmsq_f32(c, a, b)));
}

template <>
Vectorized<float> inline fnmsub(
    const Vectorized<float>& a,
    const Vectorized<float>& b,
    const Vectorized<float>& c) {
  return Vectorized<float>(vnegq_f32(vfmaq_f32(c, a, b)));
}

inline Vectorized<float> Vectorized<float>::erf() const {
  // constants
  const Vectorized<float> neg_zero_vec(-0.f);
@@ -621,18 +621,6 @@ Vectorized<c10::Half> inline fmadd(
#endif
}

template <>
Vectorized<c10::Half> inline fnmadd(
    const Vectorized<c10::Half>& a,
    const Vectorized<c10::Half>& b,
    const Vectorized<c10::Half>& c) {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  return Vectorized<c10::Half>(vfmsq_f16(c, a, b));
#else
  return -a * b + c;
#endif
}

template <>
Vectorized<c10::Half> inline fmsub(
    const Vectorized<c10::Half>& a,
@@ -644,18 +632,6 @@ Vectorized<c10::Half> inline fmsub(
  return a * b - c;
#endif
}

template <>
Vectorized<c10::Half> inline fnmsub(
    const Vectorized<c10::Half>& a,
    const Vectorized<c10::Half>& b,
    const Vectorized<c10::Half>& c) {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  return Vectorized<c10::Half>(vnegq_f16(vfmaq_f16(c, a, b)));
#else
  return -a * b - c;
#endif
}
#endif // !defined(C10_MOBILE) && defined(__aarch64__)

} // namespace CPU_CAPABILITY
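The SVE and NEON hunks above all concern the negated fused-multiply variants (`fnmadd`, `fmsub`, `fnmsub`). Their element-wise semantics, which the BF16 fallback branches spell out directly (`-a * b + c`, `a * b - c`, `-a * b - c`), are checked in this small sketch using plain tensor math rather than the vectorized kernels:

```python
import torch

a, b, c = (torch.randn(8) for _ in range(3))

fmadd  = a * b + c    # fmadd(a, b, c)
fnmadd = -a * b + c   # fnmadd: NEON vfmsq_f32(c, a, b) == c - a*b
fmsub  = a * b - c    # fmsub:  -(c - a*b)
fnmsub = -a * b - c   # fnmsub: -(c + a*b)

torch.testing.assert_close(fnmadd, c - a * b)
torch.testing.assert_close(fmsub, -(c - a * b))
torch.testing.assert_close(fnmsub, -(c + a * b))
```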
@@ -1 +1,396 @@
#include <torch/headeronly/cpu/vec/vec256/missing_vld1_neon.h>
/* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7. */

__extension__ extern __inline uint8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u8_x2(const uint8_t* __a) {
  uint8x8x2_t ret;
  asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s8_x2(const int8_t* __a) {
  int8x8x2_t ret;
  asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u16_x2(const uint16_t* __a) {
  uint16x4x2_t ret;
  asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s16_x2(const int16_t* __a) {
  int16x4x2_t ret;
  asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u32_x2(const uint32_t* __a) {
  uint32x2x2_t ret;
  asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s32_x2(const int32_t* __a) {
  int32x2x2_t ret;
  asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u64_x2(const uint64_t* __a) {
  uint64x1x2_t ret;
  asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s64_x2(const int64_t* __a) {
  int64x1x2_t ret;
  __builtin_aarch64_simd_oi __o;
  asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline float16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f16_x2(const float16_t* __a) {
  float16x4x2_t ret;
  asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline float32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f32_x2(const float32_t* __a) {
  float32x2x2_t ret;
  asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline float64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f64_x2(const float64_t* __a) {
  float64x1x2_t ret;
  asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline poly8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p8_x2(const poly8_t* __a) {
  poly8x8x2_t ret;
  asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline poly16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p16_x2(const poly16_t* __a) {
  poly16x4x2_t ret;
  asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline poly64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p64_x2(const poly64_t* __a) {
  poly64x1x2_t ret;
  asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u8_x2(const uint8_t* __a) {
  uint8x16x2_t ret;
  asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s8_x2(const int8_t* __a) {
  int8x16x2_t ret;
  asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u16_x2(const uint16_t* __a) {
  uint16x8x2_t ret;
  asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s16_x2(const int16_t* __a) {
  int16x8x2_t ret;
  asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u32_x2(const uint32_t* __a) {
  uint32x4x2_t ret;
  asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s32_x2(const int32_t* __a) {
  int32x4x2_t ret;
  asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline uint64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u64_x2(const uint64_t* __a) {
  uint64x2x2_t ret;
  asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline int64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s64_x2(const int64_t* __a) {
  int64x2x2_t ret;
  asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline float16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f16_x2(const float16_t* __a) {
  float16x8x2_t ret;
  asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}

__extension__ extern __inline float32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f32_x2(const float32_t* __a) {
  float32x4x2_t ret;
  asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
  return ret;
}
||||
__extension__ extern __inline float64x2x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_f64_x2(const float64_t* __a) {
|
||||
float64x2x2_t ret;
|
||||
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline poly8x16x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_p8_x2(const poly8_t* __a) {
|
||||
poly8x16x2_t ret;
|
||||
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline poly16x8x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_p16_x2(const poly16_t* __a) {
|
||||
poly16x8x2_t ret;
|
||||
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline poly64x2x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_p64_x2(const poly64_t* __a) {
|
||||
poly64x2x2_t ret;
|
||||
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* vst1x2 */

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s64_x2(int64_t* __a, int64x1x2_t val) {
  asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u64_x2(uint64_t* __a, uint64x1x2_t val) {
  asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f64_x2(float64_t* __a, float64x1x2_t val) {
  asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s8_x2(int8_t* __a, int8x8x2_t val) {
  asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p8_x2(poly8_t* __a, poly8x8x2_t val) {
  asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s16_x2(int16_t* __a, int16x4x2_t val) {
  asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p16_x2(poly16_t* __a, poly16x4x2_t val) {
  asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s32_x2(int32_t* __a, int32x2x2_t val) {
  asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u8_x2(uint8_t* __a, uint8x8x2_t val) {
  asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u16_x2(uint16_t* __a, uint16x4x2_t val) {
  asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u32_x2(uint32_t* __a, uint32x2x2_t val) {
  asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f16_x2(float16_t* __a, float16x4x2_t val) {
  asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f32_x2(float32_t* __a, float32x2x2_t val) {
  asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p64_x2(poly64_t* __a, poly64x1x2_t val) {
  asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s8_x2(int8_t* __a, int8x16x2_t val) {
  asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p8_x2(poly8_t* __a, poly8x16x2_t val) {
  asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s16_x2(int16_t* __a, int16x8x2_t val) {
  asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p16_x2(poly16_t* __a, poly16x8x2_t val) {
  asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s32_x2(int32_t* __a, int32x4x2_t val) {
  asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s64_x2(int64_t* __a, int64x2x2_t val) {
  asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u8_x2(uint8_t* __a, uint8x16x2_t val) {
  asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u16_x2(uint16_t* __a, uint16x8x2_t val) {
  asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u32_x2(uint32_t* __a, uint32x4x2_t val) {
  asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u64_x2(uint64_t* __a, uint64x2x2_t val) {
  asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f16_x2(float16_t* __a, float16x8x2_t val) {
  asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t* __a, float32x4x2_t val) {
  asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f64_x2(float64_t* __a, float64x2x2_t val) {
  asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p64_x2(poly64_t* __a, poly64x2x2_t val) {
  asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}

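The listing above ends the inline-asm replacements for the `vld1*_x2` / `vst1*_x2` pair load/store intrinsics that older GCC releases lack. As a quick illustration of how callers use them, here is a hedged sketch that is not part of the diff: it scales a float buffer eight lanes at a time with the q-register variants; the function and buffer names are made up for illustration.

```c++
#include <arm_neon.h>
#include <cstddef>

// Hypothetical helper: scale `n` floats in place, two q-registers per step.
void scale_in_place(float* data, std::size_t n, float k) {
  float32x4_t kv = vdupq_n_f32(k);
  std::size_t i = 0;
  for (; i + 8 <= n; i += 8) {
    float32x4x2_t v = vld1q_f32_x2(data + i);  // one ld1 filling two q registers
    v.val[0] = vmulq_f32(v.val[0], kv);
    v.val[1] = vmulq_f32(v.val[1], kv);
    vst1q_f32_x2(data + i, v);                 // one st1 draining two q registers
  }
  for (; i < n; ++i) {                         // scalar tail
    data[i] *= k;
  }
}
```
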
@@ -1 +1,7 @@
#include <torch/headeronly/cpu/vec/vec256/missing_vst1_neon.h>
/* Workaround for missing vst1q_f32_x2 in gcc-8. */

__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t* __a, float32x4x2_t val) {
  asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}

@@ -34,9 +34,7 @@ class Vectorized<c10::complex<double>> {
  static constexpr size_type size() {
    return 2;
  }
  Vectorized() {
    values = _mm256_setzero_pd();
  }
  Vectorized() {}
  Vectorized(__m256d v) : values(v) {}
  Vectorized(c10::complex<double> val) {
    double real_value = val.real();

@@ -33,9 +33,7 @@ class Vectorized<c10::complex<float>> {
  static constexpr size_type size() {
    return 4;
  }
  Vectorized() {
    values = _mm256_setzero_ps();
  }
  Vectorized() {}
  Vectorized(__m256 v) : values(v) {}
  Vectorized(c10::complex<float> val) {
    float real_value = val.real();

@@ -31,9 +31,7 @@ class Vectorized<double> {
  static constexpr size_type size() {
    return 4;
  }
  Vectorized() {
    values = _mm256_setzero_pd();
  }
  Vectorized() {}
  Vectorized(__m256d v) : values(v) {}
  Vectorized(double val) {
    values = _mm256_set1_pd(val);
@@ -495,14 +493,6 @@ Vectorized<double> inline fmadd(
  return _mm256_fmadd_pd(a, b, c);
}

template <>
Vectorized<double> inline fnmadd(
    const Vectorized<double>& a,
    const Vectorized<double>& b,
    const Vectorized<double>& c) {
  return _mm256_fnmadd_pd(a, b, c);
}

template <>
Vectorized<double> inline fmsub(
    const Vectorized<double>& a,
@@ -510,14 +500,6 @@ Vectorized<double> inline fmsub(
    const Vectorized<double>& c) {
  return _mm256_fmsub_pd(a, b, c);
}

template <>
Vectorized<double> inline fnmsub(
    const Vectorized<double>& a,
    const Vectorized<double>& b,
    const Vectorized<double>& c) {
  return _mm256_fnmsub_pd(a, b, c);
}
#endif

#endif

@@ -30,9 +30,7 @@ class Vectorized<float> {
  static constexpr size_type size() {
    return 8;
  }
  Vectorized() {
    values = _mm256_setzero_ps();
  }
  Vectorized() {}
  Vectorized(__m256 v) : values(v) {}
  Vectorized(float val) {
    values = _mm256_set1_ps(val);
@@ -696,14 +694,6 @@ Vectorized<float> inline fmadd(
  return _mm256_fmadd_ps(a, b, c);
}

template <>
Vectorized<float> inline fnmadd(
    const Vectorized<float>& a,
    const Vectorized<float>& b,
    const Vectorized<float>& c) {
  return _mm256_fnmadd_ps(a, b, c);
}

template <>
Vectorized<float> inline fmsub(
    const Vectorized<float>& a,
@@ -712,14 +702,6 @@ Vectorized<float> inline fmsub(
  return _mm256_fmsub_ps(a, b, c);
}

template <>
Vectorized<float> inline fnmsub(
    const Vectorized<float>& a,
    const Vectorized<float>& b,
    const Vectorized<float>& c) {
  return _mm256_fnmsub_ps(a, b, c);
}

// TODO: rewrite with ATEN vectorized (need to add unpack and shuffle)
// Used by Inductor CPP codegen for micro gemm
inline void transpose_block(at::vec::VectorizedN<float, 8>& input) {

@@ -23,9 +23,7 @@ struct Vectorizedi {
  }

 public:
  Vectorizedi() {
    values = _mm256_setzero_si256();
  }
  Vectorizedi() {}
  Vectorizedi(__m256i v) : values(v) {}
  operator __m256i() const {
    return values;
@@ -55,9 +53,7 @@ class Vectorized<int64_t> : public Vectorizedi {
    return 4;
  }
  using Vectorizedi::Vectorizedi;
  Vectorized() {
    values = _mm256_setzero_si256();
  }
  Vectorized() {}
  Vectorized(int64_t v) {
    values = _mm256_set1_epi64x(v);
  }

@@ -54,9 +54,7 @@ struct Vectorizedqi {
#endif

 public:
  Vectorizedqi() {
    vals = _mm256_setzero_si256();
  }
  Vectorizedqi() {}
  Vectorizedqi(__m256i v) : vals(v) {}
  operator __m256i() const {
    return vals;

@@ -192,9 +192,7 @@ class Vectorized16 {
  static constexpr size_type size() {
    return 32;
  }
  Vectorized16() {
    values = _mm512_setzero_si512();
  }
  Vectorized16() {}
  Vectorized16(__m512i v) : values(v) {}
  Vectorized16(T val) {
    value_type uw = val.x;

@@ -34,9 +34,7 @@ class Vectorized<c10::complex<double>> {
  static constexpr size_type size() {
    return 4;
  }
  Vectorized() {
    values = _mm512_setzero_pd();
  }
  Vectorized() {}
  Vectorized(__m512d v) : values(v) {}
  Vectorized(c10::complex<double> val) {
    double real_value = val.real();

@@ -34,9 +34,7 @@ class Vectorized<c10::complex<float>> {
  static constexpr size_type size() {
    return 8;
  }
  Vectorized() {
    values = _mm512_setzero_ps();
  }
  Vectorized() {}
  Vectorized(__m512 v) : values(v) {}
  Vectorized(c10::complex<float> val) {
    float real_value = val.real();

@@ -34,9 +34,7 @@ class Vectorized<double> {
  static constexpr size_type size() {
    return 8;
  }
  Vectorized() {
    values = _mm512_setzero_pd();
  }
  Vectorized() {}
  Vectorized(__m512d v) : values(v) {}
  Vectorized(double val) {
    values = _mm512_set1_pd(val);
@@ -536,14 +534,6 @@ Vectorized<double> inline fmadd(
  return _mm512_fmadd_pd(a, b, c);
}

template <>
Vectorized<double> inline fnmadd(
    const Vectorized<double>& a,
    const Vectorized<double>& b,
    const Vectorized<double>& c) {
  return _mm512_fnmadd_pd(a, b, c);
}

template <>
Vectorized<double> inline fmsub(
    const Vectorized<double>& a,
@@ -552,14 +542,6 @@ Vectorized<double> inline fmsub(
  return _mm512_fmsub_pd(a, b, c);
}

template <>
Vectorized<double> inline fnmsub(
    const Vectorized<double>& a,
    const Vectorized<double>& b,
    const Vectorized<double>& c) {
  return _mm512_fnmsub_pd(a, b, c);
}

#endif

} // namespace CPU_CAPABILITY

@@ -32,9 +32,7 @@ class Vectorized<float> {
  static constexpr size_type size() {
    return 16;
  }
  Vectorized() {
    values = _mm512_setzero_ps();
  }
  Vectorized() {}
  Vectorized(__m512 v) : values(v) {}
  Vectorized(float val) {
    values = _mm512_set1_ps(val);
@@ -749,14 +747,6 @@ Vectorized<float> inline fmadd(
  return _mm512_fmadd_ps(a, b, c);
}

template <>
Vectorized<float> inline fnmadd(
    const Vectorized<float>& a,
    const Vectorized<float>& b,
    const Vectorized<float>& c) {
  return _mm512_fnmadd_ps(a, b, c);
}

template <>
Vectorized<float> inline fmsub(
    const Vectorized<float>& a,
@@ -765,14 +755,6 @@ Vectorized<float> inline fmsub(
  return _mm512_fmsub_ps(a, b, c);
}

template <>
Vectorized<float> inline fnmsub(
    const Vectorized<float>& a,
    const Vectorized<float>& b,
    const Vectorized<float>& c) {
  return _mm512_fnmsub_ps(a, b, c);
}

// TODO: rewrite with ATEN vectorized (need to add unpack and shuffle)
// Used by Inductor CPP codegen for micro gemm
// Code referred to FBGEMM:

@@ -53,9 +53,7 @@ class Vectorized<int64_t> : public Vectorizedi {
    return 8;
  }
  using Vectorizedi::Vectorizedi;
  Vectorized() {
    values = _mm512_setzero_si512();
  }
  Vectorized() {}
  Vectorized(int64_t v) {
    values = _mm512_set1_epi64(v);
  }

@@ -55,9 +55,7 @@ struct Vectorizedqi {
#endif

 public:
  Vectorizedqi() {
    vals = _mm512_setzero_si512();
  }
  Vectorizedqi() {}
  Vectorizedqi(__m512i v) : vals(v) {}
  operator __m512i() const {
    return vals;

@@ -1247,16 +1247,6 @@ inline Vectorized<T> fmadd(

VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmadd)

template <typename T>
inline Vectorized<T> fnmadd(
    const Vectorized<T>& a,
    const Vectorized<T>& b,
    const Vectorized<T>& c) {
  return -(a * b) + c;
}

VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fnmadd)

template <typename T>
inline Vectorized<T> fmsub(
    const Vectorized<T>& a,
@@ -1267,16 +1257,6 @@ inline Vectorized<T> fmsub(

VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmsub)

template <typename T>
inline Vectorized<T> fnmsub(
    const Vectorized<T>& a,
    const Vectorized<T>& b,
    const Vectorized<T>& c) {
  return -(a * b) - c;
}

VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fnmsub)

template <typename T>
Vectorized<T> inline operator&&(
    const Vectorized<T>& a,

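The hunks above touch the `fnmadd`/`fnmsub` fallbacks, whose per-lane semantics are spelled out in the generic implementations (`-(a * b) + c` and `-(a * b) - c`). For orientation only, here is a minimal scalar reference of those identities; it is a sketch added by the editor, not part of the diff.

```c++
// Scalar reference for the vectorized helpers above:
//   fnmadd(a, b, c) = -(a * b) + c   (matches _mm*_fnmadd_*)
//   fnmsub(a, b, c) = -(a * b) - c   (matches _mm*_fnmsub_*)
inline float fnmadd_ref(float a, float b, float c) { return -(a * b) + c; }
inline float fnmsub_ref(float a, float b, float c) { return -(a * b) - c; }
```
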
@@ -3,12 +3,50 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <c10/util/Exception.h>

#include <torch/headeronly/cpu/vec/vec_half.h>

namespace at::vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {

#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
    !defined(__APPLE__)
static inline uint16_t float2half_scalar(float val) {
#if defined(CPU_CAPABILITY_AVX2)
#if defined(_MSC_VER)
  __m256 v = _mm256_set1_ps(val);
  __m128i o =
      _mm256_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  return static_cast<std::uint16_t>(_mm_cvtsi128_si32(o));
#else
  return _cvtss_sh(val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#endif
#elif defined(CPU_CAPABILITY_AVX512)
  __m512 v = _mm512_set1_ps(val);
  __m256i o =
      _mm512_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  return static_cast<std::uint16_t>(
      _mm_cvtsi128_si32(_mm256_castsi256_si128(o)));
#endif
}

static inline float half2float_scalar(uint16_t val) {
#if defined(CPU_CAPABILITY_AVX2)
#if defined(_MSC_VER)
  __m128i v = _mm_cvtsi32_si128(val);
  __m256 o = _mm256_cvtph_ps(v);
  return _mm256_cvtss_f32(o);
#else
  return _cvtsh_ss(val);
#endif
#elif defined(CPU_CAPABILITY_AVX512)
  __m256i v =
      _mm256_setr_epi16(val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  __m512 o = _mm512_cvtph_ps(v);
  return _mm512_cvtss_f32(o);
#endif
}

#endif

// Transpose a [2, 32] matrix to [32, 2]
// Note: the output leading dimension should be 2,
// that is, the output must be contiguous

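The `@@ -3,12 +3,50 @@` hunk above adds `float2half_scalar` / `half2float_scalar` to `vec_half.h`. Below is a hedged usage sketch (not part of the diff) that assumes the same AVX2/AVX512 guard as the functions themselves; the helper name is made up.

```c++
#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
    !defined(__APPLE__)
// Round-trip a float through its fp16 bit pattern using the helpers above.
inline float half_round_trip(float x) {
  uint16_t bits = float2half_scalar(x);  // fp32 -> fp16 bits, round-to-nearest-even
  return half2float_scalar(bits);        // fp16 bits -> fp32
}
#endif
```
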
@@ -1843,69 +1843,6 @@ template bool gemm_and_bias(
    int64_t result_ld,
    GEMMAndBiasActivationEpilogue activation);

int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fast_accum) {
  switch (scaling_type) {
    case ScalingType::BlockWise1x32:
      TORCH_CHECK(scale_dtype == kFloat8_e8m0fnu);
#if CUDA_VERSION >= 12080
      return CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0;
#else
      TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales of 1x32 blocks is only supported for CUDA 12.8 and above");
#endif // if CUDA_VERSION >= 12080

    case ScalingType::BlockWise1x16:
      TORCH_CHECK(scale_dtype == kFloat8_e4m3fn);
#if CUDA_VERSION >= 12080
      return CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3;
#else
      TORCH_CHECK(false, "scaled_gemm with `torch.float8_e4m3fn` scales of 1x16 blocks is only supported for CUDA 12.8 and above");
#endif // if CUDA_VERSION >= 12080

    case ScalingType::RowWise:
      TORCH_CHECK(scale_dtype == kFloat);
#if CUDA_VERSION >= 12090 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC))
      return CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F;
#elif defined(USE_ROCM) && defined(HIPBLASLT_VEC_EXT)
      // Return the default, since in old hipblaslt this is activated via
      // the SCALE_POINTER_VEC_EXT attributed.
      return 0;
#else
      TORCH_CHECK(false, "scaled_gemm with rowwise scaling is only supported for CUDA 12.9 and above");
#endif // if CUDA_VERSION >= 12090

    case ScalingType::BlockWise1x128:
      TORCH_CHECK(scale_dtype == kFloat);
      TORCH_CHECK(!use_fast_accum, "scaled_gemm doesn't support fast accum with 1x128 blockwise scaling")
#if CUDA_VERSION >= 12090
      return CUBLASLT_MATMUL_MATRIX_SCALE_VEC128_32F;
#else
      TORCH_CHECK(false, "scaled_gemm with 1x128 blockwise scaling is only supported for CUDA 12.9 and above");
#endif // if CUDA_VERSION >= 12090

    case ScalingType::BlockWise128x128:
      TORCH_CHECK(scale_dtype == kFloat);
      TORCH_CHECK(!use_fast_accum, "scaled_gemm doesn't support fast accum with 128x128 blockwise scaling")
#if CUDA_VERSION >= 12090
      return CUBLASLT_MATMUL_MATRIX_SCALE_BLK128x128_32F;
#else
      TORCH_CHECK(false, "scaled_gemm with 128x128 blockwise scaling is only supported for CUDA 12.9 and above");
#endif // if CUDA_VERSION >= 12090

    case ScalingType::TensorWise:
      TORCH_CHECK(scale_dtype == kFloat);
#if CUDA_VERSION >= 12080
      return CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F;
#else
      // The macro isn't defined, thus we inline its value.
      return 0;
#endif // if CUDA_VERSION >= 12080

    default:
      TORCH_CHECK(false);
      return -1;
  }
}

void scaled_gemm(
    char transa,
    char transb,
@@ -1917,20 +1854,19 @@ void scaled_gemm(
    int64_t mat1_ld,
    ScalarType mat1_dtype,
    ScalarType mat1_scale_dtype,
    ScalingType mat1_scaling_type,
    const void* mat2_ptr,
    const void* mat2_scale_ptr,
    int64_t mat2_ld,
    ScalarType mat2_dtype,
    ScalarType mat2_scale_dtype,
    ScalingType mat2_scaling_type,
    const void* bias_ptr,
    ScalarType bias_dtype,
    void* result_ptr,
    const void *result_scale_ptr,
    int64_t result_ld,
    ScalarType result_dtype,
    bool use_fast_accum) {
    bool use_fast_accum,
    bool use_rowwise) {
  // Note: see `cublasCommonArgs` for various non-intuitive manupulations
  // of input arguments to this function.
#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
@@ -1943,15 +1879,19 @@ void scaled_gemm(
  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
  cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER;
  cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER;
  // hipblaslt supported row-wise before cublas, and did so their own way (via
  // the SCALE_POINTERSs), but then migrated to match how cublas does it (via
  // the SCALE_MODEs). Here we check for this early custom mode.
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
  if (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise) {
#if defined(USE_ROCM)
#if defined(HIPBLASLT_OUTER_VEC)
  // this case is handled later as hipified CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F
#elif defined(HIPBLASLT_VEC_EXT)
  if (use_rowwise) {
    matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
    matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
  }
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
#else
  // rowwise isn't supported using older hipblaslt
  TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with older hipblaslt");
#endif
#endif // defined(USE_ROCM)
  computeDesc.setAttribute(matmulDescA, mat1_scale_ptr);
  computeDesc.setAttribute(matmulDescB, mat2_scale_ptr);
  if (result_scale_ptr != nullptr) {
@@ -1991,14 +1931,30 @@ void scaled_gemm(
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
  }

  // The SCALE_MODE attrs only exist in cuBLAS 12.8+ or in recent hipblaslt,
  // but we must invoke get_scale_mode anyways to trigger the version checks.
  [[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum);
  [[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum);
#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC))
  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode);
  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode);
#endif
  if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
#if CUDA_VERSION >= 12080
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
#else
    TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 and above");
#endif // if CUDA_VERSION >= 12080
  } else if (mat1_scale_dtype == kFloat8_e4m3fn && mat2_scale_dtype == kFloat8_e4m3fn) {
#if CUDA_VERSION >= 12080
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3);
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3);
#else
    TORCH_CHECK(false, "scaled_gemm with `torch.float8_e4m3fn` scales is only supported for CUDA 12.8 and above");
#endif // if CUDA_VERSION >= 12080
  } else if (mat1_scale_dtype == kFloat && mat2_scale_dtype == kFloat && use_rowwise) {
#if CUDA_VERSION >= 12090 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC))
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F);
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F);
#elif defined(USE_ROCM) && defined(HIPBLASLT_VEC_EXT)
    // no-op here for older hipblaslt ext enums, to avoid TORCH_CHECK below
#else
    TORCH_CHECK(false, "scaled_gemm with `torch.float` outer vector scaling is only supported for CUDA 12.9 and above");
#endif // if CUDA_VERSION >= 12090
  }

  CuBlasLtMatmulPreference preference;
  auto ltworkspace = CublasLtWorkspace();

@@ -136,15 +136,6 @@ void int8_gemm(
    int32_t* result_ptr,
    int64_t result_ld);

enum class ScalingType : std::uint8_t {
  TensorWise, // fp32 scales
  RowWise, // fp32 scales
  BlockWise1x16, // fp8_e4m3fn scales
  BlockWise1x32, // fp8_e8m0fnu scales
  BlockWise1x128, // fp32 scales
  BlockWise128x128, // fp32 scales
};

void scaled_gemm(
    char transa,
    char transb,
@@ -156,20 +147,19 @@ void scaled_gemm(
    int64_t mat1_ld,
    ScalarType mat1_dtype,
    ScalarType mat1_scale_dtype,
    ScalingType mat1_scaling_type,
    const void* mat2_ptr,
    const void* mat2_scale_ptr,
    int64_t mat2_ld,
    ScalarType mat2_dtype,
    ScalarType mat2_scale_dtype,
    ScalingType mat2_scaling_type,
    const void* bias_ptr,
    ScalarType bias_dtype,
    void* result_ptr,
    const void* result_scale_ptr,
    int64_t result_ld,
    ScalarType result_dtype,
    bool use_fast_accum);
    bool use_fast_accum,
    bool use_rowwise);

#define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype)

@@ -2,7 +2,6 @@
#include <ATen/cuda/CUDAGraph.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/Functions.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>

#include <cstddef>

@@ -2,6 +2,7 @@

#include <ATen/Tensor.h>
#include <c10/core/Device.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/flat_hash_map.h>

@@ -258,7 +258,7 @@ DECLARE_HOST_ALLOCATOR(
    CUDACachingHostAllocator,
    CUDACachingHostAllocatorImpl,
    raw_local_deleter,
    caching_host_allocator)
    caching_host_allocator);

REGISTER_HOST_ALLOCATOR(at::kCUDA, &caching_host_allocator)

@@ -29,8 +29,6 @@

namespace at::cuda::tunable {

using at::cuda::blas::ScalingType;

enum class BlasOp {
  N = 0,
  T = 1
@@ -600,8 +598,7 @@ struct ScaledGemmParams : OpParams {
    //
    // In TunableOp, we must distinguish in param signature these two cases: with and without a bias vector.
    return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld_rw_%d_bias_%s",
        transa, transb, m, n, k, lda, ldb, ldc,
        a_scaling_type == ScalingType::RowWise && b_scaling_type == ScalingType::RowWise,
        transa, transb, m, n, k, lda, ldb, ldc, use_rowwise,
        bias_ptr == nullptr ? "None" : at::toString(bias_dtype));
  }

@@ -676,13 +673,11 @@ struct ScaledGemmParams : OpParams {
  int64_t lda{};
  ScalarType a_dtype{};
  ScalarType a_scale_dtype{};
  ScalingType a_scaling_type{};
  const void* b{};
  const void* b_scale_ptr{};
  int64_t ldb{};
  ScalarType b_dtype{};
  ScalarType b_scale_dtype{};
  ScalingType b_scaling_type{};
  const void* bias_ptr{};
  ScalarType bias_dtype{};
  void* c{};
@@ -691,6 +686,7 @@ struct ScaledGemmParams : OpParams {
  ScalarType c_dtype{};
  void* amax_ptr{};
  bool use_fast_accum{};
  bool use_rowwise{};
 private:
  bool duplicate_inputs_{false};
};

@@ -206,43 +206,23 @@ float GetBetaFromParams(const ScaledGemmParams<T>* params) {
}

template <typename T>
ScalingType GetAScalingTypeFromParams(const GemmParams<T>* params) {
  return ScalingType::TensorWise;
bool GetUseRowwiseFromParams(const GemmParams<T>* params) {
  return false;
}

template <typename T>
ScalingType GetBScalingTypeFromParams(const GemmParams<T>* params) {
  return ScalingType::TensorWise;
bool GetUseRowwiseFromParams(const GemmAndBiasParams<T>* params) {
  return false;
}

template <typename T>
ScalingType GetAScalingTypeFromParams(const GemmAndBiasParams<T>* params) {
  return ScalingType::TensorWise;
bool GetUseRowwiseFromParams(const GemmStridedBatchedParams<T>* params) {
  return false;
}

template <typename T>
ScalingType GetBScalingTypeFromParams(const GemmAndBiasParams<T>* params) {
  return ScalingType::TensorWise;
}

template <typename T>
ScalingType GetAScalingTypeFromParams(const GemmStridedBatchedParams<T>* params) {
  return ScalingType::TensorWise;
}

template <typename T>
ScalingType GetBScalingTypeFromParams(const GemmStridedBatchedParams<T>* params) {
  return ScalingType::TensorWise;
}

template <typename T>
ScalingType GetAScalingTypeFromParams(const ScaledGemmParams<T>* params) {
  return params->a_scaling_type;
}

template <typename T>
ScalingType GetBScalingTypeFromParams(const ScaledGemmParams<T>* params) {
  return params->b_scaling_type;
bool GetUseRowwiseFromParams(const ScaledGemmParams<T>* params) {
  return params->use_rowwise;
}

template <typename T>
@@ -509,24 +489,23 @@ class HipblasltGemmOp : public Callable<ParamsT> {
      const void* mat2_scale_ptr = GetBScalePointerFromParams<CT>(params);
      const void* result_scale_ptr = GetDScalePointerFromParams<CT>(params);
      if (mat1_scale_ptr && mat2_scale_ptr) {
        hipblasLtMatmulDescAttributes_t a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER;
        hipblasLtMatmulDescAttributes_t b_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER;
        if (GetAScalingTypeFromParams<CT>(params) == ScalingType::RowWise) {
#if defined(HIPBLASLT_OUTER_VEC)
#ifdef HIPBLASLT_VEC_EXT
        if (GetUseRowwiseFromParams<CT>(params)) {
          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT, mat1_scale_ptr);
          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT, mat2_scale_ptr);
        }
        else
#endif
        {
          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
        }
#ifdef HIPBLASLT_OUTER_VEC
        if (GetUseRowwiseFromParams<CT>(params)) {
          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F);
#elif defined(HIPBLASLT_VEC_EXT)
          a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
#endif
        }
        if (GetBScalingTypeFromParams<CT>(params) == ScalingType::RowWise) {
#if defined(HIPBLASLT_OUTER_VEC)
          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F);
#elif defined(HIPBLASLT_VEC_EXT)
          b_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
#endif
        }
        matmul.setAttribute(a_scale_ptr_desc, mat1_scale_ptr);
        matmul.setAttribute(b_scale_ptr_desc, mat2_scale_ptr);
#endif
      }
      if (result_scale_ptr) {
        matmul.setAttribute(HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);

@@ -96,20 +96,19 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
        params->lda,
        params->a_dtype,
        params->a_scale_dtype,
        params->a_scaling_type,
        params->b,
        params->b_scale_ptr,
        params->ldb,
        params->b_dtype,
        params->b_scale_dtype,
        params->b_scaling_type,
        params->bias_ptr,
        params->bias_dtype,
        params->c,
        params->c_scale_ptr,
        params->ldc,
        params->c_dtype,
        params->use_fast_accum);
        params->use_fast_accum,
        params->use_rowwise);
    return OK;
  }
};

@@ -158,7 +158,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
  OP_DECOMPOSE(kron);
  OP_DECOMPOSE(l1_loss);
  m.impl("layer_norm", native::layer_norm_symint);
  m.impl("_fused_rms_norm", native::rms_norm_composite);
  OP_DECOMPOSE2(ldexp, Tensor);
  OP_DECOMPOSE2(less_equal, Tensor );
  OP_DECOMPOSE2(less, Tensor );

@@ -1,6 +1,6 @@
#pragma once

#include <c10/core/Allocator.h>
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>

// Use of c10::hip namespace here makes hipification easier, because
@@ -10,10 +10,10 @@ namespace c10::hip {
// Takes a valid HIPAllocator (of any sort) and turns it into
// an allocator pretending to be a CUDA allocator. See
// Note [Masquerading as CUDA]
class HIPAllocatorMasqueradingAsCUDA final : public Allocator {
  Allocator* allocator_;
class HIPAllocatorMasqueradingAsCUDA final : public DeviceAllocator {
  DeviceAllocator* allocator_;
 public:
  explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator)
  explicit HIPAllocatorMasqueradingAsCUDA(DeviceAllocator* allocator)
    : allocator_(allocator) {}
  DataPtr allocate(size_t size) override {
    DataPtr r = allocator_->allocate(size);
@@ -26,6 +26,24 @@ public:
  void copy_data(void* dest, const void* src, std::size_t count) const final {
    allocator_->copy_data(dest, src, count);
  }
  bool initialized() override {
    return allocator_->initialized();
  }
  void emptyCache(MempoolId_t mempool_id = {0, 0}) {
    allocator_->emptyCache(mempool_id);
  }
  void recordStream(const DataPtr& ptr, c10::Stream stream) {
    allocator_->recordStream(ptr, stream);
  }
  CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) {
    return allocator_->getDeviceStats(device);
  }
  void resetAccumulatedStats(c10::DeviceIndex device) {
    allocator_->resetAccumulatedStats(device);
  }
  void resetPeakStats(c10::DeviceIndex device) {
    allocator_->resetPeakStats(device);
  }
};

} // namespace c10::hip

@@ -4,8 +4,9 @@
namespace c10 { namespace hip {
namespace HIPCachingAllocatorMasqueradingAsCUDA {

static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());

Allocator* get() {
  static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
  return &allocator;
}

@@ -13,5 +14,9 @@ void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsC
  HIPCachingAllocator::recordStream(ptr, stream.hip_stream());
}

// Register this HIP allocator as CUDA allocator to enable access through both
// c10::GetAllocator(kCUDA) and c10::getDeviceAllocator(kCUDA) APIs
REGISTER_ALLOCATOR(kCUDA, &allocator)

} // namespace HIPCachingAllocatorMasqueradingAsCUDA
}} // namespace c10::hip

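With the masquerading allocator now derived from `DeviceAllocator` and registered via `REGISTER_ALLOCATOR(kCUDA, &allocator)` above, both lookup paths named in the comment should resolve to it on ROCm builds. A hedged, illustrative sketch of that access (not part of the diff; assumes the lookup helpers are declared in the headers shown):

```c++
#include <c10/core/Allocator.h>
#include <c10/core/CachingDeviceAllocator.h>

// Both APIs named in the comment above return the registered kCUDA allocator,
// which on ROCm builds is the HIP allocator masquerading as CUDA.
void probe_registered_allocator() {
  c10::Allocator* generic = c10::GetAllocator(c10::kCUDA);
  c10::DeviceAllocator* device = c10::getDeviceAllocator(c10::kCUDA);
  (void)generic;
  (void)device;
}
```
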
@@ -36,10 +36,8 @@
#endif

#ifdef USE_FBGEMM
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
#include <fbgemm/Fbgemm.h>
#include <fbgemm/FbgemmConvert.h>
C10_DIAGNOSTIC_POP()
#endif

namespace {

@@ -14,10 +14,8 @@
#include <c10/util/Half.h>

#ifdef USE_FBGEMM
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
#include <fbgemm/Fbgemm.h>
#include <fbgemm/FbgemmConvert.h>
C10_DIAGNOSTIC_POP()
#else
#include <caffe2/perfkernels/embedding_lookup_idx.h>
#endif

@@ -25,11 +25,9 @@
#include <c10/util/irange.h>

#ifdef USE_FBGEMM
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
#include <fbgemm/Fbgemm.h>
#include <fbgemm/FbgemmFP16.h>
#include <fbgemm/QuantUtils.h>
C10_DIAGNOSTIC_POP()
#endif // USE_FBGEMM

namespace caffe2 {
@@ -411,7 +409,7 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) {
Tensor fbgemm_linear_fp16_weight_fp32_activation(
    const Tensor& input,
    const Tensor& packed_weight,
    const std::optional<Tensor>& bias) {
    const Tensor& bias) {
  TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated "
                  "and will be removed in a future PyTorch release.")

@@ -432,6 +430,7 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(

  TORCH_CHECK(input.size(input.dim() - 1) == packed_weight_fp16.numRows())
  TORCH_CHECK(input.dim() >= 2);
  TORCH_CHECK(bias.dim() == 1);

  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
@@ -450,12 +449,7 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
      output.data_ptr<float>());

  // Add bias term
  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias);
  const Tensor& bias_ = *bias_maybe_owned;
  if (bias_.defined()) {
    TORCH_CHECK(bias_.dim() == 1);
    output.add_(bias_);
  }
  output.add_(bias);

  return output;
}
@@ -557,7 +551,7 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) {
Tensor fbgemm_linear_fp16_weight_fp32_activation(
    const Tensor& input,
    const Tensor& packed_weight,
    const std::optional<Tensor>& bias) {
    const Tensor& bias) {
  TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated "
                  "and will be removed in a future PyTorch release.")

@@ -71,8 +71,6 @@
#include <ATen/ops/exp.h>
#include <ATen/ops/gather.h>
#include <ATen/ops/gradient_native.h>
#include <ATen/ops/hash_tensor.h>
#include <ATen/ops/hash_tensor_native.h>
#include <ATen/ops/imag.h>
#include <ATen/ops/isnan_native.h>
#include <ATen/ops/linalg_vector_norm.h>
@@ -400,19 +398,6 @@ TORCH_META_FUNC(amin)
  resize_reduction(*this, self, dim, keepdim, out_dtype);
}

TORCH_META_FUNC(hash_tensor)
(const Tensor& self, IntArrayRef dim, bool keepdim, int64_t mode) {
  auto maybe_result = maybe_get_output();
  if (maybe_result.defined()){
    TORCH_CHECK(maybe_result.scalar_type() == at::kUInt64, "Expected result to be of dtype uint64, but got ", maybe_result.scalar_type());
  }
  if (self.sym_numel() == 0) {
    native::zero_numel_check_dims(self, dim, "hash_tensor");
  }
  resize_reduction(*this, self, dim, keepdim, at::kUInt64);
}


} // namespace at::meta

namespace at::native {
@@ -456,7 +441,6 @@ DEFINE_DISPATCH(argmin_stub);
DEFINE_DISPATCH(cumsum_stub);
DEFINE_DISPATCH(cumprod_stub);
DEFINE_DISPATCH(logcumsumexp_stub);
DEFINE_DISPATCH(xor_sum_stub);

Tensor _logcumsumexp_cpu(const Tensor& self, int64_t dim) {
  Tensor result = at::empty_like(self, MemoryFormat::Contiguous);
@@ -2249,24 +2233,6 @@ Tensor dist(const Tensor &self, const Tensor& other, const Scalar& p){
  return at::norm(self - other, p);
}

enum class HashMode { XOR_SUM = 0 };

TORCH_IMPL_FUNC(hash_tensor_out) (const Tensor& self, IntArrayRef dim, bool keepdim, int64_t mode, const Tensor& result) {

  auto iter = meta::make_reduction(self, result, dim, keepdim, self.scalar_type());
  switch (static_cast<HashMode>(mode)) {
    case HashMode::XOR_SUM:
      if (iter.numel() == 0) {
        result.fill_(0);
      } else {
        xor_sum_stub(iter.device_type(), iter);
      }
      return;
    default:
      TORCH_CHECK(false, "Unknown hash_tensor mode: ", mode);
  }
}

bool cpu_equal(const Tensor& self, const Tensor& other) {
  if (!at::namedinference::are_names_equal(
          self.unsafeGetTensorImpl(), other.unsafeGetTensorImpl())) {

@@ -27,7 +27,6 @@ DECLARE_DISPATCH(reduce_fn, min_values_stub)
DECLARE_DISPATCH(reduce_fn, max_values_stub)
DECLARE_DISPATCH(reduce_fn, argmax_stub)
DECLARE_DISPATCH(reduce_fn, argmin_stub)
DECLARE_DISPATCH(reduce_fn, xor_sum_stub)

using reduce_std_var_function =
    void (*)(TensorIterator&, double correction, bool take_sqrt);

@@ -229,20 +229,17 @@ void replication_pad3d_backward_out_cpu_template(
  int pbottom = paddingSize[3];
  int pfront = paddingSize[4];
  int pback = paddingSize[5];
  int dimc = 0;
  int dimw = 3;
  int dimh = 2;
  int dimd = 1;

  if (input.dim() == 5) {
    dimc++;
    dimw++;
    dimh++;
    dimd++;
  }

  /* sizes */
  int64_t ichannel = input.size(dimc);
  int64_t idepth = input.size(dimd);
  int64_t iheight = input.size(dimh);
  int64_t iwidth = input.size(dimw);
@@ -252,9 +249,6 @@ void replication_pad3d_backward_out_cpu_template(

  at::native::padding::check_valid_input<3>(input, paddingSize);

  TORCH_CHECK(ichannel == gradOutput.size(dimc),
      "gradOutput width unexpected. Expected: ", ichannel, ", Got: ",
      gradOutput.size(dimc));
  TORCH_CHECK(owidth == gradOutput.size(dimw),
      "gradOutput width unexpected. Expected: ", owidth, ", Got: ",
      gradOutput.size(dimw));