mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-29 11:14:56 +08:00
Compare commits
1 Commits
validation
...
predispatc
| Author | SHA1 | Date | |
|---|---|---|---|
| cf1a2abf35 |
@ -12,7 +12,7 @@ fi
|
||||
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
|
||||
source $SCRIPTPATH/../manywheel/set_desired_python.sh
|
||||
|
||||
pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2
|
||||
pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1.4 patchelf==0.17.2
|
||||
|
||||
for tool in python python3 pip pip3 ninja scons patchelf; do
|
||||
ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
|
||||
|
||||
@ -36,104 +36,3 @@ See `build.sh` for valid build environments (it's the giant switch).
|
||||
# Set flags (see build.sh) and build image
|
||||
sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest'
|
||||
```
|
||||
|
||||
## [Guidance] Adding a New Base Docker Image
|
||||
|
||||
### Background
|
||||
|
||||
The base Docker images in directory `.ci/docker/` are built by the `docker-builds.yml` workflow. Those images are used throughout the PyTorch CI/CD pipeline. You should only create or modify a base Docker image if you need specific environment changes or dependencies before building PyTorch on CI.
|
||||
|
||||
1. **Automatic Rebuilding**:
|
||||
- The Docker image building process is triggered automatically when changes are made to files in the `.ci/docker/*` directory
|
||||
- This ensures all images stay up-to-date with the latest dependencies and configurations
|
||||
|
||||
2. **Image Reuse in PyTorch Build Workflows** (example: linux-build):
|
||||
- The images generated by `docker-builds.yml` are reused in `_linux-build.yml` through the `calculate-docker-image` step
|
||||
- The `_linux-build.yml` workflow:
|
||||
- Pulls the Docker image determined by the `calculate-docker-image` step
|
||||
- Runs a Docker container with that image
|
||||
- Executes `.ci/pytorch/build.sh` inside the container to build PyTorch
|
||||
|
||||
3. **Usage in Test Workflows** (example: linux-test):
|
||||
- The same Docker images are also used in `_linux-test.yml` for running tests
|
||||
- The `_linux-test.yml` workflow follows a similar pattern:
|
||||
- It uses the `calculate-docker-image` step to determine which Docker image to use
|
||||
- It pulls the Docker image and runs a container with that image
|
||||
- It installs the wheels from the artifacts generated by PyTorch build jobs
|
||||
- It executes test scripts (like `.ci/pytorch/test.sh` or `.ci/pytorch/multigpu-test.sh`) inside the container
|
||||
|
||||
### Understanding File Purposes
|
||||
|
||||
#### `.ci/docker/build.sh` vs `.ci/pytorch/build.sh`
|
||||
- **`.ci/docker/build.sh`**:
|
||||
- Used for building base Docker images
|
||||
- Executed by the `docker-builds.yml` workflow to pre-build Docker images for CI
|
||||
- Contains configurations for different Docker build environments
|
||||
|
||||
- **`.ci/pytorch/build.sh`**:
|
||||
- Used for building PyTorch inside a Docker container
|
||||
- Called by workflows like `_linux-build.yml` after the Docker container is started
|
||||
- Builds PyTorch wheels and other artifacts
|
||||
|
||||
#### `.ci/docker/ci_commit_pins/` vs `.github/ci_commit_pins`
|
||||
- **`.ci/docker/ci_commit_pins/`**:
|
||||
- Used for pinning dependency versions during base Docker image building
|
||||
- Ensures consistent environments for building PyTorch
|
||||
- Changes here trigger base Docker image rebuilds
|
||||
|
||||
- **`.github/ci_commit_pins`**:
|
||||
- Used for pinning dependency versions during PyTorch building and tests
|
||||
- Ensures consistent dependencies for PyTorch across different builds
|
||||
- Used by build scripts running inside Docker containers
|
||||
|
||||
### Step-by-Step Guide for Adding a New Base Docker Image
|
||||
|
||||
#### 1. Add Pinned Commits (If Applicable)
|
||||
|
||||
We use pinned commits for build stability. The `nightly.yml` workflow checks and updates pinned commits for certain repository dependencies daily.
|
||||
|
||||
If your new Docker image needs a library installed from a specific pinned commit or built from source:
|
||||
|
||||
1. Add the repository you want to track in `nightly.yml` and `merge-rules.yml`
|
||||
2. Add the initial pinned commit in `.ci/docker/ci_commit_pins/`. The text filename should match the one defined in step 1
|
||||
|
||||
#### 2. Configure the Base Docker Image
|
||||
1. **Add new Base Docker image configuration** (if applicable):
|
||||
|
||||
Add the configuration in `.ci/docker/build.sh`. For example:
|
||||
```bash
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1)
|
||||
CUDA_VERSION=12.8.1
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
NEW_ARG_1=yes
|
||||
;;
|
||||
```
|
||||
|
||||
2. **Add build arguments to Docker build command**:
|
||||
|
||||
If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:
|
||||
```bash
|
||||
docker build \
|
||||
....
|
||||
--build-arg "NEW_ARG_1=${NEW_ARG_1}"
|
||||
```
|
||||
|
||||
3. **Update Dockerfile logic**:
|
||||
|
||||
Update the Dockerfile to use the new argument. For example, in `ubuntu/Dockerfile`:
|
||||
```dockerfile
|
||||
ARG NEW_ARG_1
|
||||
# Set up environment for NEW_ARG_1
|
||||
RUN if [ -n "${NEW_ARG_1}" ]; then bash ./do_something.sh; fi
|
||||
```
|
||||
|
||||
4. **Add the Docker configuration** in `.github/workflows/docker-builds.yml`:
|
||||
|
||||
The `docker-builds.yml` workflow pre-builds the Docker images whenever changes occur in the `.ci/docker/` directory. This includes the
|
||||
pinned commit updates.
|
||||
|
||||
@ -93,6 +93,7 @@ tag=$(echo $image | awk -F':' '{print $2}')
|
||||
case "$tag" in
|
||||
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11)
|
||||
CUDA_VERSION=12.4
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
@ -103,6 +104,7 @@ case "$tag" in
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11)
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
@ -113,6 +115,7 @@ case "$tag" in
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
@ -124,6 +127,7 @@ case "$tag" in
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
@ -135,6 +139,7 @@ case "$tag" in
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.13
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
@ -146,6 +151,7 @@ case "$tag" in
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
|
||||
CUDA_VERSION=12.6.3
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
@ -154,18 +160,9 @@ case "$tag" in
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
|
||||
CUDA_VERSION=12.8.1
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.6
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
@ -177,6 +174,7 @@ case "$tag" in
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.6
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
@ -188,6 +186,7 @@ case "$tag" in
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.6
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.13
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
@ -199,6 +198,7 @@ case "$tag" in
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
@ -276,7 +276,7 @@ case "$tag" in
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
|
||||
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
@ -288,6 +288,7 @@ case "$tag" in
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
CUDA_VERSION=12.8.1
|
||||
CUDNN_VERSION=9
|
||||
CLANG_VERSION=12
|
||||
VISION=yes
|
||||
TRITON=yes
|
||||
@ -366,6 +367,7 @@ case "$tag" in
|
||||
fi
|
||||
if [[ "$image" == *cuda* ]]; then
|
||||
extract_version_from_image_name cuda CUDA_VERSION
|
||||
extract_version_from_image_name cudnn CUDNN_VERSION
|
||||
fi
|
||||
if [[ "$image" == *rocm* ]]; then
|
||||
extract_version_from_image_name rocm ROCM_VERSION
|
||||
@ -417,6 +419,9 @@ docker build \
|
||||
--build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
|
||||
--build-arg "GCC_VERSION=${GCC_VERSION}" \
|
||||
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
|
||||
--build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
|
||||
--build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
|
||||
--build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
|
||||
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
|
||||
--build-arg "KATEX=${KATEX:-}" \
|
||||
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
|
||||
|
||||
26
.ci/docker/common/install_cudnn.sh
Normal file
26
.ci/docker/common/install_cudnn.sh
Normal file
@ -0,0 +1,26 @@
|
||||
#!/bin/bash
# Downloads the cuDNN redistributable archive that matches CUDA_VERSION and
# copies its headers and libraries into /usr/local/cuda.
#
# Inputs (environment):
#   CUDNN_VERSION - when empty/unset the script does nothing.
#   CUDA_VERSION  - selects which cuDNN archive is fetched.
#
# Exits with status 1 when CUDA_VERSION is not one of the handled versions.

if [[ -n "${CUDNN_VERSION}" ]]; then
    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
    mkdir tmp_cudnn
    pushd tmp_cudnn
    if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
    else
        # `print` is not a bash command, so the original message was never
        # shown; use echo and send the diagnostic to stderr.
        echo "Unsupported CUDA version ${CUDA_VERSION}" >&2
        exit 1
    fi
    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
    tar xf ${CUDNN_NAME}.tar.xz
    cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
    cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
    popd
    rm -rf tmp_cudnn
    # Refresh the dynamic linker cache after copying the libraries.
    ldconfig
fi
|
||||
@ -30,7 +30,7 @@ EOF
|
||||
|
||||
# we want the patch version of 6.4 instead
|
||||
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
|
||||
ROCM_VERSION="${ROCM_VERSION}.2"
|
||||
ROCM_VERSION="${ROCM_VERSION}.1"
|
||||
fi
|
||||
|
||||
# Default url values
|
||||
@ -85,19 +85,16 @@ EOF
|
||||
# CI no longer builds for ROCm 6.3, but
|
||||
# ROCm 6.4 did not yet fix the regression, also HIP branch names are different
|
||||
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.4) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then
|
||||
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.2) ]]; then
|
||||
HIP_TAG=rocm-6.4.2
|
||||
CLR_HASH=74d78ba3ac4bac235d02bcb48511c30b5cfdd457 # branch release/rocm-rel-6.4.2-statco-hotfix
|
||||
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
|
||||
HIP_TAG=rocm-6.4.1
|
||||
CLR_HASH=efe6c35790b9206923bfeed1209902feff37f386 # branch release/rocm-rel-6.4.1-statco-hotfix
|
||||
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
|
||||
HIP_BRANCH=release/rocm-rel-6.4
|
||||
CLR_HASH=ca18eb3f77fa09292fcda62bc60c3e565d752ada # branch release/rocm-rel-6.4.1-statco-hotfix
|
||||
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
|
||||
HIP_TAG=rocm-6.4.0
|
||||
HIP_BRANCH=release/rocm-rel-6.4
|
||||
CLR_HASH=600f5b0d2baed94d5121e2174a9de0851b040b0c # branch release/rocm-rel-6.4-statco-hotfix
|
||||
fi
|
||||
# clr build needs CppHeaderParser but can only find it using conda's python
|
||||
python -m pip install CppHeaderParser
|
||||
git clone https://github.com/ROCm/HIP -b $HIP_TAG
|
||||
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
|
||||
HIP_COMMON_DIR=$(readlink -f HIP)
|
||||
git clone https://github.com/jeffdaily/clr
|
||||
pushd clr
|
||||
|
||||
@ -103,5 +103,5 @@ fi
|
||||
# It depends on torch and triton. We don't want to install
|
||||
# triton and torch from production on Docker CI images
|
||||
if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then
|
||||
pip_install helion==0.0.10 --no-deps
|
||||
pip_install helion --no-deps
|
||||
fi
|
||||
|
||||
@ -41,7 +41,7 @@ case ${DOCKER_TAG_PREFIX} in
|
||||
rocm*)
|
||||
# we want the patch version of 6.4 instead
|
||||
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
|
||||
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
|
||||
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
|
||||
fi
|
||||
BASE_TARGET=rocm
|
||||
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
|
||||
|
||||
@ -128,7 +128,7 @@ ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
||||
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
||||
# Install setuptools and wheel for python 3.12/3.13
|
||||
RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
|
||||
/opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \
|
||||
/opt/python/${cpython_version}/bin/python -m pip install "setuptools>=77.0.0" "packaging>=24.2" wheel; \
|
||||
done;
|
||||
|
||||
|
||||
|
||||
@ -124,10 +124,9 @@ RUN python3 -mpip install cmake==3.28.0
|
||||
# install newest flatbuffers version first:
|
||||
# for some reason old version is getting pulled in otherwise.
|
||||
# packaging package is required for onnxruntime wheel build.
|
||||
RUN pip3 install flatbuffers && \
|
||||
pip3 install cython 'pkgconfig>=1.5.5' 'setuptools>=77' 'numpy<2.3.0' && \
|
||||
RUN pip3 install 'setuptools>=77.0' 'packaging>=24.2' && \
|
||||
pip3 install flatbuffers cython 'pkgconfig>=1.5.5' 'numpy<2.3.0' && \
|
||||
pip3 install --no-build-isolation h5py==3.11.0 && \
|
||||
pip3 install packaging && \
|
||||
git clone https://github.com/microsoft/onnxruntime && \
|
||||
cd onnxruntime && git checkout v1.21.0 && \
|
||||
git submodule update --init --recursive && \
|
||||
|
||||
@ -77,7 +77,7 @@ case ${image} in
|
||||
manylinux2_28-builder:rocm*)
|
||||
# we want the patch version of 6.4 instead
|
||||
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
|
||||
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
|
||||
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
|
||||
fi
|
||||
TARGET=rocm_final
|
||||
MANY_LINUX_VERSION="2_28"
|
||||
|
||||
@ -104,10 +104,10 @@ networkx==2.8.8
|
||||
#Pinned versions: 2.8.8
|
||||
#test that import: functorch
|
||||
|
||||
ninja==1.11.1.3
|
||||
ninja==1.11.1.4
|
||||
#Description: build system. Used in some tests. Used in build to generate build
|
||||
#time tracing information
|
||||
#Pinned versions: 1.11.1.3
|
||||
#Pinned versions: 1.11.1.4
|
||||
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
|
||||
|
||||
numba==0.49.0 ; python_version < "3.9"
|
||||
@ -221,9 +221,9 @@ pygments==2.15.0
|
||||
#Pinned versions: 2.12.0
|
||||
#test that import: the doctests
|
||||
|
||||
#pyyaml
|
||||
#PyYAML
|
||||
#Description: data serialization format
|
||||
#Pinned versions: 6.0.2
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
|
||||
#requests
|
||||
@ -233,7 +233,7 @@ pygments==2.15.0
|
||||
|
||||
#rich
|
||||
#Description: rich text and beautiful formatting in the terminal
|
||||
#Pinned versions: 14.1.0
|
||||
#Pinned versions: 10.9.0
|
||||
#test that import:
|
||||
|
||||
scikit-image==0.19.3 ; python_version < "3.10"
|
||||
@ -363,9 +363,10 @@ pwlf==2.2.1
|
||||
|
||||
|
||||
# To build PyTorch itself
|
||||
packaging>=24.2
|
||||
pyyaml
|
||||
pyzstd
|
||||
setuptools>=70.1.0
|
||||
setuptools>=77.0.0
|
||||
six
|
||||
|
||||
scons==4.5.2 ; platform_machine == "aarch64"
|
||||
|
||||
@ -4,8 +4,8 @@ sphinx==5.3.0
|
||||
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
|
||||
|
||||
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
|
||||
# but it doesn't seem to work and hangs around idly. The initial thought is that it is probably
|
||||
# something related to Docker setup. We can investigate this later.
|
||||
# but it doesn't seem to work and hangs around idly. The initial thought is that it is probably
|
||||
# something related to Docker setup. We can investigate this later.
|
||||
|
||||
sphinxcontrib.katex==0.8.6
|
||||
#Description: This is used to generate PyTorch docs
|
||||
@ -50,8 +50,8 @@ IPython==8.12.0
|
||||
#Pinned versions: 8.12.0
|
||||
|
||||
myst-nb==0.17.2
|
||||
#Description: This is used to generate PyTorch functorch and torch.compile docs
|
||||
#Pinned versions: 0.17.2
|
||||
#Description: This is used to generate PyTorch functorch docs
|
||||
#Pinned versions: 0.13.2
|
||||
|
||||
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
|
||||
python-etcd==0.4.5
|
||||
@ -59,3 +59,4 @@ sphinx-copybutton==0.5.0
|
||||
sphinx-design==0.4.0
|
||||
sphinxcontrib-mermaid==1.0.0
|
||||
myst-parser==0.18.1
|
||||
myst-nb
|
||||
|
||||
34
.ci/pytorch/build-mobile.sh
Executable file
34
.ci/pytorch/build-mobile.sh
Executable file
@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env bash
# Do NOT add 'set -x' here: tracing would reveal the CircleCI secret context
# environment variables in the job log.
set -eu -o pipefail

# Builds and tests mobile libtorch with the Linux host toolchain plus the
# mobile build options, so no Android/iOS toolchain or simulator is required.

# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"

# torch & torchvision are needed to download and trace the test model.
# Using the libtorch built from the PR would protect this script from
# backward-incompatible changes, but it would slow mobile CI jobs down
# significantly. Nightly (rather than stable) is installed so that mobile CI
# jobs can be skipped temporarily on BC-breaking PRs until they reach nightly.
retry pip install --pre torch torchvision \
  -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html \
  --progress-bar off

# End-to-end run: build the mobile library, link it into the predictor binary,
# and execute a forward pass with a real model. The variant is chosen from
# BUILD_ENVIRONMENT; the first matching pattern wins.
case "$BUILD_ENVIRONMENT" in
  *-mobile-custom-build-static*)
    TEST_CUSTOM_BUILD_STATIC=1 test/mobile/custom_build/build.sh
    ;;
  *-mobile-lightweight-dispatch*)
    test/mobile/lightweight_dispatch/build.sh
    ;;
  *)
    TEST_DEFAULT_BUILD=1 test/mobile/custom_build/build.sh
    ;;
esac

print_sccache_stats
|
||||
@ -11,6 +11,10 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
|
||||
# shellcheck source=./common-build.sh
|
||||
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *-mobile-*build* ]]; then
|
||||
exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile.sh" "$@"
|
||||
fi
|
||||
|
||||
echo "Python version:"
|
||||
python --version
|
||||
|
||||
@ -120,8 +124,26 @@ if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then
|
||||
fi
|
||||
|
||||
# Use special scripts for Android builds
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
|
||||
export ANDROID_NDK=/opt/ndk
|
||||
build_args=()
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *-arm-v7a* ]]; then
|
||||
build_args+=("-DANDROID_ABI=armeabi-v7a")
|
||||
elif [[ "${BUILD_ENVIRONMENT}" == *-arm-v8a* ]]; then
|
||||
build_args+=("-DANDROID_ABI=arm64-v8a")
|
||||
elif [[ "${BUILD_ENVIRONMENT}" == *-x86_32* ]]; then
|
||||
build_args+=("-DANDROID_ABI=x86")
|
||||
elif [[ "${BUILD_ENVIRONMENT}" == *-x86_64* ]]; then
|
||||
build_args+=("-DANDROID_ABI=x86_64")
|
||||
fi
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
|
||||
build_args+=("-DUSE_VULKAN=ON")
|
||||
fi
|
||||
build_args+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")
|
||||
exec ./scripts/build_android.sh "${build_args[@]}" "$@"
|
||||
fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" != *android* && "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
|
||||
export USE_VULKAN=1
|
||||
# shellcheck disable=SC1091
|
||||
source /var/lib/jenkins/vulkansdk/setup-env.sh
|
||||
@ -203,7 +225,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then
|
||||
export USE_PRECOMPILED_HEADERS=1
|
||||
fi
|
||||
|
||||
if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
|
||||
if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
|
||||
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
|
||||
fi
|
||||
|
||||
@ -247,6 +269,9 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
|
||||
tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" //...
|
||||
fi
|
||||
else
|
||||
# install build-system requirements before running setup.py commands
|
||||
python -m pip install -r requirements-build.txt
|
||||
|
||||
# check that setup.py would fail with bad arguments
|
||||
echo "The next three invocations are expected to fail with invalid command error messages."
|
||||
( ! get_exit_code python setup.py bad_argument )
|
||||
|
||||
@ -204,32 +204,8 @@ function install_torchrec_and_fbgemm() {
|
||||
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
|
||||
pip_uninstall fbgemm-gpu-nightly
|
||||
|
||||
# If ROCM_HOME isn't set, fall back to ROCM_PATH if set, otherwise /opt/rocm
|
||||
ROCM_HOME="${ROCM_HOME:-${ROCM_PATH:-/opt/rocm}}"
|
||||
|
||||
# Find rocm_version.h header file for ROCm version extract
|
||||
rocm_version_h="${ROCM_HOME}/include/rocm-core/rocm_version.h"
|
||||
if [ ! -f "$rocm_version_h" ]; then
|
||||
rocm_version_h="${ROCM_HOME}/include/rocm_version.h"
|
||||
fi
|
||||
|
||||
# Error out if rocm_version.h not found
|
||||
if [ ! -f "$rocm_version_h" ]; then
|
||||
echo "Error: rocm_version.h not found in expected locations." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Extract major, minor and patch ROCm version numbers
|
||||
MAJOR_VERSION=$(grep 'ROCM_VERSION_MAJOR' "$rocm_version_h" | awk '{print $3}')
|
||||
MINOR_VERSION=$(grep 'ROCM_VERSION_MINOR' "$rocm_version_h" | awk '{print $3}')
|
||||
PATCH_VERSION=$(grep 'ROCM_VERSION_PATCH' "$rocm_version_h" | awk '{print $3}')
|
||||
ROCM_INT=$((MAJOR_VERSION * 10000 + MINOR_VERSION * 100 + PATCH_VERSION))
|
||||
echo "ROCm version: $ROCM_INT"
|
||||
export BUILD_ROCM_VERSION="$MAJOR_VERSION.$MINOR_VERSION"
|
||||
|
||||
pip_install tabulate # needed for newer fbgemm
|
||||
pip_install patchelf # needed for rocm fbgemm
|
||||
pushd /tmp
|
||||
|
||||
local wheel_dir=dist/fbgemm_gpu
|
||||
local found_whl=0
|
||||
@ -247,7 +223,7 @@ function install_torchrec_and_fbgemm() {
|
||||
pushd fbgemm/fbgemm_gpu
|
||||
git checkout "${fbgemm_commit}"
|
||||
python setup.py bdist_wheel \
|
||||
--build-variant=rocm \
|
||||
--package_variant=rocm \
|
||||
-DHIP_ROOT_DIR="${ROCM_PATH}" \
|
||||
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
|
||||
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
|
||||
@ -264,7 +240,6 @@ function install_torchrec_and_fbgemm() {
|
||||
done
|
||||
|
||||
rm -rf fbgemm
|
||||
popd
|
||||
else
|
||||
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
|
||||
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
|
||||
|
||||
123
.ci/pytorch/create_test_cert.py
Normal file
123
.ci/pytorch/create_test_cert.py
Normal file
@ -0,0 +1,123 @@
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from tempfile import mkdtemp
|
||||
|
||||
from cryptography import x509
|
||||
from cryptography.hazmat.primitives import hashes, serialization
|
||||
from cryptography.hazmat.primitives.asymmetric import rsa
|
||||
from cryptography.x509.oid import NameOID
|
||||
|
||||
|
||||
# Fresh temporary directory that receives every key/certificate file this
# script writes. Its path is printed to stdout right after creation.
temp_dir = mkdtemp()
|
||||
print(temp_dir)
|
||||
|
||||
|
||||
def genrsa(path):
    """Generate an RSA private key and store it at ``path``.

    The key uses public exponent 65537 and a 2048-bit modulus. It is written
    as unencrypted PEM in the traditional OpenSSL format.

    Args:
        path: Destination file; opened in binary write mode.

    Returns:
        The generated private key object.
    """
    private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
    with open(path, "wb") as key_file:
        pem_bytes = private_key.private_bytes(
            encoding=serialization.Encoding.PEM,
            format=serialization.PrivateFormat.TraditionalOpenSSL,
            encryption_algorithm=serialization.NoEncryption(),
        )
        key_file.write(pem_bytes)
    return private_key
|
||||
|
||||
|
||||
def create_cert(path, C, ST, L, O, key):
    """Create a self-signed CA certificate and write it to ``path`` as PEM.

    Subject and issuer are the same name, built from the given fields, and the
    certificate is signed with ``key`` using SHA-256.

    Args:
        path: Destination file for the PEM-encoded certificate.
        C: Country name.
        ST: State or province name.
        L: Locality name.
        O: Organization name.
        key: Private key; its public half is embedded and it signs the cert.

    Returns:
        The signed certificate object.
    """
    name_attributes = [
        x509.NameAttribute(NameOID.COUNTRY_NAME, C),
        x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST),
        x509.NameAttribute(NameOID.LOCALITY_NAME, L),
        x509.NameAttribute(NameOID.ORGANIZATION_NAME, O),
    ]
    # Self-signed: the same Name object serves as subject and issuer.
    subject = issuer = x509.Name(name_attributes)

    builder = x509.CertificateBuilder()
    builder = builder.subject_name(subject)
    builder = builder.issuer_name(issuer)
    builder = builder.public_key(key.public_key())
    builder = builder.serial_number(x509.random_serial_number())
    builder = builder.not_valid_before(datetime.now(timezone.utc))
    # The certificate stays valid for 10 days.
    builder = builder.not_valid_after(datetime.now(timezone.utc) + timedelta(days=10))
    # Mark the certificate as a CA with no path-length limit.
    builder = builder.add_extension(
        x509.BasicConstraints(ca=True, path_length=None),
        critical=True,
    )
    cert = builder.sign(key, hashes.SHA256())

    with open(path, "wb") as cert_file:
        cert_file.write(cert.public_bytes(serialization.Encoding.PEM))
    return cert
|
||||
|
||||
|
||||
def create_req(path, C, ST, L, O, key):
    """Create a certificate signing request and write it to ``path`` as PEM.

    Args:
        path: Destination file for the PEM-encoded CSR.
        C: Country name.
        ST: State or province name.
        L: Locality name.
        O: Organization name.
        key: Private key that signs the request (SHA-256).

    Returns:
        The signed certificate signing request object.
    """
    # Details identifying the requester.
    requester = x509.Name(
        [
            x509.NameAttribute(NameOID.COUNTRY_NAME, C),
            x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST),
            x509.NameAttribute(NameOID.LOCALITY_NAME, L),
            x509.NameAttribute(NameOID.ORGANIZATION_NAME, O),
        ]
    )
    request_builder = x509.CertificateSigningRequestBuilder().subject_name(requester)
    csr = request_builder.sign(key, hashes.SHA256())

    with open(path, "wb") as csr_file:
        csr_file.write(csr.public_bytes(serialization.Encoding.PEM))
    return csr
|
||||
|
||||
|
||||
def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key):
    """Issue a certificate for a CSR, signed by the CA, and write it as PEM.

    Args:
        path: Destination file for the PEM-encoded certificate.
        csr_cert: Signing request supplying the subject and public key.
        ca_cert: CA certificate whose subject becomes the issuer.
        private_ca_key: CA private key used to sign (SHA-256).

    Returns:
        The signed certificate object.
    """
    builder = x509.CertificateBuilder()
    builder = builder.subject_name(csr_cert.subject)
    builder = builder.issuer_name(ca_cert.subject)
    builder = builder.public_key(csr_cert.public_key())
    builder = builder.serial_number(x509.random_serial_number())
    builder = builder.not_valid_before(datetime.now(timezone.utc))
    # The certificate stays valid for 10 days.
    builder = builder.not_valid_after(datetime.now(timezone.utc) + timedelta(days=10))
    # Sign the certificate with the CA's private key.
    cert = builder.sign(private_ca_key, hashes.SHA256())

    with open(path, "wb") as cert_file:
        cert_file.write(cert.public_bytes(serialization.Encoding.PEM))
    return cert
|
||||
|
||||
|
||||
# Build a small test PKI inside temp_dir: a CA key plus self-signed CA
# certificate, then a second key with a signing request that the CA signs.
ca_key = genrsa(f"{temp_dir}/ca.key")
ca_cert = create_cert(
    path=f"{temp_dir}/ca.pem",
    C="US",
    ST="New York",
    L="New York",
    O="Gloo Certificate Authority",
    key=ca_key,
)

pkey = genrsa(f"{temp_dir}/pkey.key")
csr = create_req(
    path=f"{temp_dir}/csr.csr",
    C="US",
    ST="California",
    L="San Francisco",
    O="Gloo Testing Company",
    key=pkey,
)

cert = sign_certificate_request(
    path=f"{temp_dir}/cert.pem",
    csr_cert=csr,
    ca_cert=ca_cert,
    private_ca_key=ca_key,
)
|
||||
18
.ci/pytorch/run_glootls_test.sh
Executable file
18
.ci/pytorch/run_glootls_test.sh
Executable file
@ -0,0 +1,18 @@
|
||||
#!/bin/bash
# Runs the ProcessGroupGlooTest suite over Gloo's TCP_TLS transport using
# freshly generated test certificates.

# create_test_cert.py lives next to this script; its stdout is captured as the
# directory holding the generated files.
CREATE_TEST_CERT="$(dirname "${BASH_SOURCE[0]}")/create_test_cert.py"
TMP_CERT_DIR=$(python "$CREATE_TEST_CERT")

# Check that the issued certificate verifies against the generated CA.
openssl verify -CAfile "${TMP_CERT_DIR}/ca.pem" "${TMP_CERT_DIR}/cert.pem"

# Point Gloo at the TLS transport and the generated key material.
export GLOO_DEVICE_TRANSPORT=TCP_TLS \
       GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY="${TMP_CERT_DIR}/pkey.key" \
       GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT="${TMP_CERT_DIR}/cert.pem" \
       GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE="${TMP_CERT_DIR}/ca.pem"

time python test/run_test.py --include distributed/test_c10d_gloo --verbose -- ProcessGroupGlooTest

# Remove the transport settings again once the test run has finished.
unset GLOO_DEVICE_TRANSPORT \
      GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY \
      GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT \
      GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE
|
||||
@ -35,7 +35,14 @@ MODULES = [
|
||||
"smoke_test": "./vision/test/smoke_test.py",
|
||||
"extension": "extension",
|
||||
"repo_name": "vision",
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "torchaudio",
|
||||
"repo": "https://github.com/pytorch/audio.git",
|
||||
"smoke_test": "./audio/test/smoke_test/smoke_test.py --no-ffmpeg",
|
||||
"extension": "_extension",
|
||||
"repo_name": "audio",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@ -378,29 +385,6 @@ def smoke_test_compile(device: str = "cpu") -> None:
|
||||
x_pt2 = torch.compile(model, mode="max-autotune")(x)
|
||||
|
||||
|
||||
def smoke_test_nvshmem() -> None:
    """Report whether this torch build includes NVSHMEM support.

    Skips (with a message) when CUDA is unavailable. When the NVSHMEM probe
    cannot be imported, returns silently for torch versions below 2.9 and
    raises for later versions, where NVSHMEM is expected to be compiled in.

    Raises:
        RuntimeError: if torch is at least 2.9 and was built without NVSHMEM.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available, skipping NVSHMEM test")
        return

    # Importing the probe succeeds only when NVSHMEM is compiled into the build.
    try:
        from torch._C._distributed_c10d import _is_nvshmem_available
    except ImportError:
        # torch is not compiled with NVSHMEM prior to 2.9, so a missing probe
        # is only an error from 2.9 onwards.
        nvshmem_expected = not (torch.__version__ < "2.9")
        if nvshmem_expected:
            raise RuntimeError("torch not compiled with NVSHMEM") from None
        return

    print("torch compiled with NVSHMEM")

    # Separate question: is NVSHMEM usable on the current system?
    runtime_available = _is_nvshmem_available()
    print(f"NVSHMEM available at run time: {runtime_available}")
|
||||
|
||||
|
||||
def smoke_test_modules():
|
||||
cwd = os.getcwd()
|
||||
for module in MODULES:
|
||||
@ -495,8 +479,6 @@ def main() -> None:
|
||||
options.pypi_pkg_check,
|
||||
)
|
||||
|
||||
smoke_test_nvshmem()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -201,7 +201,7 @@ fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
|
||||
# JIT C++ extensions require ninja.
|
||||
pip_install "ninja==1.10.2"
|
||||
pip_install "ninja==1.11.1.4"
|
||||
# ninja is installed in $HOME/.local/bin, e.g., /var/lib/jenkins/.local/bin for CI user jenkins
|
||||
# but this script should be runnable by any user, including root
|
||||
export PATH="$HOME/.local/bin:$PATH"
|
||||
@ -345,12 +345,6 @@ test_h100_symm_mem() {
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
# Runs the inductor CUTLASS backend and CUTLASS EVT test files, pointing
# TORCHINDUCTOR_CUTLASS_DIR at the vendored third_party/cutlass checkout.
# The backend run deselects tests matching "addmm".
test_h100_cutlass_backend() {
  # $PYTHON_TEST_EXTRA_OPTION is deliberately unquoted, as in the other test
  # invocations, so that it expands to zero or more separate arguments.
  # shellcheck disable=SC2086
  TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") \
    python test/run_test.py \
      --include inductor/test_cutlass_backend \
      -k "not addmm" \
      $PYTHON_TEST_EXTRA_OPTION \
      --upload-artifacts-while-running
  # shellcheck disable=SC2086
  TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") \
    python test/run_test.py \
      --include inductor/test_cutlass_evt \
      $PYTHON_TEST_EXTRA_OPTION \
      --upload-artifacts-while-running
}
|
||||
|
||||
test_lazy_tensor_meta_reference_disabled() {
|
||||
export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
|
||||
echo "Testing lazy tensor operations without meta reference"
|
||||
@ -365,6 +359,7 @@ test_dynamo_wrapped_shard() {
|
||||
exit 1
|
||||
fi
|
||||
python tools/dynamo/verify_dynamo.py
|
||||
python tools/dynamo/gb_id_mapping.py verify
|
||||
# PLEASE DO NOT ADD ADDITIONAL EXCLUDES HERE.
|
||||
# Instead, use @skipIfTorchDynamo on your tests.
|
||||
time python test/run_test.py --dynamo \
|
||||
@ -462,7 +457,7 @@ test_inductor_aoti() {
|
||||
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
|
||||
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
|
||||
|
||||
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
|
||||
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
|
||||
}
|
||||
|
||||
test_inductor_cpp_wrapper_shard() {
|
||||
@ -928,6 +923,12 @@ test_torchbench_gcp_smoketest(){
|
||||
popd
|
||||
}
|
||||
|
||||
test_python_gloo_with_tls() {
|
||||
source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh"
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
|
||||
test_aten() {
|
||||
# Test ATen
|
||||
# The following test(s) of ATen have already been skipped by caffe2 in rocm environment:
|
||||
@ -974,8 +975,6 @@ test_without_numpy() {
|
||||
if [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
|
||||
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')"
|
||||
fi
|
||||
# Regression test for https://github.com/pytorch/pytorch/pull/157734 (torch.onnx should be importable without numpy)
|
||||
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch; import torch.onnx"
|
||||
popd
|
||||
}
|
||||
|
||||
@ -1320,13 +1319,10 @@ EOF
|
||||
|
||||
# Step 2. Make sure that the public API test "test_correct_module_names" fails when an existing
|
||||
# file is modified to introduce an invalid public API function.
|
||||
# The filepath here must not have __all__ defined in it, otherwise the test will pass.
|
||||
# If your PR introduces __all__ to torch/cuda/streams.py please point this to another file
|
||||
# that does not have __all__ defined.
|
||||
EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/cuda/streams.py"
|
||||
EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/nn/parameter.py"
|
||||
cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig"
|
||||
echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}"
|
||||
invalid_api="torch.cuda.streams.new_public_func"
|
||||
invalid_api="torch.nn.parameter.new_public_func"
|
||||
echo "Appended an invalid public API function to existing file ${EXISTING_FILEPATH}..."
|
||||
|
||||
check_public_api_test_fails \
|
||||
@ -1560,7 +1556,7 @@ test_executorch() {
|
||||
test_linux_aarch64() {
|
||||
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
|
||||
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
|
||||
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
|
||||
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops test_cpp_extensions_open_device_registration \
|
||||
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
|
||||
|
||||
# Dynamo tests
|
||||
@ -1773,8 +1769,6 @@ elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
|
||||
test_h100_distributed
|
||||
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
|
||||
test_h100_symm_mem
|
||||
elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
|
||||
test_h100_cutlass_backend
|
||||
else
|
||||
install_torchvision
|
||||
install_monkeytype
|
||||
|
||||
@ -1,34 +0,0 @@
|
||||
# If you want to rebuild, run this with $env:REBUILD=1
|
||||
# If you want to build with CUDA, run this with $env:USE_CUDA=1
|
||||
# If you want to build without CUDA, run this with $env:USE_CUDA=0
|
||||
|
||||
# Check for setup.py in the current directory
|
||||
if (-not (Test-Path "setup.py")) {
|
||||
Write-Host "ERROR: Please run this build script from PyTorch root directory."
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Get the script's parent directory
|
||||
$ScriptParentDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
|
||||
|
||||
# Set TMP_DIR and convert to Windows path
|
||||
$env:TMP_DIR = Join-Path (Get-Location) "build\win_tmp"
|
||||
$env:TMP_DIR_WIN = $env:TMP_DIR # Already in Windows format, no cygpath needed
|
||||
|
||||
# Set final package directory with default fallback
|
||||
if (-not $env:PYTORCH_FINAL_PACKAGE_DIR) {
|
||||
$env:PYTORCH_FINAL_PACKAGE_DIR = "C:\w\build-results"
|
||||
}
|
||||
|
||||
# Create the final package directory if it doesn't exist
|
||||
if (-not (Test-Path $env:PYTORCH_FINAL_PACKAGE_DIR)) {
|
||||
New-Item -Path $env:PYTORCH_FINAL_PACKAGE_DIR -ItemType Directory -Force | Out-Null
|
||||
}
|
||||
|
||||
# Set script helpers directory
|
||||
$env:SCRIPT_HELPERS_DIR = Join-Path $ScriptParentDir "win-test-helpers\arm64"
|
||||
|
||||
# Run the main build script
|
||||
& "$env:SCRIPT_HELPERS_DIR\build_pytorch.ps1"
|
||||
|
||||
Write-Host "BUILD PASSED"
|
||||
@ -1,24 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -ex -o pipefail
|
||||
|
||||
SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
|
||||
# shellcheck source=./common.sh
|
||||
source "$SCRIPT_PARENT_DIR/common.sh"
|
||||
|
||||
run_tests() {
|
||||
echo Running smoke_test.py...
|
||||
python ./.ci/pytorch/smoke_test/smoke_test.py --package torchonly
|
||||
|
||||
echo Running test_autograd.oy, test_nn.py, test_torch.py...
|
||||
cd test
|
||||
|
||||
CORE_TEST_LIST=("test_autograd.py" "test_nn.py" "test_modules.py")
|
||||
|
||||
for t in "${CORE_TEST_LIST[@]}"; do
|
||||
echo "Running test: $t"
|
||||
python "$t" --verbose --save-xml --use-pytest -vvvv -rfEsxXP -p no:xdist
|
||||
done
|
||||
}
|
||||
|
||||
run_tests
|
||||
echo "TEST PASSED"
|
||||
@ -1,98 +0,0 @@
|
||||
# TODO: we may can use existing build_pytorch.bat for arm64
|
||||
|
||||
if ($env:DEBUG -eq "1") {
|
||||
$env:BUILD_TYPE = "debug"
|
||||
} else {
|
||||
$env:BUILD_TYPE = "release"
|
||||
}
|
||||
|
||||
# This inflates our log size slightly, but it is REALLY useful to be
|
||||
# able to see what our cl.exe commands are. (since you can actually
|
||||
# just copy-paste them into a local Windows setup to just rebuild a
|
||||
# single file.)
|
||||
# log sizes are too long, but leaving this here in case someone wants to use it locally
|
||||
# $env:CMAKE_VERBOSE_MAKEFILE = "1"
|
||||
|
||||
$env:INSTALLER_DIR = Join-Path $env:SCRIPT_HELPERS_DIR "installation-helpers"
|
||||
|
||||
cd ..
|
||||
|
||||
# Environment variables
|
||||
$env:SCCACHE_IDLE_TIMEOUT = "0"
|
||||
$env:SCCACHE_IGNORE_SERVER_IO_ERROR = "1"
|
||||
$env:CMAKE_BUILD_TYPE = $env:BUILD_TYPE
|
||||
$env:CMAKE_C_COMPILER_LAUNCHER = "sccache"
|
||||
$env:CMAKE_CXX_COMPILER_LAUNCHER = "sccache"
|
||||
$env:libuv_ROOT = Join-Path $env:DEPENDENCIES_DIR "libuv\install"
|
||||
$env:MSSdk = "1"
|
||||
|
||||
if ($env:PYTORCH_BUILD_VERSION) {
|
||||
$env:PYTORCH_BUILD_VERSION = $env:PYTORCH_BUILD_VERSION
|
||||
$env:PYTORCH_BUILD_NUMBER = "1"
|
||||
}
|
||||
|
||||
$env:CMAKE_POLICY_VERSION_MINIMUM = "3.5"
|
||||
|
||||
# Set BLAS type
|
||||
if ($env:ENABLE_APL -eq "1") {
|
||||
$env:BLAS = "APL"
|
||||
$env:USE_LAPACK = "1"
|
||||
} elseif ($env:ENABLE_OPENBLAS -eq "1") {
|
||||
$env:BLAS = "OpenBLAS"
|
||||
$env:OpenBLAS_HOME = Join-Path $env:DEPENDENCIES_DIR "OpenBLAS\install"
|
||||
}
|
||||
|
||||
# Change to source directory
|
||||
Set-Location $env:PYTORCH_ROOT
|
||||
|
||||
# Copy libuv.dll
|
||||
Copy-Item -Path (Join-Path $env:libuv_ROOT "lib\Release\uv.dll") -Destination "torch\lib\uv.dll" -Force
|
||||
|
||||
# Create virtual environment
|
||||
python -m venv .venv
|
||||
.\.venv\Scripts\Activate.ps1
|
||||
where.exe python
|
||||
|
||||
# Python install dependencies
|
||||
python -m pip install --upgrade pip
|
||||
pip install setuptools pyyaml
|
||||
pip install -r requirements.txt
|
||||
|
||||
# Set after installing psutil
|
||||
$env:DISTUTILS_USE_SDK = "1"
|
||||
|
||||
# Print all environment variables
|
||||
Get-ChildItem Env:
|
||||
|
||||
# Start and inspect sccache
|
||||
sccache --start-server
|
||||
sccache --zero-stats
|
||||
sccache --show-stats
|
||||
|
||||
# Build the wheel
|
||||
python setup.py bdist_wheel
|
||||
if ($LASTEXITCODE -ne 0) { exit 1 }
|
||||
|
||||
# Install the wheel locally
|
||||
$whl = Get-ChildItem -Path "dist\*.whl" | Select-Object -First 1
|
||||
if ($whl) {
|
||||
python -mpip install --no-index --no-deps $whl.FullName
|
||||
}
|
||||
|
||||
# Copy final wheel
|
||||
robocopy "dist" "$env:PYTORCH_FINAL_PACKAGE_DIR" *.whl
|
||||
|
||||
# Export test times
|
||||
python tools/stats/export_test_times.py
|
||||
|
||||
# Copy additional CI files
|
||||
robocopy ".additional_ci_files" "$env:PYTORCH_FINAL_PACKAGE_DIR\.additional_ci_files" /E
|
||||
|
||||
# Save ninja log
|
||||
Copy-Item -Path "build\.ninja_log" -Destination $env:PYTORCH_FINAL_PACKAGE_DIR -Force
|
||||
|
||||
# Final sccache stats and stop
|
||||
sccache --show-stats
|
||||
sccache --stop-server
|
||||
|
||||
exit 0
|
||||
@ -126,6 +126,11 @@ if "%USE_CUDA%"=="1" (
|
||||
set CMAKE_CUDA_COMPILER_LAUNCHER=%TMP_DIR%/bin/randomtemp.exe;%TMP_DIR%\bin\sccache.exe
|
||||
)
|
||||
|
||||
:: Install build-system requirements before running setup.py commands
|
||||
python -m pip install -r requirements-build.txt
|
||||
if errorlevel 1 goto fail
|
||||
if not errorlevel 0 goto fail
|
||||
|
||||
:: Print all existing environment variable for debugging
|
||||
set
|
||||
|
||||
|
||||
@ -18,5 +18,5 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t
|
||||
if errorlevel 1 exit /b 1
|
||||
|
||||
set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%"
|
||||
%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel
|
||||
%PYTHON_EXEC% -m pip install --upgrade pip "setuptools>=77.0.0" "packaging>=24.2" wheel
|
||||
if errorlevel 1 exit /b 1
|
||||
|
||||
@ -7,6 +7,9 @@ call "internal\install_python.bat"
|
||||
|
||||
%PYTHON_EXEC% --version
|
||||
set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%"
|
||||
|
||||
%PYTHON_EXEC% -m pip install "setuptools>=77.0.0" "packaging>=24.2"
|
||||
|
||||
if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake
|
||||
if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake
|
||||
if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
|
||||
@ -16,7 +19,7 @@ if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
|
||||
|
||||
%PYTHON_EXEC% -m pip install pyyaml
|
||||
%PYTHON_EXEC% -m pip install mkl-include mkl-static
|
||||
%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0
|
||||
%PYTHON_EXEC% -m pip install boto3 ninja typing-extensions
|
||||
|
||||
where cmake.exe
|
||||
|
||||
|
||||
@ -127,7 +127,7 @@ export INSTALL_TEST=0 # dont install test binaries into site-packages
|
||||
export MACOSX_DEPLOYMENT_TARGET=10.15
|
||||
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
|
||||
|
||||
SETUPTOOLS_PINNED_VERSION="==70.1.0"
|
||||
SETUPTOOLS_PINNED_VERSION="==77.0.0"
|
||||
PYYAML_PINNED_VERSION="=5.3"
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
@ -135,7 +135,7 @@ RENAME_WHEEL=true
|
||||
case $desired_python in
|
||||
3.13t)
|
||||
echo "Using 3.13 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
|
||||
PYYAML_PINNED_VERSION=">=6.0.1"
|
||||
NUMPY_PINNED_VERSION="=2.1.0"
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
@ -145,31 +145,31 @@ case $desired_python in
|
||||
;;
|
||||
3.13)
|
||||
echo "Using 3.13 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
|
||||
PYYAML_PINNED_VERSION=">=6.0.1"
|
||||
NUMPY_PINNED_VERSION="=2.1.0"
|
||||
;;
|
||||
3.12)
|
||||
echo "Using 3.12 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
|
||||
PYYAML_PINNED_VERSION=">=6.0.1"
|
||||
NUMPY_PINNED_VERSION="=2.0.2"
|
||||
;;
|
||||
3.11)
|
||||
echo "Using 3.11 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
|
||||
PYYAML_PINNED_VERSION=">=5.3"
|
||||
NUMPY_PINNED_VERSION="=2.0.2"
|
||||
;;
|
||||
3.10)
|
||||
echo "Using 3.10 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
|
||||
PYYAML_PINNED_VERSION=">=5.3"
|
||||
NUMPY_PINNED_VERSION="=2.0.2"
|
||||
;;
|
||||
3.9)
|
||||
echo "Using 3.9 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
|
||||
PYYAML_PINNED_VERSION=">=5.3"
|
||||
NUMPY_PINNED_VERSION="=2.0.2"
|
||||
;;
|
||||
|
||||
4
.flake8
4
.flake8
@ -7,12 +7,12 @@ max-line-length = 120
|
||||
# C408 ignored because we like the dict keyword argument syntax
|
||||
# E501 is not flexible enough, we're using B950 instead
|
||||
ignore =
|
||||
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
|
||||
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,
|
||||
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
|
||||
# to line this up with executable bit
|
||||
EXE001,
|
||||
# these ignores are from flake8-bugbear; please fix!
|
||||
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910
|
||||
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907
|
||||
# these ignores are from flake8-comprehensions; please fix!
|
||||
C407,
|
||||
# these ignores are from flake8-logging-format; please fix!
|
||||
|
||||
10
.github/actionlint.yaml
vendored
10
.github/actionlint.yaml
vendored
@ -53,12 +53,16 @@ self-hosted-runner:
|
||||
- linux.rocm.gpu.mi250
|
||||
- linux.rocm.gpu.2
|
||||
- linux.rocm.gpu.4
|
||||
# gfx942 runners
|
||||
- linux.rocm.gpu.gfx942.2
|
||||
- linux.rocm.gpu.gfx942.4
|
||||
# MI300 runners
|
||||
- linux.rocm.gpu.mi300.2
|
||||
- linux.rocm.gpu.mi300.4
|
||||
- rocm-docker
|
||||
# Repo-specific Apple hosted runners
|
||||
- macos-m1-ultra
|
||||
- macos-m2-14
|
||||
# Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
|
||||
- macos-m1-stable
|
||||
- macos-m1-13
|
||||
- macos-m1-14
|
||||
# GitHub-hosted MacOS runners
|
||||
- macos-latest-xlarge
|
||||
|
||||
78
.github/actions/build-android/action.yml
vendored
Normal file
78
.github/actions/build-android/action.yml
vendored
Normal file
@ -0,0 +1,78 @@
|
||||
name: build android
|
||||
|
||||
description: build android for a specific arch
|
||||
|
||||
inputs:
|
||||
arch:
|
||||
description: arch to build
|
||||
required: true
|
||||
arch-for-build-env:
|
||||
description: |
|
||||
arch to pass to build environment.
|
||||
This is currently different than the arch name we use elsewhere, which
|
||||
should be fixed.
|
||||
required: true
|
||||
github-secret:
|
||||
description: github token
|
||||
required: true
|
||||
build-environment:
|
||||
required: true
|
||||
description: Top-level label for what's being built/tested.
|
||||
docker-image:
|
||||
required: true
|
||||
description: Name of the base docker image to build with.
|
||||
branch:
|
||||
required: true
|
||||
description: What branch we are building on.
|
||||
outputs:
|
||||
container_id:
|
||||
description: Docker container identifier used to build the artifacts
|
||||
value: ${{ steps.build.outputs.container_id }}
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Build-${{ inputs.arch }}
|
||||
id: build
|
||||
shell: bash
|
||||
env:
|
||||
BRANCH: ${{ inputs.branch }}
|
||||
BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-${{ inputs.arch-for-build-env }}-build"
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
|
||||
SCCACHE_REGION: us-east-1
|
||||
DOCKER_IMAGE: ${{ inputs.docker-image }}
|
||||
MATRIX_ARCH: ${{ inputs.arch }}
|
||||
run: |
|
||||
# detached container should get cleaned up by teardown_ec2_linux
|
||||
set -exo pipefail
|
||||
export container_name
|
||||
container_name=$(docker run \
|
||||
-e BUILD_ENVIRONMENT \
|
||||
-e MAX_JOBS="$(nproc --ignore=2)" \
|
||||
-e AWS_DEFAULT_REGION \
|
||||
-e PR_NUMBER \
|
||||
-e SHA1 \
|
||||
-e BRANCH \
|
||||
-e SCCACHE_BUCKET \
|
||||
-e SCCACHE_REGION \
|
||||
-e SKIP_SCCACHE_INITIALIZATION=1 \
|
||||
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
|
||||
--security-opt seccomp=unconfined \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--tty \
|
||||
--detach \
|
||||
--user jenkins \
|
||||
-w /var/lib/jenkins/workspace \
|
||||
"${DOCKER_IMAGE}"
|
||||
)
|
||||
git submodule sync && git submodule update -q --init --recursive --depth 1
|
||||
docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace"
|
||||
(echo "sudo chown -R jenkins . && .ci/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete" | docker exec -u jenkins -i "${container_name}" bash) 2>&1
|
||||
|
||||
# Copy install binaries back
|
||||
mkdir -p "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}"
|
||||
docker cp "${container_name}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}"
|
||||
echo "container_id=${container_name}" >> "${GITHUB_OUTPUT}"
|
||||
@ -70,7 +70,7 @@ runs:
|
||||
set -eux
|
||||
# PyYAML 6.0 doesn't work with MacOS x86 anymore
|
||||
# This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2
|
||||
python3 -m pip install requests==2.27.1 pyyaml==6.0.2
|
||||
python3 -m pip install requests==2.27.1 pyyaml==6.0.1
|
||||
|
||||
- name: Parse ref
|
||||
id: parse-ref
|
||||
|
||||
2
.github/ci_commit_pins/audio.txt
vendored
2
.github/ci_commit_pins/audio.txt
vendored
@ -1 +1 @@
|
||||
bf305f538005f2e900f8850ed57146024a8bc559
|
||||
00b0c91db92c51a11356249262577b9fa26c18c5
|
||||
|
||||
2
.github/ci_commit_pins/fbgemm_rocm.txt
vendored
2
.github/ci_commit_pins/fbgemm_rocm.txt
vendored
@ -1 +1 @@
|
||||
7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8
|
||||
5fb5024118e9bb9decf96c2b0b1a8f0010bf56be
|
||||
|
||||
1
.github/ci_commit_pins/vllm.txt
vendored
1
.github/ci_commit_pins/vllm.txt
vendored
@ -1 +0,0 @@
|
||||
ca9e2be3ed6320b51f52f536595cd24e254f8bb2
|
||||
2
.github/ci_commit_pins/xla.txt
vendored
2
.github/ci_commit_pins/xla.txt
vendored
@ -1 +1 @@
|
||||
29ae4c76c026185f417a25e841d2cd5e65f087a3
|
||||
1c00dea2c9adb2137903c86b4191e8c247f8fda9
|
||||
|
||||
30
.github/merge_rules.yaml
vendored
30
.github/merge_rules.yaml
vendored
@ -76,8 +76,8 @@
|
||||
- .github/ci_commit_pins/audio.txt
|
||||
- .github/ci_commit_pins/vision.txt
|
||||
- .github/ci_commit_pins/torchdynamo.txt
|
||||
- .github/ci_commit_pins/vllm.txt
|
||||
- .ci/docker/ci_commit_pins/triton.txt
|
||||
- .ci/docker/ci_commit_pins/vllm.txt
|
||||
approved_by:
|
||||
- pytorchbot
|
||||
mandatory_checks_name:
|
||||
@ -131,6 +131,21 @@
|
||||
- Lint
|
||||
- pull
|
||||
|
||||
- name: Mobile
|
||||
patterns:
|
||||
- ios/**
|
||||
- android/**
|
||||
- test/mobile/**
|
||||
approved_by:
|
||||
- linbinyu
|
||||
- IvanKobzarev
|
||||
- dreiss
|
||||
- raziel
|
||||
mandatory_checks_name:
|
||||
- EasyCLA
|
||||
- Lint
|
||||
- pull
|
||||
|
||||
- name: PrimTorch
|
||||
patterns:
|
||||
- torch/_meta_registrations.py
|
||||
@ -477,19 +492,6 @@
|
||||
- srossross
|
||||
- chillee
|
||||
- zou3519
|
||||
- guilhermeleobas
|
||||
mandatory_checks_name:
|
||||
- EasyCLA
|
||||
- Lint
|
||||
- pull
|
||||
|
||||
- name: Dynamo
|
||||
patterns:
|
||||
- torch/_dynamo/**
|
||||
- torch/csrc/dynamo/**
|
||||
- test/dynamo/**
|
||||
approved_by:
|
||||
- guilhermeleobas
|
||||
mandatory_checks_name:
|
||||
- EasyCLA
|
||||
- Lint
|
||||
|
||||
2
.github/pytorch-probot.yml
vendored
2
.github/pytorch-probot.yml
vendored
@ -31,9 +31,7 @@ ciflow_push_tags:
|
||||
- ciflow/pull
|
||||
- ciflow/h100
|
||||
- ciflow/h100-distributed
|
||||
- ciflow/win-arm64
|
||||
- ciflow/h100-symm-mem
|
||||
- ciflow/h100-cutlass-backend
|
||||
retryable_workflows:
|
||||
- pull
|
||||
- trunk
|
||||
|
||||
8
.github/requirements-gha-cache.txt
vendored
8
.github/requirements-gha-cache.txt
vendored
@ -7,9 +7,9 @@
|
||||
# .ci/docker/requirements-ci.txt
|
||||
boto3==1.35.42
|
||||
jinja2==3.1.6
|
||||
lintrunner==0.12.7
|
||||
ninja==1.10.0.post1
|
||||
lintrunner==0.10.7
|
||||
ninja==1.11.1.4
|
||||
nvidia-ml-py==11.525.84
|
||||
pyyaml==6.0.2
|
||||
pyyaml==6.0
|
||||
requests==2.32.4
|
||||
rich==14.1.0
|
||||
rich==10.9.0
|
||||
|
||||
@ -2,17 +2,17 @@ boto3==1.35.42
|
||||
cmake==3.27.*
|
||||
expecttest==0.3.0
|
||||
fbscribelogger==0.1.7
|
||||
filelock==3.18.0
|
||||
filelock==3.6.0
|
||||
hypothesis==6.56.4
|
||||
librosa>=0.6.2
|
||||
mpmath==1.3.0
|
||||
networkx==2.8.7
|
||||
ninja==1.10.2.4
|
||||
ninja==1.11.1.4
|
||||
numba==0.59.0
|
||||
numpy==1.26.4
|
||||
opt-einsum>=3.3
|
||||
optree==0.13.0
|
||||
packaging==23.1
|
||||
packaging==25.0
|
||||
parameterized==0.8.1
|
||||
pillow==10.3.0
|
||||
protobuf==5.29.4
|
||||
@ -26,7 +26,7 @@ pytest-xdist==3.3.1
|
||||
pytest==7.3.2
|
||||
pyyaml==6.0.2
|
||||
scipy==1.12.0
|
||||
setuptools==72.1.0
|
||||
setuptools==80.9.0
|
||||
sympy==1.13.3
|
||||
tlparse==0.3.30
|
||||
tensorboard==2.13.0
|
||||
|
||||
2
.github/scripts/lintrunner.sh
vendored
2
.github/scripts/lintrunner.sh
vendored
@ -2,7 +2,7 @@
|
||||
set -ex
|
||||
|
||||
# Use uv to speed up lintrunner init
|
||||
python3 -m pip install -U uv==0.8.* setuptools
|
||||
python3 -m pip install -U uv setuptools
|
||||
|
||||
CACHE_DIRECTORY="/tmp/.lintbin"
|
||||
# Try to recover the cached binaries
|
||||
|
||||
4
.github/scripts/trymerge.py
vendored
4
.github/scripts/trymerge.py
vendored
@ -1891,9 +1891,7 @@ def validate_revert(
|
||||
else pr.get_comment_by_id(comment_id)
|
||||
)
|
||||
if comment.editor_login is not None:
|
||||
raise PostCommentError(
|
||||
"Halting the revert as the revert comment has been edited."
|
||||
)
|
||||
raise PostCommentError("Don't want to revert based on edited command")
|
||||
author_association = comment.author_association
|
||||
author_login = comment.author_login
|
||||
allowed_reverters = ["COLLABORATOR", "MEMBER", "OWNER"]
|
||||
|
||||
2
.github/scripts/windows/build_triton.bat
vendored
2
.github/scripts/windows/build_triton.bat
vendored
@ -10,7 +10,7 @@ if "%PY_VERS%" == "3.13t" (
|
||||
call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS%
|
||||
)
|
||||
:: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480
|
||||
call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja
|
||||
call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==78.1.1 ninja
|
||||
|
||||
dir "%VC_INSTALL_PATH%"
|
||||
|
||||
|
||||
4
.github/workflows/_get-changed-files.yml
vendored
4
.github/workflows/_get-changed-files.yml
vendored
@ -27,7 +27,7 @@ jobs:
|
||||
PR_NUMBER="${{ github.event.number }}"
|
||||
|
||||
# Use gh CLI to get changed files in the PR with explicit repo
|
||||
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
|
||||
CHANGED_FILES=$(gh pr view "$PR_NUMBER" --repo "${{ github.repository }}" --json files --jq '.files[].path' | tr '\n' ' ' | sed 's/ $//')
|
||||
|
||||
if [ -z "$CHANGED_FILES" ]; then
|
||||
echo "No changed files found, setting to '*'"
|
||||
@ -40,4 +40,4 @@ jobs:
|
||||
else
|
||||
echo "Not in PR context, setting changed files to '*'"
|
||||
echo "changed-files=*" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
fi
|
||||
5
.github/workflows/_mac-test.yml
vendored
5
.github/workflows/_mac-test.yml
vendored
@ -80,6 +80,11 @@ jobs:
|
||||
run: |
|
||||
sysctl machdep.cpu.brand_string kern.osproductversion
|
||||
|
||||
- name: Install build toolchain
|
||||
run: |
|
||||
brew update --quiet
|
||||
brew install --formula cmake ninja
|
||||
|
||||
- name: Clean up leftover processes on MacOS pet runner
|
||||
continue-on-error: true
|
||||
run: |
|
||||
|
||||
4
.github/workflows/_rocm-test.yml
vendored
4
.github/workflows/_rocm-test.yml
vendored
@ -269,8 +269,8 @@ jobs:
|
||||
# copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
|
||||
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
|
||||
|
||||
- name: Change permissions (only needed for kubernetes runners for now)
|
||||
if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'gfx942') || contains(matrix.runner, 'mi355')) }}
|
||||
- name: Change permissions (only needed for MI300 runners for now)
|
||||
if: ${{ always() && steps.test.conclusion && contains(matrix.runner, 'mi300') }}
|
||||
run: |
|
||||
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
|
||||
|
||||
|
||||
8
.github/workflows/build-triton-wheel.yml
vendored
8
.github/workflows/build-triton-wheel.yml
vendored
@ -50,7 +50,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
|
||||
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ]
|
||||
device: ["cuda", "rocm", "xpu", "aarch64"]
|
||||
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
|
||||
include:
|
||||
@ -126,12 +126,6 @@ jobs:
|
||||
3.13t)
|
||||
PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python
|
||||
;;
|
||||
3.14)
|
||||
PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python
|
||||
;;
|
||||
3.14t)
|
||||
PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python
|
||||
;;
|
||||
*)
|
||||
echo "Unsupported python version ${PY_VERS}"
|
||||
exit 1
|
||||
|
||||
@ -56,7 +56,7 @@ jobs:
|
||||
cache: pip
|
||||
architecture: x64
|
||||
|
||||
- run: pip install pyyaml==6.0.2
|
||||
- run: pip install pyyaml==6.0
|
||||
shell: bash
|
||||
|
||||
- name: Verify mergeability
|
||||
|
||||
2
.github/workflows/cherry-pick.yml
vendored
2
.github/workflows/cherry-pick.yml
vendored
@ -26,7 +26,7 @@ jobs:
|
||||
cache: pip
|
||||
|
||||
# Not the direct dependencies but the script uses trymerge
|
||||
- run: pip install pyyaml==6.0.2
|
||||
- run: pip install pyyaml==6.0
|
||||
|
||||
- name: Setup committer id
|
||||
run: |
|
||||
|
||||
1
.github/workflows/docker-builds.yml
vendored
1
.github/workflows/docker-builds.yml
vendored
@ -50,7 +50,6 @@ jobs:
|
||||
runner: [linux.12xlarge]
|
||||
docker-image-name: [
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
|
||||
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
|
||||
|
||||
2
.github/workflows/docker-release.yml
vendored
2
.github/workflows/docker-release.yml
vendored
@ -144,7 +144,7 @@ jobs:
|
||||
run: |
|
||||
make -f docker.Makefile "${BUILD_IMAGE_TYPE}-image"
|
||||
- name: Push nightly tags
|
||||
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.platform == 'linux/amd4' }}
|
||||
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.build_platforms == 'linux/amd4' }}
|
||||
run: |
|
||||
PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime"
|
||||
CUDA_SUFFIX="-cu${CUDA_VERSION}"
|
||||
|
||||
58
.github/workflows/h100-cutlass-backend.yml
vendored
58
.github/workflows/h100-cutlass-backend.yml
vendored
@ -1,58 +0,0 @@
|
||||
name: Limited CI for CUTLASS backend on H100
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- .github/workflows/h100-cutlass-backend.yml
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: 22 9 * * * # every 24 hours about 2:22am PDT
|
||||
push:
|
||||
tags:
|
||||
- ciflow/h100-cutlass-backend/*
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
|
||||
get-label-type:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '9.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "h100_cutlass_backend", shard: 1, num_shards: 1, runner: "linux.aws.h100", owners: ["oncall:pt2"] },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc11-sm90-test:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs:
|
||||
- linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
|
||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: 15 0,12 * * 1-6
|
||||
- cron: 15 0,4,8,12,16,20 * * 1-6
|
||||
- cron: 0 7 * * 0
|
||||
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
|
||||
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
|
||||
@ -126,7 +126,7 @@ jobs:
|
||||
name: cuda12.8-py3.10-gcc9-sm90
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: build
|
||||
if: github.event.schedule == '15 0,12 * * 1-6'
|
||||
if: github.event.schedule == '15 0,4,8,12,16,20 * * 1-6'
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
|
||||
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
|
||||
|
||||
@ -88,23 +88,23 @@ jobs:
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
4
.github/workflows/inductor-rocm-mi300.yml
vendored
4
.github/workflows/inductor-rocm-mi300.yml
vendored
@ -47,8 +47,8 @@ jobs:
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
1
.github/workflows/mac-mps.yml
vendored
1
.github/workflows/mac-mps.yml
vendored
@ -28,6 +28,7 @@ jobs:
|
||||
# than our AWS macos-m1-14 runners
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
|
||||
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
|
||||
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m2-15" },
|
||||
]}
|
||||
|
||||
2
.github/workflows/nightly.yml
vendored
2
.github/workflows/nightly.yml
vendored
@ -86,7 +86,7 @@ jobs:
|
||||
- repo-name: vllm
|
||||
repo-owner: vllm-project
|
||||
branch: main
|
||||
pin-folder: .github/ci_commit_pins
|
||||
pin-folder: .ci/docker/ci_commit_pins
|
||||
# Allow this to be triggered on either a schedule or on workflow_dispatch to allow for easier testing
|
||||
if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
steps:
|
||||
|
||||
6
.github/workflows/periodic-rocm-mi300.yml
vendored
6
.github/workflows/periodic-rocm-mi300.yml
vendored
@ -59,9 +59,9 @@ jobs:
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
15
.github/workflows/pull.yml
vendored
15
.github/workflows/pull.yml
vendored
@ -315,6 +315,21 @@ jobs:
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3-clang18-mobile-build:
|
||||
name: linux-jammy-py3-clang18-mobile-build
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3-clang12-mobile-build
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
|
||||
build-generates-artifacts: false
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 1 },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build:
|
||||
name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
|
||||
2
.github/workflows/revert.yml
vendored
2
.github/workflows/revert.yml
vendored
@ -26,7 +26,7 @@ jobs:
|
||||
architecture: x64
|
||||
check-latest: false
|
||||
cache: pip
|
||||
- run: pip install pyyaml==6.0.2
|
||||
- run: pip install pyyaml==6.0
|
||||
|
||||
- name: Setup committer id
|
||||
run: |
|
||||
|
||||
12
.github/workflows/rocm-mi300.yml
vendored
12
.github/workflows/rocm-mi300.yml
vendored
@ -48,12 +48,12 @@ jobs:
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
|
||||
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
|
||||
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
68
.github/workflows/rocm-mi355.yml
vendored
68
.github/workflows/rocm-mi355.yml
vendored
@ -1,68 +0,0 @@
|
||||
name: rocm-mi355
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
target-determination:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: before-test
|
||||
uses: ./.github/workflows/target_determination.yml
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-noble-rocm-py3_12-build:
|
||||
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
||||
name: linux-noble-rocm-py3.12-mi355
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-noble-rocm-py3.12-mi355
|
||||
docker-image-name: ci-image:pytorch-linux-noble-rocm-alpha-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
|
||||
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
|
||||
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
|
||||
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
|
||||
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
|
||||
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-noble-rocm-py3_12-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-noble-rocm-py3.12-mi355
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-noble-rocm-py3_12-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-noble-rocm-py3.12-mi355
|
||||
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
|
||||
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
|
||||
secrets: inherit
|
||||
1
.github/workflows/trunk.yml
vendored
1
.github/workflows/trunk.yml
vendored
@ -94,6 +94,7 @@ jobs:
|
||||
{ config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" },
|
||||
{ config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" },
|
||||
{ config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" },
|
||||
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
|
||||
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
|
||||
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-15" },
|
||||
]}
|
||||
|
||||
2
.github/workflows/trymerge.yml
vendored
2
.github/workflows/trymerge.yml
vendored
@ -28,7 +28,7 @@ jobs:
|
||||
check-latest: false
|
||||
cache: pip
|
||||
architecture: x64
|
||||
- run: pip install pyyaml==6.0.2
|
||||
- run: pip install pyyaml==6.0
|
||||
|
||||
- name: Setup committer id
|
||||
run: |
|
||||
|
||||
2
.github/workflows/tryrebase.yml
vendored
2
.github/workflows/tryrebase.yml
vendored
@ -25,7 +25,7 @@ jobs:
|
||||
architecture: x64
|
||||
check-latest: false
|
||||
cache: pip
|
||||
- run: pip install pyyaml==6.0.2
|
||||
- run: pip install pyyaml==6.0
|
||||
|
||||
- name: Setup committer id
|
||||
run: |
|
||||
|
||||
1
.github/workflows/upload-test-stats.yml
vendored
1
.github/workflows/upload-test-stats.yml
vendored
@ -14,7 +14,6 @@ on:
|
||||
- inductor-periodic
|
||||
- rocm
|
||||
- rocm-mi300
|
||||
- rocm-mi355
|
||||
- inductor-micro-benchmark
|
||||
- inductor-micro-benchmark-x86
|
||||
- inductor-cu124
|
||||
|
||||
187
.github/workflows/win-arm64-build-test.yml
vendored
187
.github/workflows/win-arm64-build-test.yml
vendored
@ -1,187 +0,0 @@
|
||||
name: windows-arm64-build-test
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/win-arm64/*
|
||||
|
||||
env:
|
||||
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
|
||||
PYTHON_VERSION: "3.12"
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
DOWNLOADS_DIR: c:\temp\downloads
|
||||
DEPENDENCIES_DIR: c:\temp\dependencies
|
||||
ENABLE_APL: 1
|
||||
ENABLE_OPENBLAS: 0
|
||||
BUILD_TYPE: release
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build:
|
||||
# Don't run on forked repos.
|
||||
if: github.repository_owner == 'pytorch'
|
||||
runs-on: "windows-11-arm64-preview"
|
||||
timeout-minutes: 240
|
||||
steps:
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_sscache
|
||||
aws-region: us-east-1
|
||||
role-duration-seconds: 18000
|
||||
|
||||
- name: Enable long paths
|
||||
shell: cmd
|
||||
run: |
|
||||
git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
|
||||
git config --system core.longpaths true
|
||||
|
||||
- name: Git checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: pytorch
|
||||
submodules: recursive
|
||||
|
||||
- name: Bootstrap Python
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
|
||||
|
||||
- name: Parse ref
|
||||
id: parse-ref
|
||||
shell: bash
|
||||
run: python pytorch/.github/scripts/parse_ref.py
|
||||
|
||||
- name: Get workflow job id
|
||||
shell: bash
|
||||
id: get-job-id
|
||||
run: |
|
||||
set -eux
|
||||
python pytorch/.github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Bootstrap APL
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
|
||||
|
||||
- name: Bootstrap Rust
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
|
||||
|
||||
- name: Bootstrap sccache
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
|
||||
|
||||
- name: Bootstrap Libuv
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
|
||||
|
||||
- name: Build
|
||||
id: build
|
||||
shell: cmd
|
||||
env:
|
||||
PYTORCH_FINAL_PACKAGE_DIR: C:/${{ github.run_id }}/build-results/
|
||||
BRANCH: ${{ steps.parse-ref.outputs.branch }}
|
||||
BUILD_WHEEL: 1
|
||||
MAX_JOBS: 8
|
||||
PYTHON_VERSION: "3.12"
|
||||
SCCACHE_BUCKET: "ossci-compiler-cache"
|
||||
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
|
||||
SCCACHE_REGION: us-east-1
|
||||
VC_PRODUCT: "BuildTools"
|
||||
VC_VERSION: ""
|
||||
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
USE_CUDA: '0'
|
||||
USE_XPU: '0'
|
||||
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
|
||||
run: |
|
||||
cd pytorch
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
|
||||
powershell -ExecutionPolicy Bypass -File ".ci/pytorch/win-arm64-build.ps1"
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4.4.0
|
||||
if: always()
|
||||
with:
|
||||
name: torch-wheel-win-arm64-py3-12
|
||||
retention-days: 14
|
||||
if-no-files-found: error
|
||||
path: C:\${{ github.run_id }}\build-results
|
||||
|
||||
test:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
strategy:
|
||||
fail-fast: false
|
||||
runs-on: "windows-11-arm64-preview"
|
||||
needs: build
|
||||
steps:
|
||||
- name: Enable long paths
|
||||
shell: cmd
|
||||
run: |
|
||||
git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
|
||||
git config --system core.longpaths true
|
||||
|
||||
- name: Git checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: pytorch
|
||||
submodules: recursive
|
||||
|
||||
- name: Bootstrap Python
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
|
||||
|
||||
- name: Bootstrap Rust
|
||||
shell: cmd
|
||||
run: |
|
||||
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
|
||||
|
||||
- name: Get workflow job id
|
||||
shell: bash
|
||||
id: get-job-id
|
||||
run: |
|
||||
set -eux
|
||||
python pytorch/.github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Download Build Artifacts
|
||||
uses: actions/download-artifact@v4.1.7
|
||||
with:
|
||||
name: torch-wheel-win-arm64-py3-12
|
||||
path: C:\${{ github.run_id }}\build-results
|
||||
|
||||
- name: Test
|
||||
id: test
|
||||
shell: cmd
|
||||
env:
|
||||
USE_CUDA: '0'
|
||||
INSTALL_WINDOWS_SDK: 1
|
||||
PYTHON_VERSION: "3.12"
|
||||
VC_PRODUCT: "BuildTools"
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
GITHUB_REPOSITORY: ${{ github.repository }}
|
||||
GITHUB_WORKFLOW: ${{ github.workflow }}
|
||||
GITHUB_JOB: ${{ github.job }}
|
||||
GITHUB_RUN_ID: ${{ github.run_id }}
|
||||
GITHUB_RUN_NUMBER: ${{ github.run_number }}
|
||||
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
|
||||
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
|
||||
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
|
||||
PYTORCH_FINAL_PACKAGE_DIR: C:/${{ github.run_id }}/build-results/
|
||||
run: |
|
||||
mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
|
||||
call pytorch/.ci/pytorch/windows/arm64/bootstrap_tests.bat
|
||||
set GIT_BASH=C:\Program Files\Git\usr\bin\bash.exe
|
||||
"%GIT_BASH%" -c "bash --noprofile --norc .ci/pytorch/win-arm64-test.sh"
|
||||
@ -39,16 +39,16 @@ init_command = [
|
||||
'python3',
|
||||
'tools/linter/adapters/pip_init.py',
|
||||
'--dry-run={{DRYRUN}}',
|
||||
'flake8==7.3.0',
|
||||
'flake8-bugbear==24.12.12',
|
||||
'flake8-comprehensions==3.16.0',
|
||||
'flake8==6.1.0',
|
||||
'flake8-bugbear==23.3.23',
|
||||
'flake8-comprehensions==3.15.0',
|
||||
'flake8-executable==2.1.3',
|
||||
'flake8-logging-format==2024.24.12',
|
||||
'flake8-pyi==25.5.0',
|
||||
'flake8-simplify==0.22.0',
|
||||
'flake8-logging-format==0.9.0',
|
||||
'flake8-pyi==23.3.1',
|
||||
'flake8-simplify==0.19.3',
|
||||
'mccabe==0.7.0',
|
||||
'pycodestyle==2.14.0',
|
||||
'pyflakes==3.4.0',
|
||||
'pycodestyle==2.11.1',
|
||||
'pyflakes==3.1.0',
|
||||
'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
|
||||
]
|
||||
|
||||
@ -158,16 +158,16 @@ init_command = [
|
||||
'mypy==1.16.0',
|
||||
'sympy==1.13.3',
|
||||
'types-requests==2.27.25',
|
||||
'types-pyyaml==6.0.2',
|
||||
'types-pyyaml==6.0.1',
|
||||
'types-tabulate==0.8.8',
|
||||
'types-protobuf==5.29.1.20250403',
|
||||
'types-setuptools==79.0.0.20250422',
|
||||
'types-jinja2==2.11.9',
|
||||
'types-colorama==0.4.6',
|
||||
'filelock==3.18.0',
|
||||
'filelock==3.13.1',
|
||||
'junitparser==2.1.1',
|
||||
'rich==14.1.0',
|
||||
'pyyaml==6.0.2',
|
||||
'rich==10.9.0',
|
||||
'pyyaml==6.0.1',
|
||||
'optree==0.13.0',
|
||||
'dataclasses-json==0.6.7',
|
||||
'pandas==2.2.3',
|
||||
@ -1111,7 +1111,7 @@ init_command = [
|
||||
'python3',
|
||||
'tools/linter/adapters/pip_init.py',
|
||||
'--dry-run={{DRYRUN}}',
|
||||
'pyyaml==6.0.2',
|
||||
'PyYAML==6.0.1',
|
||||
]
|
||||
|
||||
[[linter]]
|
||||
@ -1133,7 +1133,7 @@ init_command = [
|
||||
'python3',
|
||||
'tools/linter/adapters/pip_init.py',
|
||||
'--dry-run={{DRYRUN}}',
|
||||
'pyyaml==6.0.2',
|
||||
'PyYAML==6.0.1',
|
||||
]
|
||||
|
||||
[[linter]]
|
||||
@ -1794,12 +1794,3 @@ include_patterns = [
|
||||
'torch/header_only_apis.txt',
|
||||
]
|
||||
is_formatter = false
|
||||
|
||||
|
||||
[[linter]]
|
||||
code = "GB_REGISTRY"
|
||||
include_patterns = ["torch/_dynamo/**/*.py"]
|
||||
command = [
|
||||
"python3",
|
||||
"tools/linter/adapters/gb_registry_linter.py",
|
||||
]
|
||||
|
||||
@ -679,7 +679,6 @@ cc_library(
|
||||
[
|
||||
"torch/*.h",
|
||||
"torch/csrc/**/*.h",
|
||||
"torch/nativert/**/*.h",
|
||||
"torch/csrc/distributed/c10d/**/*.hpp",
|
||||
"torch/lib/libshm/*.h",
|
||||
],
|
||||
|
||||
@ -564,7 +564,7 @@ if(MSVC)
|
||||
set(CMAKE_NINJA_CMCLDEPS_RC OFF)
|
||||
if(MSVC_Z7_OVERRIDE)
|
||||
# CMake set debug flags to use /Z7
|
||||
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>")
|
||||
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded)
|
||||
endif()
|
||||
foreach(
|
||||
flag_var
|
||||
@ -872,14 +872,6 @@ cmake_dependent_option(
|
||||
"USE_CUDA OR USE_ROCM;NOT MSVC"
|
||||
OFF)
|
||||
|
||||
cmake_dependent_option(
|
||||
USE_FBGEMM_GENAI
|
||||
"Whether to build FBGEMM GenAI quantized GEMM kernels.\
|
||||
Will be disabled if not supported by the platform"
|
||||
OFF
|
||||
"USE_CUDA OR USE_ROCM"
|
||||
OFF)
|
||||
|
||||
# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
|
||||
# Eff Attention won't
|
||||
cmake_dependent_option(
|
||||
@ -913,10 +905,6 @@ if(USE_FBGEMM)
|
||||
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
|
||||
endif()
|
||||
|
||||
if(USE_FBGEMM_GENAI)
|
||||
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM_GENAI")
|
||||
endif()
|
||||
|
||||
if(USE_PYTORCH_QNNPACK)
|
||||
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
|
||||
endif()
|
||||
|
||||
12
CODEOWNERS
12
CODEOWNERS
@ -51,12 +51,12 @@ nn/qat/ @jerryzh168
|
||||
/torch/csrc/distributed/c10d/Ops.* @kwen2501
|
||||
|
||||
# ONNX Export
|
||||
/torch/_dynamo/backends/onnxrt.py @titaiwangms @xadupre @justinchuby
|
||||
/torch/csrc/jit/passes/onnx.h @titaiwangms @xadupre
|
||||
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @xadupre
|
||||
/torch/csrc/jit/passes/onnx/ @titaiwangms @xadupre
|
||||
/torch/onnx/ @titaiwangms @xadupre @justinchuby
|
||||
/test/onnx/ @titaiwangms @xadupre @justinchuby
|
||||
/torch/_dynamo/backends/onnxrt.py @wschin
|
||||
/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1
|
||||
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1
|
||||
/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1
|
||||
/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
|
||||
/test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
|
||||
|
||||
# CI
|
||||
/.ci @pytorch/pytorch-dev-infra
|
||||
|
||||
15
Dockerfile
15
Dockerfile
@ -47,6 +47,18 @@ WORKDIR /opt/pytorch
|
||||
COPY . .
|
||||
RUN git submodule update --init --recursive
|
||||
|
||||
FROM conda as build
|
||||
ARG CMAKE_VARS
|
||||
WORKDIR /opt/pytorch
|
||||
COPY --from=conda /opt/conda /opt/conda
|
||||
COPY --from=submodule-update /opt/pytorch /opt/pytorch
|
||||
RUN make triton
|
||||
RUN --mount=type=cache,target=/opt/ccache \
|
||||
export eval ${CMAKE_VARS} && \
|
||||
TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
|
||||
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
|
||||
python -m pip install --no-build-isolation -v .
|
||||
|
||||
FROM conda as conda-installs
|
||||
ARG PYTHON_VERSION=3.11
|
||||
ARG CUDA_PATH=cu121
|
||||
@ -97,5 +109,4 @@ WORKDIR /workspace
|
||||
|
||||
FROM official as dev
|
||||
# Should override the already installed version from the official-image stage
|
||||
COPY --from=conda /opt/conda /opt/conda
|
||||
COPY --from=submodule-update /opt/pytorch /opt/pytorch
|
||||
COPY --from=build /opt/conda /opt/conda
|
||||
|
||||
@ -294,12 +294,14 @@ Install PyTorch
|
||||
|
||||
```bash
|
||||
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
|
||||
python -m pip install -r requirements-build.txt
|
||||
python -m pip install --no-build-isolation -v -e .
|
||||
```
|
||||
|
||||
**On macOS**
|
||||
|
||||
```bash
|
||||
python -m pip install -r requirements-build.txt
|
||||
python -m pip install --no-build-isolation -v -e .
|
||||
```
|
||||
|
||||
|
||||
@ -247,50 +247,6 @@ if(USE_MEM_EFF_ATTENTION)
|
||||
list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu})
|
||||
endif()
|
||||
|
||||
IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
|
||||
message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
|
||||
set(USE_FBGEMM_GENAI off)
|
||||
endif()
|
||||
|
||||
# FBGEMM GenAI
|
||||
IF(USE_FBGEMM_GENAI)
|
||||
set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
|
||||
set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
|
||||
|
||||
if(USE_ROCM)
|
||||
# Only include the kernels we want to build to avoid increasing binary size.
|
||||
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
|
||||
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
|
||||
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
|
||||
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
|
||||
|
||||
# Add additional HIPCC compiler flags for performance
|
||||
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
|
||||
-mllvm
|
||||
-amdgpu-coerce-illegal-types=1
|
||||
-mllvm
|
||||
-enable-post-misched=0
|
||||
-mllvm
|
||||
-greedy-reverse-local-assignment=1
|
||||
-fhip-new-launch-api)
|
||||
|
||||
hip_add_library(
|
||||
fbgemm_genai STATIC
|
||||
${fbgemm_genai_native_rocm_hip}
|
||||
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
|
||||
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
|
||||
|
||||
target_include_directories(fbgemm_genai PUBLIC
|
||||
# FBGEMM version of Composable Kernel is used due to some customizations
|
||||
${FBGEMM_THIRD_PARTY}/composable_kernel/include
|
||||
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
|
||||
${FBGEMM_GENAI_DIR}/include/
|
||||
${FBGEMM_GENAI_DIR}/common/include/
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# XNNPACK
|
||||
file(GLOB native_xnnpack "native/xnnpack/*.cpp")
|
||||
|
||||
@ -630,10 +586,17 @@ if(USE_CUDA AND NOT USE_ROCM)
|
||||
CUDA::cufft_static_nocallback
|
||||
)
|
||||
if(NOT BUILD_LAZY_CUDA_LINALG)
|
||||
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
|
||||
CUDA::cusolver_static
|
||||
${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static
|
||||
)
|
||||
if(CUDA_VERSION_MAJOR LESS_EQUAL 11)
|
||||
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
|
||||
CUDA::cusolver_static
|
||||
${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a # needed for libcusolver_static
|
||||
)
|
||||
elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12)
|
||||
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
|
||||
CUDA::cusolver_static
|
||||
${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
|
||||
|
||||
@ -14,9 +14,7 @@
|
||||
#include <ATen/cpu/FlushDenormal.h>
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
|
||||
#include <fbgemm/Fbgemm.h>
|
||||
C10_DIAGNOSTIC_POP()
|
||||
#endif // USE_FBGEMM
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE)
|
||||
#include <cpuinfo.h>
|
||||
@ -334,14 +332,6 @@ void Context::setBenchmarkLimitCuDNN(int b) {
|
||||
benchmark_limit_cudnn = b;
|
||||
}
|
||||
|
||||
bool Context::immediateMiopen() const {
|
||||
return immediate_miopen;
|
||||
}
|
||||
|
||||
void Context::setImmediateMiopen(bool b) {
|
||||
immediate_miopen = b;
|
||||
}
|
||||
|
||||
bool Context::allowTF32CuBLAS() const {
|
||||
#ifdef USE_ROCM
|
||||
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
|
||||
@ -512,7 +502,7 @@ at::BlasBackend Context::blasPreferredBackend() {
|
||||
static const std::vector<std::string> archs = {
|
||||
"gfx90a", "gfx942",
|
||||
#if ROCM_VERSION >= 60300
|
||||
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
|
||||
"gfx1100", "gfx1101", "gfx1200", "gfx1201",
|
||||
#endif
|
||||
#if ROCM_VERSION >= 60500
|
||||
"gfx950"
|
||||
|
||||
@ -205,8 +205,6 @@ class TORCH_API Context {
|
||||
void setBenchmarkCuDNN(bool);
|
||||
int benchmarkLimitCuDNN() const;
|
||||
void setBenchmarkLimitCuDNN(int);
|
||||
bool immediateMiopen() const;
|
||||
void setImmediateMiopen(bool);
|
||||
bool deterministicCuDNN() const;
|
||||
void setDeterministicCuDNN(bool);
|
||||
bool deterministicMkldnn() const;
|
||||
@ -442,7 +440,6 @@ class TORCH_API Context {
|
||||
bool enabled_overrideable = true;
|
||||
bool allow_fp16_bf16_reduction_mathSDP = false;
|
||||
bool benchmark_cudnn = false;
|
||||
bool immediate_miopen = false;
|
||||
Float32MatmulPrecision float32_matmul_precision =
|
||||
c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true
|
||||
? at::Float32MatmulPrecision::HIGH
|
||||
|
||||
@ -132,9 +132,6 @@ DLDevice torchDeviceToDLDevice(at::Device device) {
|
||||
case DeviceType::PrivateUse1:
|
||||
ctx.device_type = DLDeviceType::kDLExtDev;
|
||||
break;
|
||||
case DeviceType::MPS:
|
||||
ctx.device_type = DLDeviceType::kDLMetal;
|
||||
break;
|
||||
default:
|
||||
TORCH_CHECK_BUFFER(false, "Cannot pack tensors on " + device.str());
|
||||
}
|
||||
@ -167,8 +164,6 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat
|
||||
return at::Device(DeviceType::MAIA, index);
|
||||
case DLDeviceType::kDLExtDev:
|
||||
return at::Device(DeviceType::PrivateUse1, index);
|
||||
case DLDeviceType::kDLMetal:
|
||||
return at::Device(DeviceType::MPS, index);
|
||||
default:
|
||||
TORCH_CHECK_BUFFER(
|
||||
false, "Unsupported device_type: ", std::to_string(type));
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/CachingDeviceAllocator.h>
|
||||
#include <c10/core/DeviceType.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
|
||||
@ -72,6 +73,27 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
|
||||
// original device index that was active before the change.
|
||||
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);
|
||||
|
||||
TORCH_API inline void emptyCache() {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
at::getDeviceAllocator(device_type)->emptyCache();
|
||||
}
|
||||
|
||||
TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
|
||||
c10::DeviceIndex device_index) {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
|
||||
}
|
||||
|
||||
TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
|
||||
}
|
||||
|
||||
TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
|
||||
}
|
||||
|
||||
} // namespace at::accelerator
|
||||
|
||||
namespace at {
|
||||
|
||||
@ -9,36 +9,7 @@
|
||||
|
||||
namespace at {
|
||||
|
||||
/*
|
||||
* Design:
|
||||
* 1. ZeroTensors are regular tensors with TensorOptions, a storage
|
||||
* pointing to nullptr and a ZeroTensor dispatch key set.
|
||||
*
|
||||
* 2. ZeroTensors are immutable. This is done to prevent data race in the case of multithreading
|
||||
* (when two threads try to read the same zero tensor and materialize it in-place).
|
||||
*
|
||||
* 3. ZeroTensor has a boxed fallback that will be dispatched to any ops that don't
|
||||
* have special ZeroTensor handling. This fallback materializes each ZeroTensor to
|
||||
* `at::zeros({}, tensor.options()).expand(tensor.sizes())`.
|
||||
|
||||
* 4. ZeroTensors are handled above autograd. This is necessary because fallback
|
||||
* operations are not differentiable.
|
||||
* - Example: Consider add in the case it was using the fallback: zerotensor_a + b.
|
||||
* zerotensor_a would be materialized to c=torch.zeros_like(zerotensor_a) after
|
||||
* passing through the fallback. If this happens above the autograd, then the
|
||||
* gradients would be populated on c instead of zerotensor_a.
|
||||
*
|
||||
* 5. The grad field is always populated with an honest to goodness tensor. This
|
||||
* materialization of ZeroTensors will happen in:
|
||||
* - AccumulateGrad for Backward Mode AD.
|
||||
* - will never be required for ForwardMode AD.
|
||||
* - This is because if all the tangents were undefined (efficient ZeroTensors),
|
||||
* no computation will be performed (this is ensured via an existing pre-check).
|
||||
*
|
||||
* Today ZeroTensors are primarily used to represent undefined gradients in forward AD,
|
||||
* it does not perfectly handle NaNs and Infs as we don't check the actual values
|
||||
* and assume that they are non-zero, non-inf, non-NaN etc.
|
||||
*/
|
||||
// TODO: add a note explaining the design decisions
|
||||
// ZeroTensors are designed to be immutable. Thus, we error out when an in-place operation is performed on ZeroTensors
|
||||
static void zeroTensorFallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
|
||||
const auto& arguments = op.schema().arguments();
|
||||
|
||||
@ -1 +1,55 @@
|
||||
#include <torch/headeronly/cpu/vec/intrinsics.h>
|
||||
#pragma once
|
||||
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
|
||||
/* GCC or clang-compatible compiler, targeting x86/x86-64 */
|
||||
#include <x86intrin.h>
|
||||
#elif defined(__clang__) && (defined(__ARM_NEON__) || defined(__aarch64__))
|
||||
/* Clang-compatible compiler, targeting arm neon */
|
||||
#include <arm_neon.h>
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
/* CLANG-compatible compiler, targeting ARM with SVE */
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
#elif defined(_MSC_VER)
|
||||
/* Microsoft C/C++-compatible compiler */
|
||||
#include <intrin.h>
|
||||
#if _MSC_VER <= 1900
|
||||
#define _mm256_extract_epi64(X, Y) \
|
||||
(_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2))
|
||||
#define _mm256_extract_epi32(X, Y) \
|
||||
(_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4))
|
||||
#define _mm256_extract_epi16(X, Y) \
|
||||
(_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8))
|
||||
#define _mm256_extract_epi8(X, Y) \
|
||||
(_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16))
|
||||
#endif
|
||||
#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__))
|
||||
/* GCC-compatible compiler, targeting ARM with NEON */
|
||||
#include <arm_neon.h>
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
/* GCC-compatible compiler, targeting ARM with SVE */
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
#if defined(MISSING_ARM_VLD1)
|
||||
#include <ATen/cpu/vec/vec256/missing_vld1_neon.h>
|
||||
#elif defined(MISSING_ARM_VST1)
|
||||
#include <ATen/cpu/vec/vec256/missing_vst1_neon.h>
|
||||
#endif
|
||||
#elif defined(__GNUC__) && defined(__IWMMXT__)
|
||||
/* GCC-compatible compiler, targeting ARM with WMMX */
|
||||
#include <mmintrin.h>
|
||||
#elif defined(__s390x__)
|
||||
// targets Z/architecture
|
||||
// we will include vecintrin later
|
||||
#elif (defined(__GNUC__) || defined(__xlC__)) && \
|
||||
(defined(__VEC__) || defined(__ALTIVEC__))
|
||||
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
|
||||
#include <altivec.h>
|
||||
/* We need to undef those tokens defined by <altivec.h> to avoid conflicts
|
||||
with the C++ types. => Can still use __bool/__vector */
|
||||
#undef bool
|
||||
#undef vector
|
||||
#undef pixel
|
||||
#elif defined(__GNUC__) && defined(__SPE__)
|
||||
/* GCC-compatible compiler, targeting PowerPC with SPE */
|
||||
#include <spe.h>
|
||||
#endif
|
||||
|
||||
@ -5,7 +5,6 @@
|
||||
#include <ATen/cpu/vec/sve/vec_common_sve.h>
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/util/bit_cast.h>
|
||||
#include <cmath>
|
||||
namespace at {
|
||||
namespace vec {
|
||||
@ -37,7 +36,7 @@ class Vectorized<BFloat16> {
|
||||
return VECTOR_WIDTH / sizeof(BFloat16);
|
||||
}
|
||||
|
||||
Vectorized();
|
||||
Vectorized() {}
|
||||
Vectorized(svbfloat16_t v) : values(v) {}
|
||||
Vectorized(int val);
|
||||
Vectorized(BFloat16 val);
|
||||
@ -307,11 +306,6 @@ Vectorized<c10::BFloat16> inline operator/(
|
||||
return binary_operator_via_float(std::divides<Vectorized<float>>(), a, b);
|
||||
}
|
||||
|
||||
inline Vectorized<BFloat16>::Vectorized() {
|
||||
const short zero = 0;
|
||||
values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero));
|
||||
}
|
||||
|
||||
inline Vectorized<BFloat16>::Vectorized(int val) {
|
||||
auto vals_f = svdup_n_f32(val);
|
||||
values = convert_float_bfloat16(vals_f, vals_f);
|
||||
|
||||
@ -38,9 +38,7 @@ class Vectorized<double> {
|
||||
static constexpr size_type size() {
|
||||
return VECTOR_WIDTH / sizeof(double);
|
||||
}
|
||||
Vectorized() {
|
||||
values = svdup_n_f64(0);
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(svfloat64_t v) : values(v) {}
|
||||
Vectorized(double val) {
|
||||
values = svdup_n_f64(val);
|
||||
@ -587,30 +585,6 @@ Vectorized<double> inline fmadd(
|
||||
return svmad_f64_x(ptrue, a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline fnmadd(
|
||||
const Vectorized<double>& a,
|
||||
const Vectorized<double>& b,
|
||||
const Vectorized<double>& c) {
|
||||
return svmsb_f64_x(ptrue, a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline fmsub(
|
||||
const Vectorized<double>& a,
|
||||
const Vectorized<double>& b,
|
||||
const Vectorized<double>& c) {
|
||||
return svnmsb_f64_x(ptrue, a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline fnmsub(
|
||||
const Vectorized<double>& a,
|
||||
const Vectorized<double>& b,
|
||||
const Vectorized<double>& c) {
|
||||
return svnmad_f64_x(ptrue, a, b, c);
|
||||
}
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
||||
@ -38,9 +38,7 @@ class Vectorized<float> {
|
||||
static constexpr size_type size() {
|
||||
return VECTOR_WIDTH / sizeof(float);
|
||||
}
|
||||
Vectorized() {
|
||||
values = svdup_n_f32(0);
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(svfloat32_t v) : values(v) {}
|
||||
Vectorized(float val) {
|
||||
values = svdup_n_f32(val);
|
||||
@ -758,30 +756,6 @@ Vectorized<float> inline fmadd(
|
||||
return svmad_f32_x(ptrue, a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fnmadd(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
const Vectorized<float>& c) {
|
||||
return svmsb_f32_x(ptrue, a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fmsub(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
const Vectorized<float>& c) {
|
||||
return svnmsb_f32_x(ptrue, a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fnmsub(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
const Vectorized<float>& c) {
|
||||
return svnmad_f32_x(ptrue, a, b, c);
|
||||
}
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
||||
@ -32,9 +32,7 @@ inline namespace CPU_CAPABILITY {
|
||||
static constexpr size_type size() { \
|
||||
return vl; \
|
||||
} \
|
||||
Vectorized() { \
|
||||
values = svdup_n_s##bit(0); \
|
||||
} \
|
||||
Vectorized() {} \
|
||||
Vectorized(svint##bit##_t v) : values(v) {} \
|
||||
Vectorized(int##bit##_t val) { \
|
||||
values = svdup_n_s##bit(val); \
|
||||
|
||||
@ -552,15 +552,6 @@ Vectorized<c10::BFloat16> inline fmadd(
|
||||
return a * b + c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::BFloat16> inline fnmadd(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
// See NOTE [BF16 FMA] above.
|
||||
return -a * b + c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::BFloat16> inline fmsub(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
@ -570,15 +561,6 @@ Vectorized<c10::BFloat16> inline fmsub(
|
||||
return a * b - c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::BFloat16> inline fnmsub(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
// See NOTE [BF16 FMA] above.
|
||||
return -a * b - c;
|
||||
}
|
||||
|
||||
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
||||
@ -83,9 +83,7 @@ class Vectorized<float> {
|
||||
static constexpr size_type size() {
|
||||
return 4;
|
||||
}
|
||||
Vectorized() {
|
||||
values = vmovq_n_f32(0);
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(float32x4_t v) : values(v) {}
|
||||
Vectorized(float val) : values{vdupq_n_f32(val)} {}
|
||||
Vectorized(float val0, float val1, float val2, float val3)
|
||||
@ -584,14 +582,6 @@ Vectorized<float> inline fmadd(
|
||||
return Vectorized<float>(vfmaq_f32(c, a, b));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fnmadd(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
const Vectorized<float>& c) {
|
||||
return Vectorized<float>(vfmsq_f32(c, a, b));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fmsub(
|
||||
const Vectorized<float>& a,
|
||||
@ -600,14 +590,6 @@ Vectorized<float> inline fmsub(
|
||||
return Vectorized<float>(vnegq_f32(vfmsq_f32(c, a, b)));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fnmsub(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
const Vectorized<float>& c) {
|
||||
return Vectorized<float>(vnegq_f32(vfmaq_f32(c, a, b)));
|
||||
}
|
||||
|
||||
inline Vectorized<float> Vectorized<float>::erf() const {
|
||||
// constants
|
||||
const Vectorized<float> neg_zero_vec(-0.f);
|
||||
|
||||
@ -621,18 +621,6 @@ Vectorized<c10::Half> inline fmadd(
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::Half> inline fnmadd(
|
||||
const Vectorized<c10::Half>& a,
|
||||
const Vectorized<c10::Half>& b,
|
||||
const Vectorized<c10::Half>& c) {
|
||||
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
return Vectorized<c10::Half>(vfmsq_f16(c, a, b));
|
||||
#else
|
||||
return -a * b + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::Half> inline fmsub(
|
||||
const Vectorized<c10::Half>& a,
|
||||
@ -644,18 +632,6 @@ Vectorized<c10::Half> inline fmsub(
|
||||
return a * b - c;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::Half> inline fnmsub(
|
||||
const Vectorized<c10::Half>& a,
|
||||
const Vectorized<c10::Half>& b,
|
||||
const Vectorized<c10::Half>& c) {
|
||||
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
return Vectorized<c10::Half>(vnegq_f16(vfmaq_f16(c, a, b)));
|
||||
#else
|
||||
return -a * b - c;
|
||||
#endif
|
||||
}
|
||||
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
||||
@ -1 +1,396 @@
|
||||
#include <torch/headeronly/cpu/vec/vec256/missing_vld1_neon.h>
|
||||
/* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7. */
|
||||
|
||||
__extension__ extern __inline uint8x8x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_u8_x2(const uint8_t* __a) {
|
||||
uint8x8x2_t ret;
|
||||
asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline int8x8x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_s8_x2(const int8_t* __a) {
|
||||
int8x8x2_t ret;
|
||||
asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint16x4x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_u16_x2(const uint16_t* __a) {
|
||||
uint16x4x2_t ret;
|
||||
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline int16x4x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_s16_x2(const int16_t* __a) {
|
||||
int16x4x2_t ret;
|
||||
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint32x2x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_u32_x2(const uint32_t* __a) {
|
||||
uint32x2x2_t ret;
|
||||
asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline int32x2x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_s32_x2(const int32_t* __a) {
|
||||
int32x2x2_t ret;
|
||||
asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint64x1x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_u64_x2(const uint64_t* __a) {
|
||||
uint64x1x2_t ret;
|
||||
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline int64x1x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_s64_x2(const int64_t* __a) {
|
||||
int64x1x2_t ret;
|
||||
__builtin_aarch64_simd_oi __o;
|
||||
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline float16x4x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_f16_x2(const float16_t* __a) {
|
||||
float16x4x2_t ret;
|
||||
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline float32x2x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_f32_x2(const float32_t* __a) {
|
||||
float32x2x2_t ret;
|
||||
asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline float64x1x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_f64_x2(const float64_t* __a) {
|
||||
float64x1x2_t ret;
|
||||
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline poly8x8x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_p8_x2(const poly8_t* __a) {
|
||||
poly8x8x2_t ret;
|
||||
asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline poly16x4x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_p16_x2(const poly16_t* __a) {
|
||||
poly16x4x2_t ret;
|
||||
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline poly64x1x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1_p64_x2(const poly64_t* __a) {
|
||||
poly64x1x2_t ret;
|
||||
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint8x16x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_u8_x2(const uint8_t* __a) {
|
||||
uint8x16x2_t ret;
|
||||
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline int8x16x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_s8_x2(const int8_t* __a) {
|
||||
int8x16x2_t ret;
|
||||
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint16x8x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_u16_x2(const uint16_t* __a) {
|
||||
uint16x8x2_t ret;
|
||||
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline int16x8x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_s16_x2(const int16_t* __a) {
|
||||
int16x8x2_t ret;
|
||||
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint32x4x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_u32_x2(const uint32_t* __a) {
|
||||
uint32x4x2_t ret;
|
||||
asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline int32x4x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_s32_x2(const int32_t* __a) {
|
||||
int32x4x2_t ret;
|
||||
asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline uint64x2x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_u64_x2(const uint64_t* __a) {
|
||||
uint64x2x2_t ret;
|
||||
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline int64x2x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_s64_x2(const int64_t* __a) {
|
||||
int64x2x2_t ret;
|
||||
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline float16x8x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_f16_x2(const float16_t* __a) {
|
||||
float16x8x2_t ret;
|
||||
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline float32x4x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_f32_x2(const float32_t* __a) {
|
||||
float32x4x2_t ret;
|
||||
asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline float64x2x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_f64_x2(const float64_t* __a) {
|
||||
float64x2x2_t ret;
|
||||
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline poly8x16x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_p8_x2(const poly8_t* __a) {
|
||||
poly8x16x2_t ret;
|
||||
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline poly16x8x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_p16_x2(const poly16_t* __a) {
|
||||
poly16x8x2_t ret;
|
||||
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__extension__ extern __inline poly64x2x2_t
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vld1q_p64_x2(const poly64_t* __a) {
|
||||
poly64x2x2_t ret;
|
||||
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* vst1x2 */
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_s64_x2(int64_t* __a, int64x1x2_t val) {
|
||||
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_u64_x2(uint64_t* __a, uint64x1x2_t val) {
|
||||
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_f64_x2(float64_t* __a, float64x1x2_t val) {
|
||||
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_s8_x2(int8_t* __a, int8x8x2_t val) {
|
||||
asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_p8_x2(poly8_t* __a, poly8x8x2_t val) {
|
||||
asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_s16_x2(int16_t* __a, int16x4x2_t val) {
|
||||
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_p16_x2(poly16_t* __a, poly16x4x2_t val) {
|
||||
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_s32_x2(int32_t* __a, int32x2x2_t val) {
|
||||
asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_u8_x2(uint8_t* __a, uint8x8x2_t val) {
|
||||
asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_u16_x2(uint16_t* __a, uint16x4x2_t val) {
|
||||
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_u32_x2(uint32_t* __a, uint32x2x2_t val) {
|
||||
asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_f16_x2(float16_t* __a, float16x4x2_t val) {
|
||||
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_f32_x2(float32_t* __a, float32x2x2_t val) {
|
||||
asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1_p64_x2(poly64_t* __a, poly64x1x2_t val) {
|
||||
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_s8_x2(int8_t* __a, int8x16x2_t val) {
|
||||
asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_p8_x2(poly8_t* __a, poly8x16x2_t val) {
|
||||
asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_s16_x2(int16_t* __a, int16x8x2_t val) {
|
||||
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_p16_x2(poly16_t* __a, poly16x8x2_t val) {
|
||||
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_s32_x2(int32_t* __a, int32x4x2_t val) {
|
||||
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_s64_x2(int64_t* __a, int64x2x2_t val) {
|
||||
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_u8_x2(uint8_t* __a, uint8x16x2_t val) {
|
||||
asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_u16_x2(uint16_t* __a, uint16x8x2_t val) {
|
||||
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_u32_x2(uint32_t* __a, uint32x4x2_t val) {
|
||||
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_u64_x2(uint64_t* __a, uint64x2x2_t val) {
|
||||
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_f16_x2(float16_t* __a, float16x8x2_t val) {
|
||||
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_f32_x2(float32_t* __a, float32x4x2_t val) {
|
||||
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_f64_x2(float64_t* __a, float64x2x2_t val) {
|
||||
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_p64_x2(poly64_t* __a, poly64x2x2_t val) {
|
||||
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
@ -1 +1,7 @@
|
||||
#include <torch/headeronly/cpu/vec/vec256/missing_vst1_neon.h>
|
||||
/* Workaround for missing vst1q_f32_x2 in gcc-8. */
|
||||
|
||||
__extension__ extern __inline void
|
||||
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
|
||||
vst1q_f32_x2(float32_t* __a, float32x4x2_t val) {
|
||||
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
|
||||
}
|
||||
|
||||
@ -34,9 +34,7 @@ class Vectorized<c10::complex<double>> {
|
||||
static constexpr size_type size() {
|
||||
return 2;
|
||||
}
|
||||
Vectorized() {
|
||||
values = _mm256_setzero_pd();
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(__m256d v) : values(v) {}
|
||||
Vectorized(c10::complex<double> val) {
|
||||
double real_value = val.real();
|
||||
|
||||
@ -33,9 +33,7 @@ class Vectorized<c10::complex<float>> {
|
||||
static constexpr size_type size() {
|
||||
return 4;
|
||||
}
|
||||
Vectorized() {
|
||||
values = _mm256_setzero_ps();
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(__m256 v) : values(v) {}
|
||||
Vectorized(c10::complex<float> val) {
|
||||
float real_value = val.real();
|
||||
|
||||
@ -31,9 +31,7 @@ class Vectorized<double> {
|
||||
static constexpr size_type size() {
|
||||
return 4;
|
||||
}
|
||||
Vectorized() {
|
||||
values = _mm256_setzero_pd();
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(__m256d v) : values(v) {}
|
||||
Vectorized(double val) {
|
||||
values = _mm256_set1_pd(val);
|
||||
@ -495,14 +493,6 @@ Vectorized<double> inline fmadd(
|
||||
return _mm256_fmadd_pd(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline fnmadd(
|
||||
const Vectorized<double>& a,
|
||||
const Vectorized<double>& b,
|
||||
const Vectorized<double>& c) {
|
||||
return _mm256_fnmadd_pd(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline fmsub(
|
||||
const Vectorized<double>& a,
|
||||
@ -510,14 +500,6 @@ Vectorized<double> inline fmsub(
|
||||
const Vectorized<double>& c) {
|
||||
return _mm256_fmsub_pd(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline fnmsub(
|
||||
const Vectorized<double>& a,
|
||||
const Vectorized<double>& b,
|
||||
const Vectorized<double>& c) {
|
||||
return _mm256_fnmsub_pd(a, b, c);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@ -30,9 +30,7 @@ class Vectorized<float> {
|
||||
static constexpr size_type size() {
|
||||
return 8;
|
||||
}
|
||||
Vectorized() {
|
||||
values = _mm256_setzero_ps();
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(__m256 v) : values(v) {}
|
||||
Vectorized(float val) {
|
||||
values = _mm256_set1_ps(val);
|
||||
@ -696,14 +694,6 @@ Vectorized<float> inline fmadd(
|
||||
return _mm256_fmadd_ps(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fnmadd(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
const Vectorized<float>& c) {
|
||||
return _mm256_fnmadd_ps(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fmsub(
|
||||
const Vectorized<float>& a,
|
||||
@ -712,14 +702,6 @@ Vectorized<float> inline fmsub(
|
||||
return _mm256_fmsub_ps(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fnmsub(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
const Vectorized<float>& c) {
|
||||
return _mm256_fnmsub_ps(a, b, c);
|
||||
}
|
||||
|
||||
// TODO: rewrite with ATEN vectorized (need to add unpack and shuffle)
|
||||
// Used by Inductor CPP codegen for micro gemm
|
||||
inline void transpose_block(at::vec::VectorizedN<float, 8>& input) {
|
||||
|
||||
@ -23,9 +23,7 @@ struct Vectorizedi {
|
||||
}
|
||||
|
||||
public:
|
||||
Vectorizedi() {
|
||||
values = _mm256_setzero_si256();
|
||||
}
|
||||
Vectorizedi() {}
|
||||
Vectorizedi(__m256i v) : values(v) {}
|
||||
operator __m256i() const {
|
||||
return values;
|
||||
@ -55,9 +53,7 @@ class Vectorized<int64_t> : public Vectorizedi {
|
||||
return 4;
|
||||
}
|
||||
using Vectorizedi::Vectorizedi;
|
||||
Vectorized() {
|
||||
values = _mm256_setzero_si256();
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(int64_t v) {
|
||||
values = _mm256_set1_epi64x(v);
|
||||
}
|
||||
|
||||
@ -54,9 +54,7 @@ struct Vectorizedqi {
|
||||
#endif
|
||||
|
||||
public:
|
||||
Vectorizedqi() {
|
||||
vals = _mm256_setzero_si256();
|
||||
}
|
||||
Vectorizedqi() {}
|
||||
Vectorizedqi(__m256i v) : vals(v) {}
|
||||
operator __m256i() const {
|
||||
return vals;
|
||||
|
||||
@ -192,9 +192,7 @@ class Vectorized16 {
|
||||
static constexpr size_type size() {
|
||||
return 32;
|
||||
}
|
||||
Vectorized16() {
|
||||
values = _mm512_setzero_si512();
|
||||
}
|
||||
Vectorized16() {}
|
||||
Vectorized16(__m512i v) : values(v) {}
|
||||
Vectorized16(T val) {
|
||||
value_type uw = val.x;
|
||||
|
||||
@ -34,9 +34,7 @@ class Vectorized<c10::complex<double>> {
|
||||
static constexpr size_type size() {
|
||||
return 4;
|
||||
}
|
||||
Vectorized() {
|
||||
values = _mm512_setzero_pd();
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(__m512d v) : values(v) {}
|
||||
Vectorized(c10::complex<double> val) {
|
||||
double real_value = val.real();
|
||||
|
||||
@ -34,9 +34,7 @@ class Vectorized<c10::complex<float>> {
|
||||
static constexpr size_type size() {
|
||||
return 8;
|
||||
}
|
||||
Vectorized() {
|
||||
values = _mm512_setzero_ps();
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(__m512 v) : values(v) {}
|
||||
Vectorized(c10::complex<float> val) {
|
||||
float real_value = val.real();
|
||||
|
||||
@ -34,9 +34,7 @@ class Vectorized<double> {
|
||||
static constexpr size_type size() {
|
||||
return 8;
|
||||
}
|
||||
Vectorized() {
|
||||
values = _mm512_setzero_pd();
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(__m512d v) : values(v) {}
|
||||
Vectorized(double val) {
|
||||
values = _mm512_set1_pd(val);
|
||||
@ -536,14 +534,6 @@ Vectorized<double> inline fmadd(
|
||||
return _mm512_fmadd_pd(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline fnmadd(
|
||||
const Vectorized<double>& a,
|
||||
const Vectorized<double>& b,
|
||||
const Vectorized<double>& c) {
|
||||
return _mm512_fnmadd_pd(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline fmsub(
|
||||
const Vectorized<double>& a,
|
||||
@ -552,14 +542,6 @@ Vectorized<double> inline fmsub(
|
||||
return _mm512_fmsub_pd(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<double> inline fnmsub(
|
||||
const Vectorized<double>& a,
|
||||
const Vectorized<double>& b,
|
||||
const Vectorized<double>& c) {
|
||||
return _mm512_fnmsub_pd(a, b, c);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
||||
@ -32,9 +32,7 @@ class Vectorized<float> {
|
||||
static constexpr size_type size() {
|
||||
return 16;
|
||||
}
|
||||
Vectorized() {
|
||||
values = _mm512_setzero_ps();
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(__m512 v) : values(v) {}
|
||||
Vectorized(float val) {
|
||||
values = _mm512_set1_ps(val);
|
||||
@ -749,14 +747,6 @@ Vectorized<float> inline fmadd(
|
||||
return _mm512_fmadd_ps(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fnmadd(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
const Vectorized<float>& c) {
|
||||
return _mm512_fnmadd_ps(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fmsub(
|
||||
const Vectorized<float>& a,
|
||||
@ -765,14 +755,6 @@ Vectorized<float> inline fmsub(
|
||||
return _mm512_fmsub_ps(a, b, c);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline fnmsub(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
const Vectorized<float>& c) {
|
||||
return _mm512_fnmsub_ps(a, b, c);
|
||||
}
|
||||
|
||||
// TODO: rewrite with ATEN vectorized (need to add unpack and shuffle)
|
||||
// Used by Inductor CPP codegen for micro gemm
|
||||
// Code referred to FBGEMM:
|
||||
|
||||
@ -53,9 +53,7 @@ class Vectorized<int64_t> : public Vectorizedi {
|
||||
return 8;
|
||||
}
|
||||
using Vectorizedi::Vectorizedi;
|
||||
Vectorized() {
|
||||
values = _mm512_setzero_si512();
|
||||
}
|
||||
Vectorized() {}
|
||||
Vectorized(int64_t v) {
|
||||
values = _mm512_set1_epi64(v);
|
||||
}
|
||||
|
||||
@ -55,9 +55,7 @@ struct Vectorizedqi {
|
||||
#endif
|
||||
|
||||
public:
|
||||
Vectorizedqi() {
|
||||
vals = _mm512_setzero_si512();
|
||||
}
|
||||
Vectorizedqi() {}
|
||||
Vectorizedqi(__m512i v) : vals(v) {}
|
||||
operator __m512i() const {
|
||||
return vals;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user