Compare commits

..

1 Commits

Author SHA1 Message Date
cf1a2abf35 force kwarg=copy on .to in proxy_call in export 2025-07-24 16:48:22 -07:00
1029 changed files with 15252 additions and 36339 deletions

View File

@ -12,7 +12,7 @@ fi
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
source $SCRIPTPATH/../manywheel/set_desired_python.sh
pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2
pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1.4 patchelf==0.17.2
for tool in python python3 pip pip3 ninja scons patchelf; do
ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;

View File

@ -36,104 +36,3 @@ See `build.sh` for valid build environments (it's the giant switch).
# Set flags (see build.sh) and build image
sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
```
## [Guidance] Adding a New Base Docker Image
### Background
The base Docker images in directory `.ci/docker/` are built by the `docker-builds.yml` workflow. Those images are used throughout the PyTorch CI/CD pipeline. You should only create or modify a base Docker image if you need specific environment changes or dependencies before building PyTorch on CI.
1. **Automatic Rebuilding**:
- The Docker image building process is triggered automatically when changes are made to files in the `.ci/docker/*` directory
- This ensures all images stay up-to-date with the latest dependencies and configurations
2. **Image Reuse in PyTorch Build Workflows** (example: linux-build):
- The images generated by `docker-builds.yml` are reused in `_linux-build.yml` through the `calculate-docker-image` step
- The `_linux-build.yml` workflow:
- Pulls the Docker image determined by the `calculate-docker-image` step
- Runs a Docker container with that image
- Executes `.ci/pytorch/build.sh` inside the container to build PyTorch
3. **Usage in Test Workflows** (example: linux-test):
- The same Docker images are also used in `_linux-test.yml` for running tests
- The `_linux-test.yml` workflow follows a similar pattern:
- It uses the `calculate-docker-image` step to determine which Docker image to use
- It pulls the Docker image and runs a container with that image
- It installs the wheels from the artifacts generated by PyTorch build jobs
- It executes test scripts (like `.ci/pytorch/test.sh` or `.ci/pytorch/multigpu-test.sh`) inside the container
### Understanding File Purposes
#### `.ci/docker/build.sh` vs `.ci/pytorch/build.sh`
- **`.ci/docker/build.sh`**:
- Used for building base Docker images
- Executed by the `docker-builds.yml` workflow to pre-build Docker images for CI
- Contains configurations for different Docker build environments
- **`.ci/pytorch/build.sh`**:
- Used for building PyTorch inside a Docker container
- Called by workflows like `_linux-build.yml` after the Docker container is started
- Builds PyTorch wheels and other artifacts
#### `.ci/docker/ci_commit_pins/` vs `.github/ci_commit_pins`
- **`.ci/docker/ci_commit_pins/`**:
- Used for pinning dependency versions during base Docker image building
- Ensures consistent environments for building PyTorch
- Changes here trigger base Docker image rebuilds
- **`.github/ci_commit_pins`**:
- Used for pinning dependency versions during PyTorch building and tests
- Ensures consistent dependencies for PyTorch across different builds
- Used by build scripts running inside Docker containers
### Step-by-Step Guide for Adding a New Base Docker Image
#### 1. Add Pinned Commits (If Applicable)
We use pinned commits for build stability. The `nightly.yml` workflow checks and updates pinned commits for certain repository dependencies daily.
If your new Docker image needs a library installed from a specific pinned commit or built from source:
1. Add the repository you want to track in `nightly.yml` and `merge-rules.yml`
2. Add the initial pinned commit in `.ci/docker/ci_commit_pins/`. The text filename should match the one defined in step 1
#### 2. Configure the Base Docker Image
1. **Add new Base Docker image configuration** (if applicable):
Add the configuration in `.ci/docker/build.sh`. For example:
```bash
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
NEW_ARG_1=yes
;;
```
2. **Add build arguments to Docker build command**:
If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:
```bash
docker build \
....
--build-arg "NEW_ARG_1=${NEW_ARG_1}"
```
3. **Update Dockerfile logic**:
Update the Dockerfile to use the new argument. For example, in `ubuntu/Dockerfile`:
```dockerfile
ARG NEW_ARG_1
# Set up environment for NEW_ARG_1
RUN if [ -n "${NEW_ARG_1}" ]; then bash ./do_something.sh; fi
```
4. **Add the Docker configuration** in `.github/workflows/docker-builds.yml`:
The `docker-builds.yml` workflow pre-builds the Docker images whenever changes occur in the `.ci/docker/` directory. This includes the
pinned commit updates.

View File

@ -93,6 +93,7 @@ tag=$(echo $image | awk -F':' '{print $2}')
case "$tag" in
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11)
CUDA_VERSION=12.4
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
@ -103,6 +104,7 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
@ -113,6 +115,7 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
@ -124,6 +127,7 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
@ -135,6 +139,7 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
@ -146,6 +151,7 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
CUDA_VERSION=12.6.3
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
@ -154,18 +160,9 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
@ -177,6 +174,7 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
@ -188,6 +186,7 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
@ -199,6 +198,7 @@ case "$tag" in
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
@ -276,7 +276,7 @@ case "$tag" in
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
@ -288,6 +288,7 @@ case "$tag" in
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
CLANG_VERSION=12
VISION=yes
TRITON=yes
@ -366,6 +367,7 @@ case "$tag" in
fi
if [[ "$image" == *cuda* ]]; then
extract_version_from_image_name cuda CUDA_VERSION
extract_version_from_image_name cudnn CUDNN_VERSION
fi
if [[ "$image" == *rocm* ]]; then
extract_version_from_image_name rocm ROCM_VERSION
@ -417,6 +419,9 @@ docker build \
--build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
--build-arg "GCC_VERSION=${GCC_VERSION}" \
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
--build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
--build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
--build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
--build-arg "KATEX=${KATEX:-}" \
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \

View File

@ -0,0 +1,26 @@
#!/bin/bash
if [[ -n "${CUDNN_VERSION}" ]]; then
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
else
print "Unsupported CUDA version ${CUDA_VERSION}"
exit 1
fi
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
tar xf ${CUDNN_NAME}.tar.xz
cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
popd
rm -rf tmp_cudnn
ldconfig
fi

View File

@ -30,7 +30,7 @@ EOF
# we want the patch version of 6.4 instead
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
ROCM_VERSION="${ROCM_VERSION}.2"
ROCM_VERSION="${ROCM_VERSION}.1"
fi
# Default url values
@ -85,19 +85,16 @@ EOF
# CI no longer builds for ROCm 6.3, but
# ROCm 6.4 did not yet fix the regression, also HIP branch names are different
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.4) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.2) ]]; then
HIP_TAG=rocm-6.4.2
CLR_HASH=74d78ba3ac4bac235d02bcb48511c30b5cfdd457 # branch release/rocm-rel-6.4.2-statco-hotfix
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
HIP_TAG=rocm-6.4.1
CLR_HASH=efe6c35790b9206923bfeed1209902feff37f386 # branch release/rocm-rel-6.4.1-statco-hotfix
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
HIP_BRANCH=release/rocm-rel-6.4
CLR_HASH=ca18eb3f77fa09292fcda62bc60c3e565d752ada # branch release/rocm-rel-6.4.1-statco-hotfix
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
HIP_TAG=rocm-6.4.0
HIP_BRANCH=release/rocm-rel-6.4
CLR_HASH=600f5b0d2baed94d5121e2174a9de0851b040b0c # branch release/rocm-rel-6.4-statco-hotfix
fi
# clr build needs CppHeaderParser but can only find it using conda's python
python -m pip install CppHeaderParser
git clone https://github.com/ROCm/HIP -b $HIP_TAG
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
HIP_COMMON_DIR=$(readlink -f HIP)
git clone https://github.com/jeffdaily/clr
pushd clr

View File

@ -103,5 +103,5 @@ fi
# It depends on torch and triton. We don't want to install
# triton and torch from production on Docker CI images
if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then
pip_install helion==0.0.10 --no-deps
pip_install helion --no-deps
fi

View File

@ -41,7 +41,7 @@ case ${DOCKER_TAG_PREFIX} in
rocm*)
# we want the patch version of 6.4 instead
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
fi
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete

View File

@ -128,7 +128,7 @@ ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
# Install setuptools and wheel for python 3.12/3.13
RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
/opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \
/opt/python/${cpython_version}/bin/python -m pip install "setuptools>=77.0.0" "packaging>=24.2" wheel; \
done;

View File

@ -124,10 +124,9 @@ RUN python3 -mpip install cmake==3.28.0
# install newest flatbuffers version first:
# for some reason old version is getting pulled in otherwise.
# packaging package is required for onnxruntime wheel build.
RUN pip3 install flatbuffers && \
pip3 install cython 'pkgconfig>=1.5.5' 'setuptools>=77' 'numpy<2.3.0' && \
RUN pip3 install 'setuptools>=77.0' 'packaging>=24.2' && \
pip3 install flatbuffers cython 'pkgconfig>=1.5.5' 'numpy<2.3.0' && \
pip3 install --no-build-isolation h5py==3.11.0 && \
pip3 install packaging && \
git clone https://github.com/microsoft/onnxruntime && \
cd onnxruntime && git checkout v1.21.0 && \
git submodule update --init --recursive && \

View File

@ -77,7 +77,7 @@ case ${image} in
manylinux2_28-builder:rocm*)
# we want the patch version of 6.4 instead
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
fi
TARGET=rocm_final
MANY_LINUX_VERSION="2_28"

View File

@ -104,10 +104,10 @@ networkx==2.8.8
#Pinned versions: 2.8.8
#test that import: functorch
ninja==1.11.1.3
ninja==1.11.1.4
#Description: build system. Used in some tests. Used in build to generate build
#time tracing information
#Pinned versions: 1.11.1.3
#Pinned versions: 1.11.1.4
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
numba==0.49.0 ; python_version < "3.9"
@ -221,9 +221,9 @@ pygments==2.15.0
#Pinned versions: 2.12.0
#test that import: the doctests
#pyyaml
#PyYAML
#Description: data serialization format
#Pinned versions: 6.0.2
#Pinned versions:
#test that import:
#requests
@ -233,7 +233,7 @@ pygments==2.15.0
#rich
#Description: rich text and beautiful formatting in the terminal
#Pinned versions: 14.1.0
#Pinned versions: 10.9.0
#test that import:
scikit-image==0.19.3 ; python_version < "3.10"
@ -363,9 +363,10 @@ pwlf==2.2.1
# To build PyTorch itself
packaging>=24.2
pyyaml
pyzstd
setuptools>=70.1.0
setuptools>=77.0.0
six
scons==4.5.2 ; platform_machine == "aarch64"

View File

@ -4,8 +4,8 @@ sphinx==5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.
# but it doesn't seem to work and hangs around idly. The initial thought it is probably
# something related to Docker setup. We can investigate this later
sphinxcontrib.katex==0.8.6
#Description: This is used to generate PyTorch docs
@ -50,8 +50,8 @@ IPython==8.12.0
#Pinned versions: 8.12.0
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs
#Pinned versions: 0.17.2
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 0.13.2
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
@ -59,3 +59,4 @@ sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==0.18.1
myst-nb

34
.ci/pytorch/build-mobile.sh Executable file
View File

@ -0,0 +1,34 @@
#!/usr/bin/env bash
# DO NOT ADD 'set -x' not to reveal CircleCI secret context environment variables
set -eu -o pipefail
# This script uses linux host toolchain + mobile build options in order to
# build & test mobile libtorch without having to setup Android/iOS
# toolchain/simulator.
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
# Install torch & torchvision - used to download & trace test model.
# Ideally we should use the libtorch built on the PR so that backward
# incompatible changes won't break this script - but it will significantly slow
# down mobile CI jobs.
# Here we install nightly instead of stable so that we have an option to
# temporarily skip mobile CI jobs on BC-breaking PRs until they are in nightly.
retry pip install --pre torch torchvision \
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html \
--progress-bar off
# Run end-to-end process of building mobile library, linking into the predictor
# binary, and running forward pass with a real model.
if [[ "$BUILD_ENVIRONMENT" == *-mobile-custom-build-static* ]]; then
TEST_CUSTOM_BUILD_STATIC=1 test/mobile/custom_build/build.sh
elif [[ "$BUILD_ENVIRONMENT" == *-mobile-lightweight-dispatch* ]]; then
test/mobile/lightweight_dispatch/build.sh
else
TEST_DEFAULT_BUILD=1 test/mobile/custom_build/build.sh
fi
print_sccache_stats

View File

@ -11,6 +11,10 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
if [[ "$BUILD_ENVIRONMENT" == *-mobile-*build* ]]; then
exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile.sh" "$@"
fi
echo "Python version:"
python --version
@ -120,8 +124,26 @@ if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then
fi
# Use special scripts for Android builds
if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
export ANDROID_NDK=/opt/ndk
build_args=()
if [[ "${BUILD_ENVIRONMENT}" == *-arm-v7a* ]]; then
build_args+=("-DANDROID_ABI=armeabi-v7a")
elif [[ "${BUILD_ENVIRONMENT}" == *-arm-v8a* ]]; then
build_args+=("-DANDROID_ABI=arm64-v8a")
elif [[ "${BUILD_ENVIRONMENT}" == *-x86_32* ]]; then
build_args+=("-DANDROID_ABI=x86")
elif [[ "${BUILD_ENVIRONMENT}" == *-x86_64* ]]; then
build_args+=("-DANDROID_ABI=x86_64")
fi
if [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
build_args+=("-DUSE_VULKAN=ON")
fi
build_args+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")
exec ./scripts/build_android.sh "${build_args[@]}" "$@"
fi
if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *android* && "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
export USE_VULKAN=1
# shellcheck disable=SC1091
source /var/lib/jenkins/vulkansdk/setup-env.sh
@ -203,7 +225,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then
export USE_PRECOMPILED_HEADERS=1
fi
if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
fi
@ -247,6 +269,9 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" //...
fi
else
# install build-system requirements before running setup.py commands
python -m pip install -r requirements-build.txt
# check that setup.py would fail with bad arguments
echo "The next three invocations are expected to fail with invalid command error messages."
( ! get_exit_code python setup.py bad_argument )

View File

@ -204,32 +204,8 @@ function install_torchrec_and_fbgemm() {
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_uninstall fbgemm-gpu-nightly
# Set ROCM_HOME isn't available, use ROCM_PATH if set or /opt/rocm
ROCM_HOME="${ROCM_HOME:-${ROCM_PATH:-/opt/rocm}}"
# Find rocm_version.h header file for ROCm version extract
rocm_version_h="${ROCM_HOME}/include/rocm-core/rocm_version.h"
if [ ! -f "$rocm_version_h" ]; then
rocm_version_h="${ROCM_HOME}/include/rocm_version.h"
fi
# Error out if rocm_version.h not found
if [ ! -f "$rocm_version_h" ]; then
echo "Error: rocm_version.h not found in expected locations." >&2
exit 1
fi
# Extract major, minor and patch ROCm version numbers
MAJOR_VERSION=$(grep 'ROCM_VERSION_MAJOR' "$rocm_version_h" | awk '{print $3}')
MINOR_VERSION=$(grep 'ROCM_VERSION_MINOR' "$rocm_version_h" | awk '{print $3}')
PATCH_VERSION=$(grep 'ROCM_VERSION_PATCH' "$rocm_version_h" | awk '{print $3}')
ROCM_INT=$((MAJOR_VERSION * 10000 + MINOR_VERSION * 100 + PATCH_VERSION))
echo "ROCm version: $ROCM_INT"
export BUILD_ROCM_VERSION="$MAJOR_VERSION.$MINOR_VERSION"
pip_install tabulate # needed for newer fbgemm
pip_install patchelf # needed for rocm fbgemm
pushd /tmp
local wheel_dir=dist/fbgemm_gpu
local found_whl=0
@ -247,7 +223,7 @@ function install_torchrec_and_fbgemm() {
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}"
python setup.py bdist_wheel \
--build-variant=rocm \
--package_variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
@ -264,7 +240,6 @@ function install_torchrec_and_fbgemm() {
done
rm -rf fbgemm
popd
else
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu

View File

@ -0,0 +1,123 @@
from datetime import datetime, timedelta, timezone
from tempfile import mkdtemp
from cryptography import x509
from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.x509.oid import NameOID
temp_dir = mkdtemp()
print(temp_dir)
def genrsa(path):
key = rsa.generate_private_key(
public_exponent=65537,
key_size=2048,
)
with open(path, "wb") as f:
f.write(
key.private_bytes(
encoding=serialization.Encoding.PEM,
format=serialization.PrivateFormat.TraditionalOpenSSL,
encryption_algorithm=serialization.NoEncryption(),
)
)
return key
def create_cert(path, C, ST, L, O, key):
subject = issuer = x509.Name(
[
x509.NameAttribute(NameOID.COUNTRY_NAME, C),
x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST),
x509.NameAttribute(NameOID.LOCALITY_NAME, L),
x509.NameAttribute(NameOID.ORGANIZATION_NAME, O),
]
)
cert = (
x509.CertificateBuilder()
.subject_name(subject)
.issuer_name(issuer)
.public_key(key.public_key())
.serial_number(x509.random_serial_number())
.not_valid_before(datetime.now(timezone.utc))
.not_valid_after(
# Our certificate will be valid for 10 days
datetime.now(timezone.utc) + timedelta(days=10)
)
.add_extension(
x509.BasicConstraints(ca=True, path_length=None),
critical=True,
)
.sign(key, hashes.SHA256())
)
# Write our certificate out to disk.
with open(path, "wb") as f:
f.write(cert.public_bytes(serialization.Encoding.PEM))
return cert
def create_req(path, C, ST, L, O, key):
csr = (
x509.CertificateSigningRequestBuilder()
.subject_name(
x509.Name(
[
# Provide various details about who we are.
x509.NameAttribute(NameOID.COUNTRY_NAME, C),
x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST),
x509.NameAttribute(NameOID.LOCALITY_NAME, L),
x509.NameAttribute(NameOID.ORGANIZATION_NAME, O),
]
)
)
.sign(key, hashes.SHA256())
)
with open(path, "wb") as f:
f.write(csr.public_bytes(serialization.Encoding.PEM))
return csr
def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key):
cert = (
x509.CertificateBuilder()
.subject_name(csr_cert.subject)
.issuer_name(ca_cert.subject)
.public_key(csr_cert.public_key())
.serial_number(x509.random_serial_number())
.not_valid_before(datetime.now(timezone.utc))
.not_valid_after(
# Our certificate will be valid for 10 days
datetime.now(timezone.utc) + timedelta(days=10)
# Sign our certificate with our private key
)
.sign(private_ca_key, hashes.SHA256())
)
with open(path, "wb") as f:
f.write(cert.public_bytes(serialization.Encoding.PEM))
return cert
ca_key = genrsa(temp_dir + "/ca.key")
ca_cert = create_cert(
temp_dir + "/ca.pem",
"US",
"New York",
"New York",
"Gloo Certificate Authority",
ca_key,
)
pkey = genrsa(temp_dir + "/pkey.key")
csr = create_req(
temp_dir + "/csr.csr",
"US",
"California",
"San Francisco",
"Gloo Testing Company",
pkey,
)
cert = sign_certificate_request(temp_dir + "/cert.pem", csr, ca_cert, ca_key)

18
.ci/pytorch/run_glootls_test.sh Executable file
View File

@ -0,0 +1,18 @@
#!/bin/bash
CREATE_TEST_CERT="$(dirname "${BASH_SOURCE[0]}")/create_test_cert.py"
TMP_CERT_DIR=$(python "$CREATE_TEST_CERT")
openssl verify -CAfile "${TMP_CERT_DIR}/ca.pem" "${TMP_CERT_DIR}/cert.pem"
export GLOO_DEVICE_TRANSPORT=TCP_TLS
export GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY=${TMP_CERT_DIR}/pkey.key
export GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT=${TMP_CERT_DIR}/cert.pem
export GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE=${TMP_CERT_DIR}/ca.pem
time python test/run_test.py --include distributed/test_c10d_gloo --verbose -- ProcessGroupGlooTest
unset GLOO_DEVICE_TRANSPORT
unset GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY
unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT
unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE

View File

@ -35,7 +35,14 @@ MODULES = [
"smoke_test": "./vision/test/smoke_test.py",
"extension": "extension",
"repo_name": "vision",
}
},
{
"name": "torchaudio",
"repo": "https://github.com/pytorch/audio.git",
"smoke_test": "./audio/test/smoke_test/smoke_test.py --no-ffmpeg",
"extension": "_extension",
"repo_name": "audio",
},
]
@ -378,29 +385,6 @@ def smoke_test_compile(device: str = "cpu") -> None:
x_pt2 = torch.compile(model, mode="max-autotune")(x)
def smoke_test_nvshmem() -> None:
if not torch.cuda.is_available():
print("CUDA is not available, skipping NVSHMEM test")
return
# Check if NVSHMEM is compiled in current build
try:
from torch._C._distributed_c10d import _is_nvshmem_available
except ImportError:
# Not built with NVSHMEM support.
# torch is not compiled with NVSHMEM prior to 2.9
if torch.__version__ < "2.9":
return
else:
# After 2.9: NVSHMEM is expected to be compiled in current build
raise RuntimeError("torch not compiled with NVSHMEM") from None
print("torch compiled with NVSHMEM")
# Check if NVSHMEM is available on current system.
print(f"NVSHMEM available at run time: {_is_nvshmem_available()}")
def smoke_test_modules():
cwd = os.getcwd()
for module in MODULES:
@ -495,8 +479,6 @@ def main() -> None:
options.pypi_pkg_check,
)
smoke_test_nvshmem()
if __name__ == "__main__":
main()

View File

@ -201,7 +201,7 @@ fi
if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
# JIT C++ extensions require ninja.
pip_install "ninja==1.10.2"
pip_install "ninja==1.11.1.4"
# ninja is installed in $HOME/.local/bin, e.g., /var/lib/jenkins/.local/bin for CI user jenkins
# but this script should be runnable by any user, including root
export PATH="$HOME/.local/bin:$PATH"
@ -345,12 +345,6 @@ test_h100_symm_mem() {
assert_git_not_dirty
}
test_h100_cutlass_backend() {
# cutlass backend tests for H100
TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_backend -k "not addmm" $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_evt $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
}
test_lazy_tensor_meta_reference_disabled() {
export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
echo "Testing lazy tensor operations without meta reference"
@ -365,6 +359,7 @@ test_dynamo_wrapped_shard() {
exit 1
fi
python tools/dynamo/verify_dynamo.py
python tools/dynamo/gb_id_mapping.py verify
# PLEASE DO NOT ADD ADDITIONAL EXCLUDES HERE.
# Instead, use @skipIfTorchDynamo on your tests.
time python test/run_test.py --dynamo \
@ -462,7 +457,7 @@ test_inductor_aoti() {
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
}
test_inductor_cpp_wrapper_shard() {
@ -928,6 +923,12 @@ test_torchbench_gcp_smoketest(){
popd
}
test_python_gloo_with_tls() {
source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh"
assert_git_not_dirty
}
test_aten() {
# Test ATen
# The following test(s) of ATen have already been skipped by caffe2 in rocm environment:
@ -974,8 +975,6 @@ test_without_numpy() {
if [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')"
fi
# Regression test for https://github.com/pytorch/pytorch/pull/157734 (torch.onnx should be importable without numpy)
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch; import torch.onnx"
popd
}
@ -1320,13 +1319,10 @@ EOF
# Step 2. Make sure that the public API test "test_correct_module_names" fails when an existing
# file is modified to introduce an invalid public API function.
# The filepath here must not have __all__ defined in it, otherwise the test will pass.
# If your PR introduces __all__ to torch/cuda/streams.py please point this to another file
# that does not have __all__ defined.
EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/cuda/streams.py"
EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/nn/parameter.py"
cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig"
echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}"
invalid_api="torch.cuda.streams.new_public_func"
invalid_api="torch.nn.parameter.new_public_func"
echo "Appended an invalid public API function to existing file ${EXISTING_FILEPATH}..."
check_public_api_test_fails \
@ -1560,7 +1556,7 @@ test_executorch() {
test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops test_cpp_extensions_open_device_registration \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
# Dynamo tests
@ -1773,8 +1769,6 @@ elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
test_h100_cutlass_backend
else
install_torchvision
install_monkeytype

View File

@ -1,34 +0,0 @@
# If you want to rebuild, run this with $env:REBUILD=1
# If you want to build with CUDA, run this with $env:USE_CUDA=1
# If you want to build without CUDA, run this with $env:USE_CUDA=0
# Check for setup.py in the current directory
if (-not (Test-Path "setup.py")) {
Write-Host "ERROR: Please run this build script from PyTorch root directory."
exit 1
}
# Get the script's parent directory
$ScriptParentDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
# Set TMP_DIR and convert to Windows path
$env:TMP_DIR = Join-Path (Get-Location) "build\win_tmp"
$env:TMP_DIR_WIN = $env:TMP_DIR # Already in Windows format, no cygpath needed
# Set final package directory with default fallback
if (-not $env:PYTORCH_FINAL_PACKAGE_DIR) {
$env:PYTORCH_FINAL_PACKAGE_DIR = "C:\w\build-results"
}
# Create the final package directory if it doesn't exist
if (-not (Test-Path $env:PYTORCH_FINAL_PACKAGE_DIR)) {
New-Item -Path $env:PYTORCH_FINAL_PACKAGE_DIR -ItemType Directory -Force | Out-Null
}
# Set script helpers directory
$env:SCRIPT_HELPERS_DIR = Join-Path $ScriptParentDir "win-test-helpers\arm64"
# Run the main build script
& "$env:SCRIPT_HELPERS_DIR\build_pytorch.ps1"
Write-Host "BUILD PASSED"

View File

@ -1,24 +0,0 @@
#!/bin/bash
set -ex -o pipefail
SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
# shellcheck source=./common.sh
source "$SCRIPT_PARENT_DIR/common.sh"
run_tests() {
echo Running smoke_test.py...
python ./.ci/pytorch/smoke_test/smoke_test.py --package torchonly
echo Running test_autograd.oy, test_nn.py, test_torch.py...
cd test
CORE_TEST_LIST=("test_autograd.py" "test_nn.py" "test_modules.py")
for t in "${CORE_TEST_LIST[@]}"; do
echo "Running test: $t"
python "$t" --verbose --save-xml --use-pytest -vvvv -rfEsxXP -p no:xdist
done
}
run_tests
echo "TEST PASSED"

View File

@ -1,98 +0,0 @@
# TODO: we may can use existing build_pytorch.bat for arm64
if ($env:DEBUG -eq "1") {
$env:BUILD_TYPE = "debug"
} else {
$env:BUILD_TYPE = "release"
}
# This inflates our log size slightly, but it is REALLY useful to be
# able to see what our cl.exe commands are. (since you can actually
# just copy-paste them into a local Windows setup to just rebuild a
# single file.)
# log sizes are too long, but leaving this here in case someone wants to use it locally
# $env:CMAKE_VERBOSE_MAKEFILE = "1"
$env:INSTALLER_DIR = Join-Path $env:SCRIPT_HELPERS_DIR "installation-helpers"
cd ..
# Environment variables
$env:SCCACHE_IDLE_TIMEOUT = "0"
$env:SCCACHE_IGNORE_SERVER_IO_ERROR = "1"
$env:CMAKE_BUILD_TYPE = $env:BUILD_TYPE
$env:CMAKE_C_COMPILER_LAUNCHER = "sccache"
$env:CMAKE_CXX_COMPILER_LAUNCHER = "sccache"
$env:libuv_ROOT = Join-Path $env:DEPENDENCIES_DIR "libuv\install"
$env:MSSdk = "1"
if ($env:PYTORCH_BUILD_VERSION) {
$env:PYTORCH_BUILD_VERSION = $env:PYTORCH_BUILD_VERSION
$env:PYTORCH_BUILD_NUMBER = "1"
}
$env:CMAKE_POLICY_VERSION_MINIMUM = "3.5"
# Set BLAS type
if ($env:ENABLE_APL -eq "1") {
$env:BLAS = "APL"
$env:USE_LAPACK = "1"
} elseif ($env:ENABLE_OPENBLAS -eq "1") {
$env:BLAS = "OpenBLAS"
$env:OpenBLAS_HOME = Join-Path $env:DEPENDENCIES_DIR "OpenBLAS\install"
}
# Change to source directory
Set-Location $env:PYTORCH_ROOT
# Copy libuv.dll
Copy-Item -Path (Join-Path $env:libuv_ROOT "lib\Release\uv.dll") -Destination "torch\lib\uv.dll" -Force
# Create virtual environment
python -m venv .venv
.\.venv\Scripts\Activate.ps1
where.exe python
# Python install dependencies
python -m pip install --upgrade pip
pip install setuptools pyyaml
pip install -r requirements.txt
# Set after installing psutil
$env:DISTUTILS_USE_SDK = "1"
# Print all environment variables
Get-ChildItem Env:
# Start and inspect sccache
sccache --start-server
sccache --zero-stats
sccache --show-stats
# Build the wheel
python setup.py bdist_wheel
if ($LASTEXITCODE -ne 0) { exit 1 }
# Install the wheel locally
$whl = Get-ChildItem -Path "dist\*.whl" | Select-Object -First 1
if ($whl) {
python -mpip install --no-index --no-deps $whl.FullName
}
# Copy final wheel
robocopy "dist" "$env:PYTORCH_FINAL_PACKAGE_DIR" *.whl
# Export test times
python tools/stats/export_test_times.py
# Copy additional CI files
robocopy ".additional_ci_files" "$env:PYTORCH_FINAL_PACKAGE_DIR\.additional_ci_files" /E
# Save ninja log
Copy-Item -Path "build\.ninja_log" -Destination $env:PYTORCH_FINAL_PACKAGE_DIR -Force
# Final sccache stats and stop
sccache --show-stats
sccache --stop-server
exit 0

View File

@ -126,6 +126,11 @@ if "%USE_CUDA%"=="1" (
set CMAKE_CUDA_COMPILER_LAUNCHER=%TMP_DIR%/bin/randomtemp.exe;%TMP_DIR%\bin\sccache.exe
)
:: Install build-system requirements before running setup.py commands
python -m pip install -r requirements-build.txt
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
:: Print all existing environment variable for debugging
set

View File

@ -18,5 +18,5 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t
if errorlevel 1 exit /b 1
set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%"
%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel
%PYTHON_EXEC% -m pip install --upgrade pip "setuptools>=77.0.0" "packaging>=24.2" wheel
if errorlevel 1 exit /b 1

View File

@ -7,6 +7,9 @@ call "internal\install_python.bat"
%PYTHON_EXEC% --version
set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%"
%PYTHON_EXEC% -m pip install "setuptools>=77.0.0" "packaging>=24.2"
if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake
if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake
if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
@ -16,7 +19,7 @@ if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
%PYTHON_EXEC% -m pip install pyyaml
%PYTHON_EXEC% -m pip install mkl-include mkl-static
%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0
%PYTHON_EXEC% -m pip install boto3 ninja typing-extensions
where cmake.exe

View File

@ -127,7 +127,7 @@ export INSTALL_TEST=0 # dont install test binaries into site-packages
export MACOSX_DEPLOYMENT_TARGET=10.15
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
SETUPTOOLS_PINNED_VERSION="==70.1.0"
SETUPTOOLS_PINNED_VERSION="==77.0.0"
PYYAML_PINNED_VERSION="=5.3"
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
@ -135,7 +135,7 @@ RENAME_WHEEL=true
case $desired_python in
3.13t)
echo "Using 3.13 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="=2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
@ -145,31 +145,31 @@ case $desired_python in
;;
3.13)
echo "Using 3.13 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="=2.1.0"
;;
3.12)
echo "Using 3.12 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="=2.0.2"
;;
3.11)
echo "Using 3.11 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
PYYAML_PINNED_VERSION=">=5.3"
NUMPY_PINNED_VERSION="=2.0.2"
;;
3.10)
echo "Using 3.10 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
PYYAML_PINNED_VERSION=">=5.3"
NUMPY_PINNED_VERSION="=2.0.2"
;;
3.9)
echo "Using 3.9 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
SETUPTOOLS_PINNED_VERSION=">=77.0.0"
PYYAML_PINNED_VERSION=">=5.3"
NUMPY_PINNED_VERSION="=2.0.2"
;;

View File

@ -7,12 +7,12 @@ max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907
# these ignores are from flake8-comprehensions; please fix!
C407,
# these ignores are from flake8-logging-format; please fix!

View File

@ -53,12 +53,16 @@ self-hosted-runner:
- linux.rocm.gpu.mi250
- linux.rocm.gpu.2
- linux.rocm.gpu.4
# gfx942 runners
- linux.rocm.gpu.gfx942.2
- linux.rocm.gpu.gfx942.4
# MI300 runners
- linux.rocm.gpu.mi300.2
- linux.rocm.gpu.mi300.4
- rocm-docker
# Repo-specific Apple hosted runners
- macos-m1-ultra
- macos-m2-14
# Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
- macos-m1-stable
- macos-m1-13
- macos-m1-14
# GitHub-hosted MacOS runners
- macos-latest-xlarge

View File

@ -0,0 +1,78 @@
name: build android
description: build android for a specific arch
inputs:
arch:
description: arch to build
required: true
arch-for-build-env:
description: |
arch to pass to build environment.
This is currently different than the arch name we use elsewhere, which
should be fixed.
required: true
github-secret:
description: github token
required: true
build-environment:
required: true
description: Top-level label for what's being built/tested.
docker-image:
required: true
description: Name of the base docker image to build with.
branch:
required: true
description: What branch we are building on.
outputs:
container_id:
description: Docker container identifier used to build the artifacts
value: ${{ steps.build.outputs.container_id }}
runs:
using: composite
steps:
- name: Build-${{ inputs.arch }}
id: build
shell: bash
env:
BRANCH: ${{ inputs.branch }}
BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-${{ inputs.arch-for-build-env }}-build"
AWS_DEFAULT_REGION: us-east-1
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
DOCKER_IMAGE: ${{ inputs.docker-image }}
MATRIX_ARCH: ${{ inputs.arch }}
run: |
# detached container should get cleaned up by teardown_ec2_linux
set -exo pipefail
export container_name
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
-e SHA1 \
-e BRANCH \
-e SCCACHE_BUCKET \
-e SCCACHE_REGION \
-e SKIP_SCCACHE_INITIALIZATION=1 \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--detach \
--user jenkins \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
git submodule sync && git submodule update -q --init --recursive --depth 1
docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace"
(echo "sudo chown -R jenkins . && .ci/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete" | docker exec -u jenkins -i "${container_name}" bash) 2>&1
# Copy install binaries back
mkdir -p "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}"
docker cp "${container_name}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}"
echo "container_id=${container_name}" >> "${GITHUB_OUTPUT}"

View File

@ -70,7 +70,7 @@ runs:
set -eux
# PyYAML 6.0 doesn't work with MacOS x86 anymore
# This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2
python3 -m pip install requests==2.27.1 pyyaml==6.0.2
python3 -m pip install requests==2.27.1 pyyaml==6.0.1
- name: Parse ref
id: parse-ref

View File

@ -1 +1 @@
bf305f538005f2e900f8850ed57146024a8bc559
00b0c91db92c51a11356249262577b9fa26c18c5

View File

@ -1 +1 @@
7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8
5fb5024118e9bb9decf96c2b0b1a8f0010bf56be

View File

@ -1 +0,0 @@
ca9e2be3ed6320b51f52f536595cd24e254f8bb2

View File

@ -1 +1 @@
29ae4c76c026185f417a25e841d2cd5e65f087a3
1c00dea2c9adb2137903c86b4191e8c247f8fda9

View File

@ -76,8 +76,8 @@
- .github/ci_commit_pins/audio.txt
- .github/ci_commit_pins/vision.txt
- .github/ci_commit_pins/torchdynamo.txt
- .github/ci_commit_pins/vllm.txt
- .ci/docker/ci_commit_pins/triton.txt
- .ci/docker/ci_commit_pins/vllm.txt
approved_by:
- pytorchbot
mandatory_checks_name:
@ -131,6 +131,21 @@
- Lint
- pull
- name: Mobile
patterns:
- ios/**
- android/**
- test/mobile/**
approved_by:
- linbinyu
- IvanKobzarev
- dreiss
- raziel
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: PrimTorch
patterns:
- torch/_meta_registrations.py
@ -477,19 +492,6 @@
- srossross
- chillee
- zou3519
- guilhermeleobas
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: Dynamo
patterns:
- torch/_dynamo/**
- torch/csrc/dynamo/**
- test/dynamo/**
approved_by:
- guilhermeleobas
mandatory_checks_name:
- EasyCLA
- Lint

View File

@ -31,9 +31,7 @@ ciflow_push_tags:
- ciflow/pull
- ciflow/h100
- ciflow/h100-distributed
- ciflow/win-arm64
- ciflow/h100-symm-mem
- ciflow/h100-cutlass-backend
retryable_workflows:
- pull
- trunk

View File

@ -7,9 +7,9 @@
# .ci/docker/requirements-ci.txt
boto3==1.35.42
jinja2==3.1.6
lintrunner==0.12.7
ninja==1.10.0.post1
lintrunner==0.10.7
ninja==1.11.1.4
nvidia-ml-py==11.525.84
pyyaml==6.0.2
pyyaml==6.0
requests==2.32.4
rich==14.1.0
rich==10.9.0

View File

@ -2,17 +2,17 @@ boto3==1.35.42
cmake==3.27.*
expecttest==0.3.0
fbscribelogger==0.1.7
filelock==3.18.0
filelock==3.6.0
hypothesis==6.56.4
librosa>=0.6.2
mpmath==1.3.0
networkx==2.8.7
ninja==1.10.2.4
ninja==1.11.1.4
numba==0.59.0
numpy==1.26.4
opt-einsum>=3.3
optree==0.13.0
packaging==23.1
packaging==25.0
parameterized==0.8.1
pillow==10.3.0
protobuf==5.29.4
@ -26,7 +26,7 @@ pytest-xdist==3.3.1
pytest==7.3.2
pyyaml==6.0.2
scipy==1.12.0
setuptools==72.1.0
setuptools==80.9.0
sympy==1.13.3
tlparse==0.3.30
tensorboard==2.13.0

View File

@ -2,7 +2,7 @@
set -ex
# Use uv to speed up lintrunner init
python3 -m pip install -U uv==0.8.* setuptools
python3 -m pip install -U uv setuptools
CACHE_DIRECTORY="/tmp/.lintbin"
# Try to recover the cached binaries

View File

@ -1891,9 +1891,7 @@ def validate_revert(
else pr.get_comment_by_id(comment_id)
)
if comment.editor_login is not None:
raise PostCommentError(
"Halting the revert as the revert comment has been edited."
)
raise PostCommentError("Don't want to revert based on edited command")
author_association = comment.author_association
author_login = comment.author_login
allowed_reverters = ["COLLABORATOR", "MEMBER", "OWNER"]

View File

@ -10,7 +10,7 @@ if "%PY_VERS%" == "3.13t" (
call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS%
)
:: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480
call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja
call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==78.1.1 ninja
dir "%VC_INSTALL_PATH%"

View File

@ -27,7 +27,7 @@ jobs:
PR_NUMBER="${{ github.event.number }}"
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
CHANGED_FILES=$(gh pr view "$PR_NUMBER" --repo "${{ github.repository }}" --json files --jq '.files[].path' | tr '\n' ' ' | sed 's/ $//')
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
@ -40,4 +40,4 @@ jobs:
else
echo "Not in PR context, setting changed files to '*'"
echo "changed-files=*" >> "$GITHUB_OUTPUT"
fi
fi

View File

@ -80,6 +80,11 @@ jobs:
run: |
sysctl machdep.cpu.brand_string kern.osproductversion
- name: Install build toolchain
run: |
brew update --quiet
brew install --formula cmake ninja
- name: Clean up leftover processes on MacOS pet runner
continue-on-error: true
run: |

View File

@ -269,8 +269,8 @@ jobs:
# copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
- name: Change permissions (only needed for kubernetes runners for now)
if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'gfx942') || contains(matrix.runner, 'mi355')) }}
- name: Change permissions (only needed for MI300 runners for now)
if: ${{ always() && steps.test.conclusion && contains(matrix.runner, 'mi300') }}
run: |
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"

View File

@ -50,7 +50,7 @@ jobs:
strategy:
fail-fast: false
matrix:
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ]
device: ["cuda", "rocm", "xpu", "aarch64"]
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
@ -126,12 +126,6 @@ jobs:
3.13t)
PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python
;;
3.14)
PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python
;;
3.14t)
PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python
;;
*)
echo "Unsupported python version ${PY_VERS}"
exit 1

View File

@ -56,7 +56,7 @@ jobs:
cache: pip
architecture: x64
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
shell: bash
- name: Verify mergeability

View File

@ -26,7 +26,7 @@ jobs:
cache: pip
# Not the direct dependencies but the script uses trymerge
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |

View File

@ -50,7 +50,6 @@ jobs:
runner: [linux.12xlarge]
docker-image-name: [
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,

View File

@ -144,7 +144,7 @@ jobs:
run: |
make -f docker.Makefile "${BUILD_IMAGE_TYPE}-image"
- name: Push nightly tags
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.platform == 'linux/amd4' }}
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.build_platforms == 'linux/amd4' }}
run: |
PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime"
CUDA_SUFFIX="-cu${CUDA_VERSION}"

View File

@ -1,58 +0,0 @@
name: Limited CI for CUTLASS backend on H100
on:
pull_request:
paths:
- .github/workflows/h100-cutlass-backend.yml
workflow_dispatch:
schedule:
- cron: 22 9 * * * # every 24 hours about 2:22am PDT
push:
tags:
- ciflow/h100-cutlass-backend/*
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
{ config: "h100_cutlass_backend", shard: 1, num_shards: 1, runner: "linux.aws.h100", owners: ["oncall:pt2"] },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm90-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend.outputs.test-matrix }}
secrets: inherit

View File

@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100
on:
schedule:
- cron: 15 0,12 * * 1-6
- cron: 15 0,4,8,12,16,20 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
@ -126,7 +126,7 @@ jobs:
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '15 0,12 * * 1-6'
if: github.event.schedule == '15 0,4,8,12,16,20 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true

View File

@ -88,23 +88,23 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit

View File

@ -47,8 +47,8 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit

View File

@ -28,6 +28,7 @@ jobs:
# than our AWS macos-m1-14 runners
test-matrix: |
{ include: [
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
{ config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m2-15" },
]}

View File

@ -86,7 +86,7 @@ jobs:
- repo-name: vllm
repo-owner: vllm-project
branch: main
pin-folder: .github/ci_commit_pins
pin-folder: .ci/docker/ci_commit_pins
# Allow this to be triggered on either a schedule or on workflow_dispatch to allow for easier testing
if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
steps:

View File

@ -59,9 +59,9 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit

View File

@ -315,6 +315,21 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3-clang18-mobile-build:
name: linux-jammy-py3-clang18-mobile-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-mobile-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
build-generates-artifacts: false
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
secrets: inherit
linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build:
name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
uses: ./.github/workflows/_linux-build.yml

View File

@ -26,7 +26,7 @@ jobs:
architecture: x64
check-latest: false
cache: pip
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |

View File

@ -48,12 +48,12 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit

View File

@ -1,68 +0,0 @@
name: rocm-mi355
on:
workflow_dispatch:
schedule:
- cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
target-determination:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/target_determination.yml
permissions:
id-token: write
contents: read
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-noble-rocm-py3_12-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-noble-rocm-py3.12-mi355
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-noble-rocm-py3.12-mi355
docker-image-name: ci-image:pytorch-linux-noble-rocm-alpha-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
]}
secrets: inherit
linux-noble-rocm-py3_12-test:
permissions:
id-token: write
contents: read
name: linux-noble-rocm-py3.12-mi355
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-noble-rocm-py3_12-build
- target-determination
with:
build-environment: linux-noble-rocm-py3.12-mi355
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
secrets: inherit

View File

@ -94,6 +94,7 @@ jobs:
{ config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" },
{ config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" },
{ config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-15" },
]}

View File

@ -28,7 +28,7 @@ jobs:
check-latest: false
cache: pip
architecture: x64
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |

View File

@ -25,7 +25,7 @@ jobs:
architecture: x64
check-latest: false
cache: pip
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |

View File

@ -14,7 +14,6 @@ on:
- inductor-periodic
- rocm
- rocm-mi300
- rocm-mi355
- inductor-micro-benchmark
- inductor-micro-benchmark-x86
- inductor-cu124

View File

@ -1,187 +0,0 @@
name: windows-arm64-build-test
on:
push:
tags:
- ciflow/win-arm64/*
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
PYTHON_VERSION: "3.12"
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
DOWNLOADS_DIR: c:\temp\downloads
DEPENDENCIES_DIR: c:\temp\dependencies
ENABLE_APL: 1
ENABLE_OPENBLAS: 0
BUILD_TYPE: release
permissions:
id-token: write
contents: read
jobs:
build:
# Don't run on forked repos.
if: github.repository_owner == 'pytorch'
runs-on: "windows-11-arm64-preview"
timeout-minutes: 240
steps:
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_sscache
aws-region: us-east-1
role-duration-seconds: 18000
- name: Enable long paths
shell: cmd
run: |
git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
git config --system core.longpaths true
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: pytorch
submodules: recursive
- name: Bootstrap Python
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
- name: Parse ref
id: parse-ref
shell: bash
run: python pytorch/.github/scripts/parse_ref.py
- name: Get workflow job id
shell: bash
id: get-job-id
run: |
set -eux
python pytorch/.github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Bootstrap APL
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
- name: Bootstrap Rust
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- name: Bootstrap sccache
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
- name: Bootstrap Libuv
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
- name: Build
id: build
shell: cmd
env:
PYTORCH_FINAL_PACKAGE_DIR: C:/${{ github.run_id }}/build-results/
BRANCH: ${{ steps.parse-ref.outputs.branch }}
BUILD_WHEEL: 1
MAX_JOBS: 8
PYTHON_VERSION: "3.12"
SCCACHE_BUCKET: "ossci-compiler-cache"
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SCCACHE_REGION: us-east-1
VC_PRODUCT: "BuildTools"
VC_VERSION: ""
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
AWS_DEFAULT_REGION: us-east-1
USE_CUDA: '0'
USE_XPU: '0'
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
run: |
cd pytorch
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
powershell -ExecutionPolicy Bypass -File ".ci/pytorch/win-arm64-build.ps1"
- name: Upload artifacts
uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: torch-wheel-win-arm64-py3-12
retention-days: 14
if-no-files-found: error
path: C:\${{ github.run_id }}\build-results
test:
if: github.repository_owner == 'pytorch'
strategy:
fail-fast: false
runs-on: "windows-11-arm64-preview"
needs: build
steps:
- name: Enable long paths
shell: cmd
run: |
git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
git config --system core.longpaths true
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: pytorch
submodules: recursive
- name: Bootstrap Python
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
- name: Bootstrap Rust
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- name: Get workflow job id
shell: bash
id: get-job-id
run: |
set -eux
python pytorch/.github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Download Build Artifacts
uses: actions/download-artifact@v4.1.7
with:
name: torch-wheel-win-arm64-py3-12
path: C:\${{ github.run_id }}\build-results
- name: Test
id: test
shell: cmd
env:
USE_CUDA: '0'
INSTALL_WINDOWS_SDK: 1
PYTHON_VERSION: "3.12"
VC_PRODUCT: "BuildTools"
AWS_DEFAULT_REGION: us-east-1
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_WORKFLOW: ${{ github.workflow }}
GITHUB_JOB: ${{ github.job }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
PYTORCH_FINAL_PACKAGE_DIR: C:/${{ github.run_id }}/build-results/
run: |
mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
call pytorch/.ci/pytorch/windows/arm64/bootstrap_tests.bat
set GIT_BASH=C:\Program Files\Git\usr\bin\bash.exe
"%GIT_BASH%" -c "bash --noprofile --norc .ci/pytorch/win-arm64-test.sh"

View File

@ -39,16 +39,16 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'flake8==7.3.0',
'flake8-bugbear==24.12.12',
'flake8-comprehensions==3.16.0',
'flake8==6.1.0',
'flake8-bugbear==23.3.23',
'flake8-comprehensions==3.15.0',
'flake8-executable==2.1.3',
'flake8-logging-format==2024.24.12',
'flake8-pyi==25.5.0',
'flake8-simplify==0.22.0',
'flake8-logging-format==0.9.0',
'flake8-pyi==23.3.1',
'flake8-simplify==0.19.3',
'mccabe==0.7.0',
'pycodestyle==2.14.0',
'pyflakes==3.4.0',
'pycodestyle==2.11.1',
'pyflakes==3.1.0',
'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
]
@ -158,16 +158,16 @@ init_command = [
'mypy==1.16.0',
'sympy==1.13.3',
'types-requests==2.27.25',
'types-pyyaml==6.0.2',
'types-pyyaml==6.0.1',
'types-tabulate==0.8.8',
'types-protobuf==5.29.1.20250403',
'types-setuptools==79.0.0.20250422',
'types-jinja2==2.11.9',
'types-colorama==0.4.6',
'filelock==3.18.0',
'filelock==3.13.1',
'junitparser==2.1.1',
'rich==14.1.0',
'pyyaml==6.0.2',
'rich==10.9.0',
'pyyaml==6.0.1',
'optree==0.13.0',
'dataclasses-json==0.6.7',
'pandas==2.2.3',
@ -1111,7 +1111,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'pyyaml==6.0.2',
'PyYAML==6.0.1',
]
[[linter]]
@ -1133,7 +1133,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'pyyaml==6.0.2',
'PyYAML==6.0.1',
]
[[linter]]
@ -1794,12 +1794,3 @@ include_patterns = [
'torch/header_only_apis.txt',
]
is_formatter = false
[[linter]]
code = "GB_REGISTRY"
include_patterns = ["torch/_dynamo/**/*.py"]
command = [
"python3",
"tools/linter/adapters/gb_registry_linter.py",
]

View File

@ -679,7 +679,6 @@ cc_library(
[
"torch/*.h",
"torch/csrc/**/*.h",
"torch/nativert/**/*.h",
"torch/csrc/distributed/c10d/**/*.hpp",
"torch/lib/libshm/*.h",
],

View File

@ -564,7 +564,7 @@ if(MSVC)
set(CMAKE_NINJA_CMCLDEPS_RC OFF)
if(MSVC_Z7_OVERRIDE)
# CMake set debug flags to use /Z7
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>")
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded)
endif()
foreach(
flag_var
@ -872,14 +872,6 @@ cmake_dependent_option(
"USE_CUDA OR USE_ROCM;NOT MSVC"
OFF)
cmake_dependent_option(
USE_FBGEMM_GENAI
"Whether to build FBGEMM GenAI quantized GEMM kernels.\
Will be disabled if not supported by the platform"
OFF
"USE_CUDA OR USE_ROCM"
OFF)
# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
# Eff Attention won't
cmake_dependent_option(
@ -913,10 +905,6 @@ if(USE_FBGEMM)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
endif()
if(USE_FBGEMM_GENAI)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM_GENAI")
endif()
if(USE_PYTORCH_QNNPACK)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
endif()

View File

@ -51,12 +51,12 @@ nn/qat/ @jerryzh168
/torch/csrc/distributed/c10d/Ops.* @kwen2501
# ONNX Export
/torch/_dynamo/backends/onnxrt.py @titaiwangms @xadupre @justinchuby
/torch/csrc/jit/passes/onnx.h @titaiwangms @xadupre
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @xadupre
/torch/csrc/jit/passes/onnx/ @titaiwangms @xadupre
/torch/onnx/ @titaiwangms @xadupre @justinchuby
/test/onnx/ @titaiwangms @xadupre @justinchuby
/torch/_dynamo/backends/onnxrt.py @wschin
/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1
/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1
/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
/test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
# CI
/.ci @pytorch/pytorch-dev-infra

View File

@ -47,6 +47,18 @@ WORKDIR /opt/pytorch
COPY . .
RUN git submodule update --init --recursive
FROM conda as build
ARG CMAKE_VARS
WORKDIR /opt/pytorch
COPY --from=conda /opt/conda /opt/conda
COPY --from=submodule-update /opt/pytorch /opt/pytorch
RUN make triton
RUN --mount=type=cache,target=/opt/ccache \
export eval ${CMAKE_VARS} && \
TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
python -m pip install --no-build-isolation -v .
FROM conda as conda-installs
ARG PYTHON_VERSION=3.11
ARG CUDA_PATH=cu121
@ -97,5 +109,4 @@ WORKDIR /workspace
FROM official as dev
# Should override the already installed version from the official-image stage
COPY --from=conda /opt/conda /opt/conda
COPY --from=submodule-update /opt/pytorch /opt/pytorch
COPY --from=build /opt/conda /opt/conda

View File

@ -294,12 +294,14 @@ Install PyTorch
```bash
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
python -m pip install -r requirements-build.txt
python -m pip install --no-build-isolation -v -e .
```
**On macOS**
```bash
python -m pip install -r requirements-build.txt
python -m pip install --no-build-isolation -v -e .
```

View File

@ -247,50 +247,6 @@ if(USE_MEM_EFF_ATTENTION)
list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu})
endif()
IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
set(USE_FBGEMM_GENAI off)
endif()
# FBGEMM GenAI
IF(USE_FBGEMM_GENAI)
set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
if(USE_ROCM)
# Only include the kernels we want to build to avoid increasing binary size.
file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
"${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Add additional HIPCC compiler flags for performance
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-mllvm
-amdgpu-coerce-illegal-types=1
-mllvm
-enable-post-misched=0
-mllvm
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
target_include_directories(fbgemm_genai PUBLIC
# FBGEMM version of Composable Kernel is used due to some customizations
${FBGEMM_THIRD_PARTY}/composable_kernel/include
${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
${FBGEMM_GENAI_DIR}/include/
${FBGEMM_GENAI_DIR}/common/include/
)
endif()
endif()
# XNNPACK
file(GLOB native_xnnpack "native/xnnpack/*.cpp")
@ -630,10 +586,17 @@ if(USE_CUDA AND NOT USE_ROCM)
CUDA::cufft_static_nocallback
)
if(NOT BUILD_LAZY_CUDA_LINALG)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
CUDA::cusolver_static
${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static
)
if(CUDA_VERSION_MAJOR LESS_EQUAL 11)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
CUDA::cusolver_static
${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a # needed for libcusolver_static
)
elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
CUDA::cusolver_static
${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static
)
endif()
endif()
else()
list(APPEND ATen_CUDA_DEPENDENCY_LIBS

View File

@ -14,9 +14,7 @@
#include <ATen/cpu/FlushDenormal.h>
#ifdef USE_FBGEMM
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
#include <fbgemm/Fbgemm.h>
C10_DIAGNOSTIC_POP()
#endif // USE_FBGEMM
#if defined(__aarch64__) && !defined(C10_MOBILE)
#include <cpuinfo.h>
@ -334,14 +332,6 @@ void Context::setBenchmarkLimitCuDNN(int b) {
benchmark_limit_cudnn = b;
}
bool Context::immediateMiopen() const {
return immediate_miopen;
}
void Context::setImmediateMiopen(bool b) {
immediate_miopen = b;
}
bool Context::allowTF32CuBLAS() const {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
@ -512,7 +502,7 @@ at::BlasBackend Context::blasPreferredBackend() {
static const std::vector<std::string> archs = {
"gfx90a", "gfx942",
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
"gfx1100", "gfx1101", "gfx1200", "gfx1201",
#endif
#if ROCM_VERSION >= 60500
"gfx950"

View File

@ -205,8 +205,6 @@ class TORCH_API Context {
void setBenchmarkCuDNN(bool);
int benchmarkLimitCuDNN() const;
void setBenchmarkLimitCuDNN(int);
bool immediateMiopen() const;
void setImmediateMiopen(bool);
bool deterministicCuDNN() const;
void setDeterministicCuDNN(bool);
bool deterministicMkldnn() const;
@ -442,7 +440,6 @@ class TORCH_API Context {
bool enabled_overrideable = true;
bool allow_fp16_bf16_reduction_mathSDP = false;
bool benchmark_cudnn = false;
bool immediate_miopen = false;
Float32MatmulPrecision float32_matmul_precision =
c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true
? at::Float32MatmulPrecision::HIGH

View File

@ -132,9 +132,6 @@ DLDevice torchDeviceToDLDevice(at::Device device) {
case DeviceType::PrivateUse1:
ctx.device_type = DLDeviceType::kDLExtDev;
break;
case DeviceType::MPS:
ctx.device_type = DLDeviceType::kDLMetal;
break;
default:
TORCH_CHECK_BUFFER(false, "Cannot pack tensors on " + device.str());
}
@ -167,8 +164,6 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat
return at::Device(DeviceType::MAIA, index);
case DLDeviceType::kDLExtDev:
return at::Device(DeviceType::PrivateUse1, index);
case DLDeviceType::kDLMetal:
return at::Device(DeviceType::MPS, index);
default:
TORCH_CHECK_BUFFER(
false, "Unsupported device_type: ", std::to_string(type));

View File

@ -1,5 +1,6 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>
@ -72,6 +73,27 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
// original device index that was active before the change.
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);
TORCH_API inline void emptyCache() {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->emptyCache();
}
TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
}
TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
}
TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
}
} // namespace at::accelerator
namespace at {

View File

@ -9,36 +9,7 @@
namespace at {
/*
* Design:
* 1. ZeroTensors are regular tensors with TensorOptions, a storage
* pointing to nullptr and a ZeroTensor dispatch key set.
*
* 2. ZeroTensors are immutable. This is done to prevent data race in the case of multithreading
* (when two threads try to read the same zero tensor and materialize it in-place).
*
* 3. ZeroTensor has a boxed fallback that will be dispatched to any ops that don't
* have special ZeroTensor handling. This fallback materializes each ZeroTensor to
* `at::zeros({}, tensor.options()).expand(tensor.sizes())`.
* 4. ZeroTensors are handled above autograd. This is necessary because fallback
* operations are not differentiable.
* - Example: Consider add in the case it was using the fallback: zerotensor_a + b.
* zerotensor_a would be materialized to c=torch.zeros_like(zerotensor_a) after
* passing through the fallback. If this happens above the autograd, then the
* gradients would be populated on c instead of zerotensor_a.
*
* 5. The grad field is always populated with an honest to goodness tensor. This
* materialization of ZeroTensors will happen in:
* - AccumulateGrad for Backward Mode AD.
* - will never be required for ForwardMode AD.
* - This is because if all the tangents were undefined (efficient ZeroTensors),
* no computation will be performed (this is ensured via an existing pre-check).
*
* Today ZeroTensors are primarily used to represent undefined gradients in forward AD,
* it does not perfectly handle NaNs and Infs as we don't check the actual values
* and assume that they are non-zero, non-inf, non-NaN etc.
*/
// TODO: add a note explaining the design decisions
// ZeroTensors are designed to be immutable. Thus, we error out when an in-place operation is performed on ZeroTensors
static void zeroTensorFallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
const auto& arguments = op.schema().arguments();

View File

@ -1 +1,55 @@
#include <torch/headeronly/cpu/vec/intrinsics.h>
#pragma once
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC or clang-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#elif defined(__clang__) && (defined(__ARM_NEON__) || defined(__aarch64__))
/* Clang-compatible compiler, targeting arm neon */
#include <arm_neon.h>
#if defined(__ARM_FEATURE_SVE)
/* CLANG-compatible compiler, targeting ARM with SVE */
#include <arm_sve.h>
#endif
#elif defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#if _MSC_VER <= 1900
#define _mm256_extract_epi64(X, Y) \
(_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2))
#define _mm256_extract_epi32(X, Y) \
(_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4))
#define _mm256_extract_epi16(X, Y) \
(_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8))
#define _mm256_extract_epi8(X, Y) \
(_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16))
#endif
#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__))
/* GCC-compatible compiler, targeting ARM with NEON */
#include <arm_neon.h>
#if defined(__ARM_FEATURE_SVE)
/* GCC-compatible compiler, targeting ARM with SVE */
#include <arm_sve.h>
#endif
#if defined(MISSING_ARM_VLD1)
#include <ATen/cpu/vec/vec256/missing_vld1_neon.h>
#elif defined(MISSING_ARM_VST1)
#include <ATen/cpu/vec/vec256/missing_vst1_neon.h>
#endif
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif defined(__s390x__)
// targets Z/architecture
// we will include vecintrin later
#elif (defined(__GNUC__) || defined(__xlC__)) && \
(defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
/* We need to undef those tokens defined by <altivec.h> to avoid conflicts
with the C++ types. => Can still use __bool/__vector */
#undef bool
#undef vector
#undef pixel
#elif defined(__GNUC__) && defined(__SPE__)
/* GCC-compatible compiler, targeting PowerPC with SPE */
#include <spe.h>
#endif

View File

@ -5,7 +5,6 @@
#include <ATen/cpu/vec/sve/vec_common_sve.h>
#include <ATen/cpu/vec/sve/vec_float.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/bit_cast.h>
#include <cmath>
namespace at {
namespace vec {
@ -37,7 +36,7 @@ class Vectorized<BFloat16> {
return VECTOR_WIDTH / sizeof(BFloat16);
}
Vectorized();
Vectorized() {}
Vectorized(svbfloat16_t v) : values(v) {}
Vectorized(int val);
Vectorized(BFloat16 val);
@ -307,11 +306,6 @@ Vectorized<c10::BFloat16> inline operator/(
return binary_operator_via_float(std::divides<Vectorized<float>>(), a, b);
}
inline Vectorized<BFloat16>::Vectorized() {
const short zero = 0;
values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero));
}
inline Vectorized<BFloat16>::Vectorized(int val) {
auto vals_f = svdup_n_f32(val);
values = convert_float_bfloat16(vals_f, vals_f);

View File

@ -38,9 +38,7 @@ class Vectorized<double> {
static constexpr size_type size() {
return VECTOR_WIDTH / sizeof(double);
}
Vectorized() {
values = svdup_n_f64(0);
}
Vectorized() {}
Vectorized(svfloat64_t v) : values(v) {}
Vectorized(double val) {
values = svdup_n_f64(val);
@ -587,30 +585,6 @@ Vectorized<double> inline fmadd(
return svmad_f64_x(ptrue, a, b, c);
}
template <>
Vectorized<double> inline fnmadd(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return svmsb_f64_x(ptrue, a, b, c);
}
template <>
Vectorized<double> inline fmsub(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return svnmsb_f64_x(ptrue, a, b, c);
}
template <>
Vectorized<double> inline fnmsub(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return svnmad_f64_x(ptrue, a, b, c);
}
#endif // defined(CPU_CAPABILITY_SVE)
} // namespace CPU_CAPABILITY

View File

@ -38,9 +38,7 @@ class Vectorized<float> {
static constexpr size_type size() {
return VECTOR_WIDTH / sizeof(float);
}
Vectorized() {
values = svdup_n_f32(0);
}
Vectorized() {}
Vectorized(svfloat32_t v) : values(v) {}
Vectorized(float val) {
values = svdup_n_f32(val);
@ -758,30 +756,6 @@ Vectorized<float> inline fmadd(
return svmad_f32_x(ptrue, a, b, c);
}
template <>
Vectorized<float> inline fnmadd(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return svmsb_f32_x(ptrue, a, b, c);
}
template <>
Vectorized<float> inline fmsub(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return svnmsb_f32_x(ptrue, a, b, c);
}
template <>
Vectorized<float> inline fnmsub(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return svnmad_f32_x(ptrue, a, b, c);
}
#endif // defined(CPU_CAPABILITY_SVE)
} // namespace CPU_CAPABILITY

View File

@ -32,9 +32,7 @@ inline namespace CPU_CAPABILITY {
static constexpr size_type size() { \
return vl; \
} \
Vectorized() { \
values = svdup_n_s##bit(0); \
} \
Vectorized() {} \
Vectorized(svint##bit##_t v) : values(v) {} \
Vectorized(int##bit##_t val) { \
values = svdup_n_s##bit(val); \

View File

@ -552,15 +552,6 @@ Vectorized<c10::BFloat16> inline fmadd(
return a * b + c;
}
template <>
Vectorized<c10::BFloat16> inline fnmadd(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
const Vectorized<c10::BFloat16>& c) {
// See NOTE [BF16 FMA] above.
return -a * b + c;
}
template <>
Vectorized<c10::BFloat16> inline fmsub(
const Vectorized<c10::BFloat16>& a,
@ -570,15 +561,6 @@ Vectorized<c10::BFloat16> inline fmsub(
return a * b - c;
}
template <>
Vectorized<c10::BFloat16> inline fnmsub(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
const Vectorized<c10::BFloat16>& c) {
// See NOTE [BF16 FMA] above.
return -a * b - c;
}
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
} // namespace CPU_CAPABILITY

View File

@ -83,9 +83,7 @@ class Vectorized<float> {
static constexpr size_type size() {
return 4;
}
Vectorized() {
values = vmovq_n_f32(0);
}
Vectorized() {}
Vectorized(float32x4_t v) : values(v) {}
Vectorized(float val) : values{vdupq_n_f32(val)} {}
Vectorized(float val0, float val1, float val2, float val3)
@ -584,14 +582,6 @@ Vectorized<float> inline fmadd(
return Vectorized<float>(vfmaq_f32(c, a, b));
}
template <>
Vectorized<float> inline fnmadd(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return Vectorized<float>(vfmsq_f32(c, a, b));
}
template <>
Vectorized<float> inline fmsub(
const Vectorized<float>& a,
@ -600,14 +590,6 @@ Vectorized<float> inline fmsub(
return Vectorized<float>(vnegq_f32(vfmsq_f32(c, a, b)));
}
template <>
Vectorized<float> inline fnmsub(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return Vectorized<float>(vnegq_f32(vfmaq_f32(c, a, b)));
}
inline Vectorized<float> Vectorized<float>::erf() const {
// constants
const Vectorized<float> neg_zero_vec(-0.f);

View File

@ -621,18 +621,6 @@ Vectorized<c10::Half> inline fmadd(
#endif
}
template <>
Vectorized<c10::Half> inline fnmadd(
const Vectorized<c10::Half>& a,
const Vectorized<c10::Half>& b,
const Vectorized<c10::Half>& c) {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
return Vectorized<c10::Half>(vfmsq_f16(c, a, b));
#else
return -a * b + c;
#endif
}
template <>
Vectorized<c10::Half> inline fmsub(
const Vectorized<c10::Half>& a,
@ -644,18 +632,6 @@ Vectorized<c10::Half> inline fmsub(
return a * b - c;
#endif
}
template <>
Vectorized<c10::Half> inline fnmsub(
const Vectorized<c10::Half>& a,
const Vectorized<c10::Half>& b,
const Vectorized<c10::Half>& c) {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
return Vectorized<c10::Half>(vnegq_f16(vfmaq_f16(c, a, b)));
#else
return -a * b - c;
#endif
}
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
} // namespace CPU_CAPABILITY

View File

@ -1 +1,396 @@
#include <torch/headeronly/cpu/vec/vec256/missing_vld1_neon.h>
/* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7. */
__extension__ extern __inline uint8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u8_x2(const uint8_t* __a) {
uint8x8x2_t ret;
asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s8_x2(const int8_t* __a) {
int8x8x2_t ret;
asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u16_x2(const uint16_t* __a) {
uint16x4x2_t ret;
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s16_x2(const int16_t* __a) {
int16x4x2_t ret;
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u32_x2(const uint32_t* __a) {
uint32x2x2_t ret;
asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s32_x2(const int32_t* __a) {
int32x2x2_t ret;
asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u64_x2(const uint64_t* __a) {
uint64x1x2_t ret;
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s64_x2(const int64_t* __a) {
int64x1x2_t ret;
__builtin_aarch64_simd_oi __o;
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f16_x2(const float16_t* __a) {
float16x4x2_t ret;
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f32_x2(const float32_t* __a) {
float32x2x2_t ret;
asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f64_x2(const float64_t* __a) {
float64x1x2_t ret;
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p8_x2(const poly8_t* __a) {
poly8x8x2_t ret;
asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p16_x2(const poly16_t* __a) {
poly16x4x2_t ret;
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p64_x2(const poly64_t* __a) {
poly64x1x2_t ret;
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u8_x2(const uint8_t* __a) {
uint8x16x2_t ret;
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s8_x2(const int8_t* __a) {
int8x16x2_t ret;
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u16_x2(const uint16_t* __a) {
uint16x8x2_t ret;
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s16_x2(const int16_t* __a) {
int16x8x2_t ret;
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u32_x2(const uint32_t* __a) {
uint32x4x2_t ret;
asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s32_x2(const int32_t* __a) {
int32x4x2_t ret;
asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u64_x2(const uint64_t* __a) {
uint64x2x2_t ret;
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s64_x2(const int64_t* __a) {
int64x2x2_t ret;
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f16_x2(const float16_t* __a) {
float16x8x2_t ret;
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f32_x2(const float32_t* __a) {
float32x4x2_t ret;
asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f64_x2(const float64_t* __a) {
float64x2x2_t ret;
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p8_x2(const poly8_t* __a) {
poly8x16x2_t ret;
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p16_x2(const poly16_t* __a) {
poly16x8x2_t ret;
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p64_x2(const poly64_t* __a) {
poly64x2x2_t ret;
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
/* vst1x2 */
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s64_x2(int64_t* __a, int64x1x2_t val) {
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u64_x2(uint64_t* __a, uint64x1x2_t val) {
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f64_x2(float64_t* __a, float64x1x2_t val) {
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s8_x2(int8_t* __a, int8x8x2_t val) {
asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p8_x2(poly8_t* __a, poly8x8x2_t val) {
asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s16_x2(int16_t* __a, int16x4x2_t val) {
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p16_x2(poly16_t* __a, poly16x4x2_t val) {
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s32_x2(int32_t* __a, int32x2x2_t val) {
asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u8_x2(uint8_t* __a, uint8x8x2_t val) {
asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u16_x2(uint16_t* __a, uint16x4x2_t val) {
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u32_x2(uint32_t* __a, uint32x2x2_t val) {
asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f16_x2(float16_t* __a, float16x4x2_t val) {
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f32_x2(float32_t* __a, float32x2x2_t val) {
asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p64_x2(poly64_t* __a, poly64x1x2_t val) {
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s8_x2(int8_t* __a, int8x16x2_t val) {
asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p8_x2(poly8_t* __a, poly8x16x2_t val) {
asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s16_x2(int16_t* __a, int16x8x2_t val) {
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p16_x2(poly16_t* __a, poly16x8x2_t val) {
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s32_x2(int32_t* __a, int32x4x2_t val) {
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s64_x2(int64_t* __a, int64x2x2_t val) {
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u8_x2(uint8_t* __a, uint8x16x2_t val) {
asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u16_x2(uint16_t* __a, uint16x8x2_t val) {
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u32_x2(uint32_t* __a, uint32x4x2_t val) {
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u64_x2(uint64_t* __a, uint64x2x2_t val) {
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f16_x2(float16_t* __a, float16x8x2_t val) {
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t* __a, float32x4x2_t val) {
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f64_x2(float64_t* __a, float64x2x2_t val) {
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p64_x2(poly64_t* __a, poly64x2x2_t val) {
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}

View File

@ -1 +1,7 @@
#include <torch/headeronly/cpu/vec/vec256/missing_vst1_neon.h>
/* Workaround for missing vst1q_f32_x2 in gcc-8. */
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t* __a, float32x4x2_t val) {
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}

View File

@ -34,9 +34,7 @@ class Vectorized<c10::complex<double>> {
static constexpr size_type size() {
return 2;
}
Vectorized() {
values = _mm256_setzero_pd();
}
Vectorized() {}
Vectorized(__m256d v) : values(v) {}
Vectorized(c10::complex<double> val) {
double real_value = val.real();

View File

@ -33,9 +33,7 @@ class Vectorized<c10::complex<float>> {
static constexpr size_type size() {
return 4;
}
Vectorized() {
values = _mm256_setzero_ps();
}
Vectorized() {}
Vectorized(__m256 v) : values(v) {}
Vectorized(c10::complex<float> val) {
float real_value = val.real();

View File

@ -31,9 +31,7 @@ class Vectorized<double> {
static constexpr size_type size() {
return 4;
}
Vectorized() {
values = _mm256_setzero_pd();
}
Vectorized() {}
Vectorized(__m256d v) : values(v) {}
Vectorized(double val) {
values = _mm256_set1_pd(val);
@ -495,14 +493,6 @@ Vectorized<double> inline fmadd(
return _mm256_fmadd_pd(a, b, c);
}
template <>
Vectorized<double> inline fnmadd(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return _mm256_fnmadd_pd(a, b, c);
}
template <>
Vectorized<double> inline fmsub(
const Vectorized<double>& a,
@ -510,14 +500,6 @@ Vectorized<double> inline fmsub(
const Vectorized<double>& c) {
return _mm256_fmsub_pd(a, b, c);
}
template <>
Vectorized<double> inline fnmsub(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return _mm256_fnmsub_pd(a, b, c);
}
#endif
#endif

View File

@ -30,9 +30,7 @@ class Vectorized<float> {
static constexpr size_type size() {
return 8;
}
Vectorized() {
values = _mm256_setzero_ps();
}
Vectorized() {}
Vectorized(__m256 v) : values(v) {}
Vectorized(float val) {
values = _mm256_set1_ps(val);
@ -696,14 +694,6 @@ Vectorized<float> inline fmadd(
return _mm256_fmadd_ps(a, b, c);
}
template <>
Vectorized<float> inline fnmadd(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return _mm256_fnmadd_ps(a, b, c);
}
template <>
Vectorized<float> inline fmsub(
const Vectorized<float>& a,
@ -712,14 +702,6 @@ Vectorized<float> inline fmsub(
return _mm256_fmsub_ps(a, b, c);
}
template <>
Vectorized<float> inline fnmsub(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return _mm256_fnmsub_ps(a, b, c);
}
// TODO: rewrite with ATEN vectorized (need to add unpack and shuffle)
// Used by Inductor CPP codegen for micro gemm
inline void transpose_block(at::vec::VectorizedN<float, 8>& input) {

View File

@ -23,9 +23,7 @@ struct Vectorizedi {
}
public:
Vectorizedi() {
values = _mm256_setzero_si256();
}
Vectorizedi() {}
Vectorizedi(__m256i v) : values(v) {}
operator __m256i() const {
return values;
@ -55,9 +53,7 @@ class Vectorized<int64_t> : public Vectorizedi {
return 4;
}
using Vectorizedi::Vectorizedi;
Vectorized() {
values = _mm256_setzero_si256();
}
Vectorized() {}
Vectorized(int64_t v) {
values = _mm256_set1_epi64x(v);
}

View File

@ -54,9 +54,7 @@ struct Vectorizedqi {
#endif
public:
Vectorizedqi() {
vals = _mm256_setzero_si256();
}
Vectorizedqi() {}
Vectorizedqi(__m256i v) : vals(v) {}
operator __m256i() const {
return vals;

View File

@ -192,9 +192,7 @@ class Vectorized16 {
static constexpr size_type size() {
return 32;
}
Vectorized16() {
values = _mm512_setzero_si512();
}
Vectorized16() {}
Vectorized16(__m512i v) : values(v) {}
Vectorized16(T val) {
value_type uw = val.x;

View File

@ -34,9 +34,7 @@ class Vectorized<c10::complex<double>> {
static constexpr size_type size() {
return 4;
}
Vectorized() {
values = _mm512_setzero_pd();
}
Vectorized() {}
Vectorized(__m512d v) : values(v) {}
Vectorized(c10::complex<double> val) {
double real_value = val.real();

View File

@ -34,9 +34,7 @@ class Vectorized<c10::complex<float>> {
static constexpr size_type size() {
return 8;
}
Vectorized() {
values = _mm512_setzero_ps();
}
Vectorized() {}
Vectorized(__m512 v) : values(v) {}
Vectorized(c10::complex<float> val) {
float real_value = val.real();

View File

@ -34,9 +34,7 @@ class Vectorized<double> {
static constexpr size_type size() {
return 8;
}
Vectorized() {
values = _mm512_setzero_pd();
}
Vectorized() {}
Vectorized(__m512d v) : values(v) {}
Vectorized(double val) {
values = _mm512_set1_pd(val);
@ -536,14 +534,6 @@ Vectorized<double> inline fmadd(
return _mm512_fmadd_pd(a, b, c);
}
template <>
Vectorized<double> inline fnmadd(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return _mm512_fnmadd_pd(a, b, c);
}
template <>
Vectorized<double> inline fmsub(
const Vectorized<double>& a,
@ -552,14 +542,6 @@ Vectorized<double> inline fmsub(
return _mm512_fmsub_pd(a, b, c);
}
template <>
Vectorized<double> inline fnmsub(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return _mm512_fnmsub_pd(a, b, c);
}
#endif
} // namespace CPU_CAPABILITY

View File

@ -32,9 +32,7 @@ class Vectorized<float> {
static constexpr size_type size() {
return 16;
}
Vectorized() {
values = _mm512_setzero_ps();
}
Vectorized() {}
Vectorized(__m512 v) : values(v) {}
Vectorized(float val) {
values = _mm512_set1_ps(val);
@ -749,14 +747,6 @@ Vectorized<float> inline fmadd(
return _mm512_fmadd_ps(a, b, c);
}
template <>
Vectorized<float> inline fnmadd(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return _mm512_fnmadd_ps(a, b, c);
}
template <>
Vectorized<float> inline fmsub(
const Vectorized<float>& a,
@ -765,14 +755,6 @@ Vectorized<float> inline fmsub(
return _mm512_fmsub_ps(a, b, c);
}
template <>
Vectorized<float> inline fnmsub(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return _mm512_fnmsub_ps(a, b, c);
}
// TODO: rewrite with ATEN vectorized (need to add unpack and shuffle)
// Used by Inductor CPP codegen for micro gemm
// Code referred to FBGEMM:

View File

@ -53,9 +53,7 @@ class Vectorized<int64_t> : public Vectorizedi {
return 8;
}
using Vectorizedi::Vectorizedi;
Vectorized() {
values = _mm512_setzero_si512();
}
Vectorized() {}
Vectorized(int64_t v) {
values = _mm512_set1_epi64(v);
}

View File

@ -55,9 +55,7 @@ struct Vectorizedqi {
#endif
public:
Vectorizedqi() {
vals = _mm512_setzero_si512();
}
Vectorizedqi() {}
Vectorizedqi(__m512i v) : vals(v) {}
operator __m512i() const {
return vals;

Some files were not shown because too many files have changed in this diff Show More