Compare commits

1 commit

e44ca7305f vllm setup
Signed-off-by: Yang Wang <elainewy@meta.com>
2025-07-21 17:40:32 -07:00

797 changed files with 51608 additions and 25091 deletions

@@ -36,105 +36,3 @@ See `build.sh` for valid build environments (it's the giant switch).
# Set flags (see build.sh) and build image
sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest'
```
## [Guidance] Adding a New Base Docker Image
### Background
The base Docker images in the `.ci/docker/` directory are built by the `docker-builds.yml` workflow and are used throughout the PyTorch CI/CD pipeline. Create or modify a base Docker image only when you need specific environment changes or dependencies in place before PyTorch is built on CI.
1. **Automatic Rebuilding**:
- The Docker image building process is triggered automatically when changes are made to files in the `.ci/docker/*` directory
- This ensures all images stay up-to-date with the latest dependencies and configurations
2. **Image Reuse in PyTorch Build Workflows** (example: linux-build):
- The images generated by `docker-builds.yml` are reused in `_linux-build.yml` through the `calculate-docker-image` step
- The `_linux-build.yml` workflow:
- Pulls the Docker image determined by the `calculate-docker-image` step
- Runs a Docker container with that image
- Executes `.ci/pytorch/build.sh` inside the container to build PyTorch
3. **Usage in Test Workflows** (example: linux-test):
- The same Docker images are also used in `_linux-test.yml` for running tests
- The `_linux-test.yml` workflow follows a similar pattern:
- It uses the `calculate-docker-image` step to determine which Docker image to use
- It pulls the Docker image and runs a container with that image
- It installs the wheels from the artifacts generated by PyTorch build jobs
- It executes test scripts (such as `.ci/pytorch/test.sh` or `.ci/pytorch/multigpu-test.sh`) inside the container; the pull-and-run pattern shared by these workflows is sketched below
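For orientation, the pull-and-run pattern shared by `_linux-build.yml` and `_linux-test.yml` reduces to roughly the following sketch. The image tag is illustrative (borrowed from elsewhere in this diff); the real workflows resolve it via the `calculate-docker-image` step and pass many more mounts and environment variables:
```bash
# Hypothetical tag; in CI it comes from the calculate-docker-image step.
DOCKER_IMAGE=ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11

docker pull "${DOCKER_IMAGE}"

# Start a detached container and run the PyTorch build script inside it.
container_id=$(docker run --detach --tty -w /var/lib/jenkins/workspace "${DOCKER_IMAGE}")
docker exec -t "${container_id}" bash -c '.ci/pytorch/build.sh'
```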
### Understanding File Purposes
#### `.ci/docker/build.sh` vs `.ci/pytorch/build.sh`
- **`.ci/docker/build.sh`**:
- Used for building base Docker images
- Executed by the `docker-builds.yml` workflow to pre-build Docker images for CI
- Contains configurations for different Docker build environments
- **`.ci/pytorch/build.sh`**:
- Used for building PyTorch inside a Docker container
- Called by workflows like `_linux-build.yml` after the Docker container is started
- Builds PyTorch wheels and other artifacts
#### `.ci/docker/ci_commit_pins/` vs `.github/ci_commit_pins`
- **`.ci/docker/ci_commit_pins/`**:
- Used for pinning dependency versions during base Docker image building
- Ensures consistent environments for building PyTorch
- Changes here trigger base Docker image rebuilds
- **`.github/ci_commit_pins`**:
- Used for pinning dependency versions during PyTorch building and tests
- Ensures consistent dependencies for PyTorch across different builds
- Used by build scripts running inside Docker containers (see the example below)
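As a concrete illustration, install scripts consume these pins through the `get_pinned_commit` helper (visible in the `install_timm` hunk later in this diff); a minimal sketch, assuming a pin file named `timm.txt` exists:
```bash
# Resolve the SHA stored in the timm pin file and install at that commit.
commit=$(get_pinned_commit timm)
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
```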
### Step-by-Step Guide for Adding a New Base Docker Image
#### 1. Add Pinned Commits (If Applicable)
We use pinned commits for build stability. The `nightly.yml` workflow checks and updates pinned commits for certain repository dependencies daily.
If your new Docker image needs a library installed from a specific pinned commit or built from source:
1. Add the repository you want to track in `nightly.yml` and `merge-rules.yml`
2. Add the initial pinned commit in `.ci/docker/ci_commit_pins/`. The text file's name should match the repository name defined in step 1 (see the sketch below)
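For example, tracking a hypothetical `mylib` dependency would look roughly like this (the file name and SHA are placeholders):
```bash
# One pin file per tracked repository, containing a single commit SHA.
echo "0123456789abcdef0123456789abcdef01234567" > .ci/docker/ci_commit_pins/mylib.txt
```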
#### 2. Configure the Base Docker Image
1. **Add new Base Docker image configuration** (if applicable):
Add the configuration in `.ci/docker/build.sh`. For example:
```bash
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
NEW_ARG_1=yes
;;
```
2. **Add build arguments to Docker build command**:
If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:
```bash
docker build \
....
--build-arg "NEW_ARG_1=${NEW_ARG_1}"
```
3. **Update Dockerfile logic**:
Update the Dockerfile to use the new argument. For example, in `ubuntu/Dockerfile`:
```dockerfile
ARG NEW_ARG_1
# Set up environment for NEW_ARG_1
RUN if [ -n "${NEW_ARG_1}" ]; then bash ./do_something.sh; fi
```
4. **Add the Docker configuration** in `.github/workflows/docker-builds.yml`:
The `docker-builds.yml` workflow pre-builds the Docker images whenever changes occur in the `.ci/docker/` directory, including pinned-commit updates. A sketch of the matrix entry is shown below.
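Based on the `docker-builds.yml` hunk near the end of this diff, a new image is registered by adding its tag to the workflow's `docker-image-name` matrix. A sketch, with the surrounding keys abbreviated and the tag taken from the example in step 1:
```yaml
# .github/workflows/docker-builds.yml (abbreviated)
jobs:
  docker-build:
    strategy:
      matrix:
        docker-image-name: [
          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
          pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1,  # the new image tag
        ]
```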

@@ -78,331 +78,45 @@ elif [[ "$image" == *linter* ]]; then
DOCKERFILE="linter/Dockerfile"
fi
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
if [[ "$image" == *rocm* ]]; then
_UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
fi
PY_HARDCODED_CONFIG_SCRIPT=$(python3 get_config.py --image "$image")
tag=$(echo $image | awk -F':' '{print $2}')
# It's annoying to rename jobs every time you want to rewrite a
# configuration, so we hardcode everything here rather than do it
# from scratch
case "$tag" in
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11)
CUDA_VERSION=12.4
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
CUDA_VERSION=12.6.3
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-py3-clang12-onnx)
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
VISION=yes
ONNX=yes
;;
pytorch-linux-jammy-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.11-clang12)
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=12
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.9-gcc9)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=9
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
if [[ $tag =~ "jammy" ]]; then
ANACONDA_PYTHON_VERSION=3.10
else
ANACONDA_PYTHON_VERSION=3.12
fi
GCC_VERSION=11
VISION=yes
ROCM_VERSION=6.4
if [[ $? -eq 0 ]]; then
eval "$PY_HARDCODED_CONFIG_SCRIPT"
else
echo "[Fallback] Python script failed or no match — fallback to hardcoded shell case"
# Catch-all for builds that are not hardcoded.
VISION=yes
echo "image '$image' did not match an existing build configuration"
if [[ "$image" == *py* ]]; then
extract_version_from_image_name py ANACONDA_PYTHON_VERSION
fi
if [[ "$image" == *cuda* ]]; then
extract_version_from_image_name cuda CUDA_VERSION
extract_version_from_image_name cudnn CUDNN_VERSION
fi
if [[ "$image" == *rocm* ]]; then
extract_version_from_image_name rocm ROCM_VERSION
NINJA_VERSION=1.9.0
TRITON=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-noble-rocm-alpha-py3)
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
VISION=yes
ROCM_VERSION=7.0
NINJA_VERSION=1.9.0
TRITON=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
;;
pytorch-linux-jammy-xpu-2025.0-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.0
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-2025.1-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
XPU_VERSION=2025.1
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
KATEX=yes
TRITON=yes
DOCS=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
CUDNN_VERSION=9
CLANG_VERSION=12
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3-clang18-asan)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=18
VISION=yes
;;
pytorch-linux-jammy-py3.9-gcc11)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
KATEX=yes
TRITON=yes
DOCS=yes
UNINSTALL_DILL=yes
;;
pytorch-linux-jammy-py3-clang12-executorch)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=12
EXECUTORCH=yes
;;
pytorch-linux-jammy-py3.12-halide)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
HALIDE=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.12-triton-cpu)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
TRITON_CPU=yes
;;
pytorch-linux-jammy-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
# would be to upgrade mypy to 1.0.0 with Python 3.11
PYTHON_VERSION=3.9
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
INDUCTOR_BENCHMARKS=yes
;;
*)
# Catch-all for builds that are not hardcoded.
VISION=yes
echo "image '$image' did not match an existing build configuration"
if [[ "$image" == *py* ]]; then
extract_version_from_image_name py ANACONDA_PYTHON_VERSION
# To ensure that any ROCm config will build using conda cmake
# and thus have LAPACK/MKL enabled
fi
if [[ "$image" == *cuda* ]]; then
extract_version_from_image_name cuda CUDA_VERSION
extract_version_from_image_name cudnn CUDNN_VERSION
fi
if [[ "$image" == *rocm* ]]; then
extract_version_from_image_name rocm ROCM_VERSION
NINJA_VERSION=1.9.0
TRITON=yes
# To ensure that any ROCm config will build using conda cmake
# and thus have LAPACK/MKL enabled
fi
if [[ "$image" == *centos7* ]]; then
NINJA_VERSION=1.10.2
fi
if [[ "$image" == *gcc* ]]; then
extract_version_from_image_name gcc GCC_VERSION
fi
if [[ "$image" == *clang* ]]; then
extract_version_from_image_name clang CLANG_VERSION
fi
if [[ "$image" == *devtoolset* ]]; then
extract_version_from_image_name devtoolset DEVTOOLSET_VERSION
fi
if [[ "$image" == *glibc* ]]; then
extract_version_from_image_name glibc GLIBC_VERSION
fi
;;
if [[ "$image" == *centos7* ]]; then
NINJA_VERSION=1.10.2
fi
if [[ "$image" == *gcc* ]]; then
extract_version_from_image_name gcc GCC_VERSION
fi
if [[ "$image" == *clang* ]]; then
extract_version_from_image_name clang CLANG_VERSION
fi
if [[ "$image" == *devtoolset* ]]; then
extract_version_from_image_name devtoolset DEVTOOLSET_VERSION
fi
if [[ "$image" == *glibc* ]]; then
extract_version_from_image_name glibc GLIBC_VERSION
fi
;;
esac
tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')

@@ -15,11 +15,35 @@ function install_timm() {
commit=$(get_pinned_commit timm)
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
# Clean up
conda_run pip uninstall -y torch torchvision triton
}
function install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
python install.py --continue_on_fail
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
# Pango is needed for weasyprint which is needed for doctr
conda_install pango
# Stable packages are ok here, just to satisfy TorchBench check
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
install_torchbench
install_huggingface
install_timm
# Clean up
conda_run pip uninstall -y torch torchvision torchaudio triton

@@ -30,7 +30,7 @@ EOF
# we want the patch version of 6.4 instead
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
ROCM_VERSION="${ROCM_VERSION}.2"
ROCM_VERSION="${ROCM_VERSION}.1"
fi
# Default url values
@@ -85,19 +85,16 @@ EOF
# CI no longer builds for ROCm 6.3, but
# ROCm 6.4 did not yet fix the regression, also HIP branch names are different
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.4) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.2) ]]; then
HIP_TAG=rocm-6.4.2
CLR_HASH=74d78ba3ac4bac235d02bcb48511c30b5cfdd457 # branch release/rocm-rel-6.4.2-statco-hotfix
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
HIP_TAG=rocm-6.4.1
CLR_HASH=efe6c35790b9206923bfeed1209902feff37f386 # branch release/rocm-rel-6.4.1-statco-hotfix
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
HIP_BRANCH=release/rocm-rel-6.4
CLR_HASH=606bc820b4b1f315d135da02a1f0b176ca50a92c # branch release/rocm-rel-6.4.1-statco-hotfix
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
HIP_TAG=rocm-6.4.0
HIP_BRANCH=release/rocm-rel-6.4
CLR_HASH=600f5b0d2baed94d5121e2174a9de0851b040b0c # branch release/rocm-rel-6.4-statco-hotfix
fi
# clr build needs CppHeaderParser but can only find it using conda's python
python -m pip install CppHeaderParser
git clone https://github.com/ROCm/HIP -b $HIP_TAG
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
HIP_COMMON_DIR=$(readlink -f HIP)
git clone https://github.com/jeffdaily/clr
pushd clr

.ci/docker/get_config.py (new file, 350 lines)

@@ -0,0 +1,350 @@
import argparse
import sys
from enum import Enum
import shlex
class HardwareType(Enum):
DEFAULT = "default"
ROCM = "rocm"
@staticmethod
def from_image_name(image_name: str) -> "HardwareType":
if "rocm" in image_name:
return HardwareType.ROCM
return HardwareType.DEFAULT
class HardcodedBaseConfig:
_UCX_UCC_CONFIGS: dict[HardwareType, dict[str, str]] = {
HardwareType.DEFAULT: {
"UCX_COMMIT": "7bb2722ff2187a0cad557ae4a6afa090569f83fb",
"UCC_COMMIT": "20eae37090a4ce1b32bcce6144ccad0b49943e0b",
},
HardwareType.ROCM: {
"UCX_COMMIT": "cc312eaa4655c0cc5c2bcd796db938f90563bcf6",
"UCC_COMMIT": "0c0fc21559835044ab107199e334f7157d6a0d3d",
},
}
def __init__(self, hardwareType: HardwareType) -> None:
commits = self.get_ucx_ucc_commits(hardwareType)
self.ucx_commit = commits["UCX_COMMIT"]
self.ucc_commit = commits["UCC_COMMIT"]
def _get_tag(self, image: str):
if ":" not in image:
print(f"echo 'Invalid image format (missing :): {image}'", file=sys.stderr)
return
tag = image.split(":")[1]
return tag
def get_all_configs(self):
_TAG_CONFIGS = {
"pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11": {
"CUDA_VERSION": "12.4",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.13",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9": {
"CUDA_VERSION": "12.6.3",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
},
"pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.6",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.6",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.6",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.13",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
},
"pytorch-linux-jammy-py3-clang12-onnx": {
"ANACONDA_PYTHON_VERSION": "3.9",
"CLANG_VERSION": "12",
"VISION": "yes",
"ONNX": "yes",
},
"pytorch-linux-jammy-py3.9-clang12": {
"ANACONDA_PYTHON_VERSION": "3.9",
"CLANG_VERSION": "12",
"VISION": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3.11-clang12": {
"ANACONDA_PYTHON_VERSION": "3.11",
"CLANG_VERSION": "12",
"VISION": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3.9-gcc9": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "9",
"VISION": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-rocm-n-py3": {
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"VISION": "yes",
"ROCM_VERSION": "6.4",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-noble-rocm-n-py3": {
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "11",
"VISION": "yes",
"ROCM_VERSION": "6.4",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-noble-rocm-alpha-py3": {
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "11",
"VISION": "yes",
"ROCM_VERSION": "7.0",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"INDUCTOR_BENCHMARKS": "yes",
"PYTORCH_ROCM_ARCH": "gfx90a;gfx942;gfx950",
},
"pytorch-linux-jammy-xpu-2025.0-py3": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "11",
"VISION": "yes",
"XPU_VERSION": "2025.0",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
},
"pytorch-linux-jammy-xpu-2025.1-py3": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "11",
"VISION": "yes",
"XPU_VERSION": "2025.1",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "11",
"VISION": "yes",
"KATEX": "yes",
"TRITON": "yes",
"DOCS": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12": {
"ANACONDA_PYTHON_VERSION": "3.9",
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"CLANG_VERSION": "12",
"VISION": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3-clang18-asan": {
"ANACONDA_PYTHON_VERSION": "3.10",
"CLANG_VERSION": "18",
"VISION": "yes",
},
"pytorch-linux-jammy-py3.9-gcc11": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "11",
"VISION": "yes",
"KATEX": "yes",
"TRITON": "yes",
"DOCS": "yes",
"UNINSTALL_DILL": "yes",
},
"pytorch-linux-jammy-py3-clang12-executorch": {
"ANACONDA_PYTHON_VERSION": "3.10",
"CLANG_VERSION": "12",
"EXECUTORCH": "yes",
},
"pytorch-linux-jammy-py3.12-halide": {
"CUDA_VERSION": "12.6",
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "11",
"HALIDE": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3.12-triton-cpu": {
"CUDA_VERSION": "12.6",
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "11",
"TRITON_CPU": "yes",
},
"pytorch-linux-jammy-linter": {
"PYTHON_VERSION": "3.9",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter": {
"PYTHON_VERSION": "3.9",
"CUDA_VERSION": "12.8.1",
},
"pytorch-linux-jammy-aarch64-py3.10-gcc11": {
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"ACL": "yes",
"VISION": "yes",
"CONDA_CMAKE": "yes",
"OPENBLAS": "yes",
"SKIP_LLVM_SRC_BUILD_INSTALL": "yes",
},
"pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks": {
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"ACL": "yes",
"VISION": "yes",
"CONDA_CMAKE": "yes",
"OPENBLAS": "yes",
"SKIP_LLVM_SRC_BUILD_INSTALL": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
}
return _TAG_CONFIGS
def get_config(self, image_name:str) -> dict:
tag = self._get_tag(image_name)
config_dict = self.get_all_configs()
if tag not in config_dict:
raise ValueError(f"Unknown tag: {tag}")
return config_dict[tag]
def get_ucx_ucc_commits(self, hw_type: HardwareType) -> dict[str, str]:
if hw_type not in self._UCX_UCC_CONFIGS:
raise ValueError(f"Unsupported hardware type: {hw_type}")
return self._UCX_UCC_CONFIGS[hw_type]
def main():
parser = argparse.ArgumentParser(
description="Return the config for a given image tag."
)
parser.add_argument(
"--image", required=True, help="Full image string (e.g., repo/name:tag)"
)
args = parser.parse_args()
try:
image_name = args.image
hw_type = HardwareType.from_image_name(image_name)
config_runner = HardcodedBaseConfig(hw_type)
config = config_runner.get_config(args.image)
for key, val in config.items():
print(f'export {key}={shlex.quote(val)}')
except Exception as e:
# Any error will signal fallback
print(f"# Fallback due to error: {e}", file=sys.stderr)
sys.exit(42)
if __name__ == "__main__":
main()
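For context, the `build.sh` hunk earlier in this diff wires this script in as the first lookup and keeps the shell `case` statement only as a fallback. Reconstructed from the added lines above, the consuming side looks roughly like:
```bash
# get_config.py prints `export KEY=VALUE` lines for a known tag,
# or exits non-zero (42) so the caller falls back to the legacy case statement.
PY_HARDCODED_CONFIG_SCRIPT=$(python3 get_config.py --image "$image")
if [[ $? -eq 0 ]]; then
  eval "$PY_HARDCODED_CONFIG_SCRIPT"
else
  echo "[Fallback] Python script failed or no match — fallback to hardcoded shell case"
  # ... legacy hardcoded case statement ...
fi
```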

@@ -41,7 +41,7 @@ case ${DOCKER_TAG_PREFIX} in
rocm*)
# we want the patch version of 6.4 instead
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
fi
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete

@@ -77,7 +77,7 @@ case ${image} in
manylinux2_28-builder:rocm*)
# we want the patch version of 6.4 instead
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
fi
TARGET=rocm_final
MANY_LINUX_VERSION="2_28"

@@ -50,7 +50,7 @@ flatbuffers==24.12.23
hypothesis==5.35.1
# Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
#Description: advanced library for generating parametrized tests
#Pinned versions: 5.35.1
#Pinned versions: 3.44.6, 4.53.2
#test that import: test_xnnpack_integration.py, test_pruning_op.py, test_nn.py
junitparser==2.1.1
@@ -221,9 +221,9 @@ pygments==2.15.0
#Pinned versions: 2.12.0
#test that import: the doctests
#pyyaml
#PyYAML
#Description: data serialization format
#Pinned versions: 6.0.2
#Pinned versions:
#test that import:
#requests
@@ -233,7 +233,7 @@ pygments==2.15.0
#rich
#Description: rich text and beautiful formatting in the terminal
#Pinned versions: 14.1.0
#Pinned versions: 10.9.0
#test that import:
scikit-image==0.19.3 ; python_version < "3.10"
@@ -307,7 +307,7 @@ pytest-cpp==2.3.0
#Pinned versions: 2.3.0
#test that import:
z3-solver==4.15.1.0
z3-solver==4.12.6.0
#Description: The Z3 Theorem Prover Project
#Pinned versions:
#test that import:
@@ -361,7 +361,6 @@ pwlf==2.2.1
#Pinned versions: 2.2.1
#test that import: test_sac_estimator.py
# To build PyTorch itself
pyyaml
pyzstd

@@ -98,8 +98,9 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
# (optional) Install non-default Ninja version
ARG NINJA_VERSION

@@ -98,8 +98,9 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
ARG TRITON
ARG TRITON_CPU

.ci/pytorch/build-mobile.sh (new executable file, 34 lines)

@@ -0,0 +1,34 @@
#!/usr/bin/env bash
# DO NOT ADD 'set -x' not to reveal CircleCI secret context environment variables
set -eu -o pipefail
# This script uses linux host toolchain + mobile build options in order to
# build & test mobile libtorch without having to setup Android/iOS
# toolchain/simulator.
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
# Install torch & torchvision - used to download & trace test model.
# Ideally we should use the libtorch built on the PR so that backward
# incompatible changes won't break this script - but it will significantly slow
# down mobile CI jobs.
# Here we install nightly instead of stable so that we have an option to
# temporarily skip mobile CI jobs on BC-breaking PRs until they are in nightly.
retry pip install --pre torch torchvision \
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html \
--progress-bar off
# Run end-to-end process of building mobile library, linking into the predictor
# binary, and running forward pass with a real model.
if [[ "$BUILD_ENVIRONMENT" == *-mobile-custom-build-static* ]]; then
TEST_CUSTOM_BUILD_STATIC=1 test/mobile/custom_build/build.sh
elif [[ "$BUILD_ENVIRONMENT" == *-mobile-lightweight-dispatch* ]]; then
test/mobile/lightweight_dispatch/build.sh
else
TEST_DEFAULT_BUILD=1 test/mobile/custom_build/build.sh
fi
print_sccache_stats

@@ -11,6 +11,10 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
if [[ "$BUILD_ENVIRONMENT" == *-mobile-*build* ]]; then
exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile.sh" "$@"
fi
echo "Python version:"
python --version
@@ -50,6 +54,9 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi
# Enable LLVM dependency for TensorExpr testing
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
if ! which conda; then
# In ROCm CIs, we are doing cross compilation on build machines with
@@ -117,8 +124,26 @@ if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then
fi
# Use special scripts for Android builds
if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
export ANDROID_NDK=/opt/ndk
build_args=()
if [[ "${BUILD_ENVIRONMENT}" == *-arm-v7a* ]]; then
build_args+=("-DANDROID_ABI=armeabi-v7a")
elif [[ "${BUILD_ENVIRONMENT}" == *-arm-v8a* ]]; then
build_args+=("-DANDROID_ABI=arm64-v8a")
elif [[ "${BUILD_ENVIRONMENT}" == *-x86_32* ]]; then
build_args+=("-DANDROID_ABI=x86")
elif [[ "${BUILD_ENVIRONMENT}" == *-x86_64* ]]; then
build_args+=("-DANDROID_ABI=x86_64")
fi
if [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
build_args+=("-DUSE_VULKAN=ON")
fi
build_args+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")
exec ./scripts/build_android.sh "${build_args[@]}" "$@"
fi
if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *android* && "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
export USE_VULKAN=1
# shellcheck disable=SC1091
source /var/lib/jenkins/vulkansdk/setup-env.sh
@@ -189,6 +214,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
export USE_ASAN=1
export REL_WITH_DEB_INFO=1
export UBSAN_FLAGS="-fno-sanitize-recover=all"
unset USE_LLVM
fi
if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then
@@ -199,7 +225,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then
export USE_PRECOMPILED_HEADERS=1
fi
if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
fi

@@ -204,32 +204,8 @@ function install_torchrec_and_fbgemm() {
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_uninstall fbgemm-gpu-nightly
# Set ROCM_HOME isn't available, use ROCM_PATH if set or /opt/rocm
ROCM_HOME="${ROCM_HOME:-${ROCM_PATH:-/opt/rocm}}"
# Find rocm_version.h header file for ROCm version extract
rocm_version_h="${ROCM_HOME}/include/rocm-core/rocm_version.h"
if [ ! -f "$rocm_version_h" ]; then
rocm_version_h="${ROCM_HOME}/include/rocm_version.h"
fi
# Error out if rocm_version.h not found
if [ ! -f "$rocm_version_h" ]; then
echo "Error: rocm_version.h not found in expected locations." >&2
exit 1
fi
# Extract major, minor and patch ROCm version numbers
MAJOR_VERSION=$(grep 'ROCM_VERSION_MAJOR' "$rocm_version_h" | awk '{print $3}')
MINOR_VERSION=$(grep 'ROCM_VERSION_MINOR' "$rocm_version_h" | awk '{print $3}')
PATCH_VERSION=$(grep 'ROCM_VERSION_PATCH' "$rocm_version_h" | awk '{print $3}')
ROCM_INT=$((MAJOR_VERSION * 10000 + MINOR_VERSION * 100 + PATCH_VERSION))
echo "ROCm version: $ROCM_INT"
export BUILD_ROCM_VERSION="$MAJOR_VERSION.$MINOR_VERSION"
pip_install tabulate # needed for newer fbgemm
pip_install patchelf # needed for rocm fbgemm
pushd /tmp
local wheel_dir=dist/fbgemm_gpu
local found_whl=0
@@ -247,7 +223,7 @@ function install_torchrec_and_fbgemm() {
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}"
python setup.py bdist_wheel \
--build-variant=rocm \
--package_variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
@@ -264,7 +240,6 @@ function install_torchrec_and_fbgemm() {
done
rm -rf fbgemm
popd
else
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
@@ -283,30 +258,6 @@ function clone_pytorch_xla() {
fi
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
function install_torchao() {
local commit
commit=$(get_pinned_commit torchao)

@@ -0,0 +1,123 @@
from datetime import datetime, timedelta, timezone
from tempfile import mkdtemp
from cryptography import x509
from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.x509.oid import NameOID
temp_dir = mkdtemp()
print(temp_dir)
def genrsa(path):
key = rsa.generate_private_key(
public_exponent=65537,
key_size=2048,
)
with open(path, "wb") as f:
f.write(
key.private_bytes(
encoding=serialization.Encoding.PEM,
format=serialization.PrivateFormat.TraditionalOpenSSL,
encryption_algorithm=serialization.NoEncryption(),
)
)
return key
def create_cert(path, C, ST, L, O, key):
subject = issuer = x509.Name(
[
x509.NameAttribute(NameOID.COUNTRY_NAME, C),
x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST),
x509.NameAttribute(NameOID.LOCALITY_NAME, L),
x509.NameAttribute(NameOID.ORGANIZATION_NAME, O),
]
)
cert = (
x509.CertificateBuilder()
.subject_name(subject)
.issuer_name(issuer)
.public_key(key.public_key())
.serial_number(x509.random_serial_number())
.not_valid_before(datetime.now(timezone.utc))
.not_valid_after(
# Our certificate will be valid for 10 days
datetime.now(timezone.utc) + timedelta(days=10)
)
.add_extension(
x509.BasicConstraints(ca=True, path_length=None),
critical=True,
)
.sign(key, hashes.SHA256())
)
# Write our certificate out to disk.
with open(path, "wb") as f:
f.write(cert.public_bytes(serialization.Encoding.PEM))
return cert
def create_req(path, C, ST, L, O, key):
csr = (
x509.CertificateSigningRequestBuilder()
.subject_name(
x509.Name(
[
# Provide various details about who we are.
x509.NameAttribute(NameOID.COUNTRY_NAME, C),
x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST),
x509.NameAttribute(NameOID.LOCALITY_NAME, L),
x509.NameAttribute(NameOID.ORGANIZATION_NAME, O),
]
)
)
.sign(key, hashes.SHA256())
)
with open(path, "wb") as f:
f.write(csr.public_bytes(serialization.Encoding.PEM))
return csr
def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key):
cert = (
x509.CertificateBuilder()
.subject_name(csr_cert.subject)
.issuer_name(ca_cert.subject)
.public_key(csr_cert.public_key())
.serial_number(x509.random_serial_number())
.not_valid_before(datetime.now(timezone.utc))
.not_valid_after(
# Our certificate will be valid for 10 days
datetime.now(timezone.utc) + timedelta(days=10)
# Sign our certificate with our private key
)
.sign(private_ca_key, hashes.SHA256())
)
with open(path, "wb") as f:
f.write(cert.public_bytes(serialization.Encoding.PEM))
return cert
ca_key = genrsa(temp_dir + "/ca.key")
ca_cert = create_cert(
temp_dir + "/ca.pem",
"US",
"New York",
"New York",
"Gloo Certificate Authority",
ca_key,
)
pkey = genrsa(temp_dir + "/pkey.key")
csr = create_req(
temp_dir + "/csr.csr",
"US",
"California",
"San Francisco",
"Gloo Testing Company",
pkey,
)
cert = sign_certificate_request(temp_dir + "/cert.pem", csr, ca_cert, ca_key)

.ci/pytorch/run_glootls_test.sh (new executable file, 18 lines)

@@ -0,0 +1,18 @@
#!/bin/bash
CREATE_TEST_CERT="$(dirname "${BASH_SOURCE[0]}")/create_test_cert.py"
TMP_CERT_DIR=$(python "$CREATE_TEST_CERT")
openssl verify -CAfile "${TMP_CERT_DIR}/ca.pem" "${TMP_CERT_DIR}/cert.pem"
export GLOO_DEVICE_TRANSPORT=TCP_TLS
export GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY=${TMP_CERT_DIR}/pkey.key
export GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT=${TMP_CERT_DIR}/cert.pem
export GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE=${TMP_CERT_DIR}/ca.pem
time python test/run_test.py --include distributed/test_c10d_gloo --verbose -- ProcessGroupGlooTest
unset GLOO_DEVICE_TRANSPORT
unset GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY
unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT
unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE

@@ -385,29 +385,6 @@ def smoke_test_compile(device: str = "cpu") -> None:
x_pt2 = torch.compile(model, mode="max-autotune")(x)
def smoke_test_nvshmem() -> None:
if not torch.cuda.is_available():
print("CUDA is not available, skipping NVSHMEM test")
return
# Check if NVSHMEM is compiled in current build
try:
from torch._C._distributed_c10d import _is_nvshmem_available
except ImportError:
# Not built with NVSHMEM support.
# torch is not compiled with NVSHMEM prior to 2.9
if torch.__version__ < "2.9":
return
else:
# After 2.9: NVSHMEM is expected to be compiled in current build
raise RuntimeError("torch not compiled with NVSHMEM") from None
print("torch compiled with NVSHMEM")
# Check if NVSHMEM is available on current system.
print(f"NVSHMEM available at run time: {_is_nvshmem_available()}")
def smoke_test_modules():
cwd = os.getcwd()
for module in MODULES:
@@ -502,8 +479,6 @@ def main() -> None:
options.pypi_pkg_check,
)
smoke_test_nvshmem()
if __name__ == "__main__":
main()

@@ -345,12 +345,6 @@ test_h100_symm_mem() {
assert_git_not_dirty
}
test_h100_cutlass_backend() {
# cutlass backend tests for H100
TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_backend -k "not addmm" $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_evt $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
}
test_lazy_tensor_meta_reference_disabled() {
export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
echo "Testing lazy tensor operations without meta reference"
@@ -365,6 +359,7 @@ test_dynamo_wrapped_shard() {
exit 1
fi
python tools/dynamo/verify_dynamo.py
python tools/dynamo/gb_id_mapping.py verify
# PLEASE DO NOT ADD ADDITIONAL EXCLUDES HERE.
# Instead, use @skipIfTorchDynamo on your tests.
time python test/run_test.py --dynamo \
@@ -462,7 +457,7 @@ test_inductor_aoti() {
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
}
test_inductor_cpp_wrapper_shard() {
@@ -928,6 +923,12 @@ test_torchbench_gcp_smoketest(){
popd
}
test_python_gloo_with_tls() {
source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh"
assert_git_not_dirty
}
test_aten() {
# Test ATen
# The following test(s) of ATen have already been skipped by caffe2 in rocm environment:
@@ -974,8 +975,6 @@ test_without_numpy() {
if [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')"
fi
# Regression test for https://github.com/pytorch/pytorch/pull/157734 (torch.onnx should be importable without numpy)
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch; import torch.onnx"
popd
}
@@ -1039,10 +1038,20 @@ test_libtorch_api() {
mkdir -p $TEST_REPORTS_DIR
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
"$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
else
# Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
# On s390x, pytorch is built without llvm.
# Even if it would be built with llvm, llvm currently doesn't support used features on s390x and
# test fails with errors like:
# JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
# unknown file: Failure
# C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
fi
fi
# quantization is not fully supported on s390x yet
@@ -1310,13 +1319,10 @@ EOF
# Step 2. Make sure that the public API test "test_correct_module_names" fails when an existing
# file is modified to introduce an invalid public API function.
# The filepath here must not have __all__ defined in it, otherwise the test will pass.
# If your PR introduces __all__ to torch/cuda/streams.py please point this to another file
# that does not have __all__ defined.
EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/cuda/streams.py"
EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/nn/parameter.py"
cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig"
echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}"
invalid_api="torch.cuda.streams.new_public_func"
invalid_api="torch.nn.parameter.new_public_func"
echo "Appended an invalid public API function to existing file ${EXISTING_FILEPATH}..."
check_public_api_test_fails \
@@ -1550,7 +1556,7 @@ test_executorch() {
test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops test_cpp_extensions_open_device_registration \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
# Dynamo tests
@@ -1662,13 +1668,11 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
install_torchaudio
install_torchvision
checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
PYTHONPATH=$(pwd)/torchbench test_cachebench
PYTHONPATH=/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
install_torchaudio
install_torchvision
checkout_install_torchbench nanogpt
PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
PYTHONPATH=/torchbench test_verify_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
install_torchaudio
install_torchvision
@@ -1677,28 +1681,22 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf
elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
checkout_install_torchbench
TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest
else
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi
@@ -1763,8 +1761,6 @@ elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
test_h100_cutlass_backend
else
install_torchvision
install_monkeytype

@@ -1,34 +0,0 @@
# If you want to rebuild, run this with $env:REBUILD=1
# If you want to build with CUDA, run this with $env:USE_CUDA=1
# If you want to build without CUDA, run this with $env:USE_CUDA=0
# Check for setup.py in the current directory
if (-not (Test-Path "setup.py")) {
Write-Host "ERROR: Please run this build script from PyTorch root directory."
exit 1
}
# Get the script's parent directory
$ScriptParentDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
# Set TMP_DIR and convert to Windows path
$env:TMP_DIR = Join-Path (Get-Location) "build\win_tmp"
$env:TMP_DIR_WIN = $env:TMP_DIR # Already in Windows format, no cygpath needed
# Set final package directory with default fallback
if (-not $env:PYTORCH_FINAL_PACKAGE_DIR) {
$env:PYTORCH_FINAL_PACKAGE_DIR = "C:\w\build-results"
}
# Create the final package directory if it doesn't exist
if (-not (Test-Path $env:PYTORCH_FINAL_PACKAGE_DIR)) {
New-Item -Path $env:PYTORCH_FINAL_PACKAGE_DIR -ItemType Directory -Force | Out-Null
}
# Set script helpers directory
$env:SCRIPT_HELPERS_DIR = Join-Path $ScriptParentDir "win-test-helpers\arm64"
# Run the main build script
& "$env:SCRIPT_HELPERS_DIR\build_pytorch.ps1"
Write-Host "BUILD PASSED"

@@ -1,24 +0,0 @@
#!/bin/bash
set -ex -o pipefail
SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
# shellcheck source=./common.sh
source "$SCRIPT_PARENT_DIR/common.sh"
run_tests() {
echo Running smoke_test.py...
python ./.ci/pytorch/smoke_test/smoke_test.py --package torchonly
echo Running test_autograd.oy, test_nn.py, test_torch.py...
cd test
CORE_TEST_LIST=("test_autograd.py" "test_nn.py" "test_modules.py")
for t in "${CORE_TEST_LIST[@]}"; do
echo "Running test: $t"
python "$t" --verbose --save-xml --use-pytest -vvvv -rfEsxXP -p no:xdist
done
}
run_tests
echo "TEST PASSED"

@@ -1,98 +0,0 @@
# TODO: we may can use existing build_pytorch.bat for arm64
if ($env:DEBUG -eq "1") {
$env:BUILD_TYPE = "debug"
} else {
$env:BUILD_TYPE = "release"
}
# This inflates our log size slightly, but it is REALLY useful to be
# able to see what our cl.exe commands are. (since you can actually
# just copy-paste them into a local Windows setup to just rebuild a
# single file.)
# log sizes are too long, but leaving this here in case someone wants to use it locally
# $env:CMAKE_VERBOSE_MAKEFILE = "1"
$env:INSTALLER_DIR = Join-Path $env:SCRIPT_HELPERS_DIR "installation-helpers"
cd ..
# Environment variables
$env:SCCACHE_IDLE_TIMEOUT = "0"
$env:SCCACHE_IGNORE_SERVER_IO_ERROR = "1"
$env:CMAKE_BUILD_TYPE = $env:BUILD_TYPE
$env:CMAKE_C_COMPILER_LAUNCHER = "sccache"
$env:CMAKE_CXX_COMPILER_LAUNCHER = "sccache"
$env:libuv_ROOT = Join-Path $env:DEPENDENCIES_DIR "libuv\install"
$env:MSSdk = "1"
if ($env:PYTORCH_BUILD_VERSION) {
$env:PYTORCH_BUILD_VERSION = $env:PYTORCH_BUILD_VERSION
$env:PYTORCH_BUILD_NUMBER = "1"
}
$env:CMAKE_POLICY_VERSION_MINIMUM = "3.5"
# Set BLAS type
if ($env:ENABLE_APL -eq "1") {
$env:BLAS = "APL"
$env:USE_LAPACK = "1"
} elseif ($env:ENABLE_OPENBLAS -eq "1") {
$env:BLAS = "OpenBLAS"
$env:OpenBLAS_HOME = Join-Path $env:DEPENDENCIES_DIR "OpenBLAS\install"
}
# Change to source directory
Set-Location $env:PYTORCH_ROOT
# Copy libuv.dll
Copy-Item -Path (Join-Path $env:libuv_ROOT "lib\Release\uv.dll") -Destination "torch\lib\uv.dll" -Force
# Create virtual environment
python -m venv .venv
.\.venv\Scripts\Activate.ps1
where.exe python
# Python install dependencies
python -m pip install --upgrade pip
pip install setuptools pyyaml
pip install -r requirements.txt
# Set after installing psutil
$env:DISTUTILS_USE_SDK = "1"
# Print all environment variables
Get-ChildItem Env:
# Start and inspect sccache
sccache --start-server
sccache --zero-stats
sccache --show-stats
# Build the wheel
python setup.py bdist_wheel
if ($LASTEXITCODE -ne 0) { exit 1 }
# Install the wheel locally
$whl = Get-ChildItem -Path "dist\*.whl" | Select-Object -First 1
if ($whl) {
python -mpip install --no-index --no-deps $whl.FullName
}
# Copy final wheel
robocopy "dist" "$env:PYTORCH_FINAL_PACKAGE_DIR" *.whl
# Export test times
python tools/stats/export_test_times.py
# Copy additional CI files
robocopy ".additional_ci_files" "$env:PYTORCH_FINAL_PACKAGE_DIR\.additional_ci_files" /E
# Save ninja log
Copy-Item -Path "build\.ninja_log" -Destination $env:PYTORCH_FINAL_PACKAGE_DIR -Force
# Final sccache stats and stop
sccache --show-stats
sccache --stop-server
exit 0

@@ -41,7 +41,7 @@ fi
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
# Install Z3 optional dependency for Windows builds.
python -m pip install z3-solver==4.15.1.0
python -m pip install z3-solver==4.12.2.0
# Install tlparse for test\dynamo\test_structured_trace.py UTs.
python -m pip install tlparse==0.3.30

@@ -7,12 +7,12 @@ max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907
# these ignores are from flake8-comprehensions; please fix!
C407,
# these ignores are from flake8-logging-format; please fix!

@@ -0,0 +1,78 @@
name: build android
description: build android for a specific arch
inputs:
arch:
description: arch to build
required: true
arch-for-build-env:
description: |
arch to pass to build environment.
This is currently different than the arch name we use elsewhere, which
should be fixed.
required: true
github-secret:
description: github token
required: true
build-environment:
required: true
description: Top-level label for what's being built/tested.
docker-image:
required: true
description: Name of the base docker image to build with.
branch:
required: true
description: What branch we are building on.
outputs:
container_id:
description: Docker container identifier used to build the artifacts
value: ${{ steps.build.outputs.container_id }}
runs:
using: composite
steps:
- name: Build-${{ inputs.arch }}
id: build
shell: bash
env:
BRANCH: ${{ inputs.branch }}
BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-${{ inputs.arch-for-build-env }}-build"
AWS_DEFAULT_REGION: us-east-1
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
DOCKER_IMAGE: ${{ inputs.docker-image }}
MATRIX_ARCH: ${{ inputs.arch }}
run: |
# detached container should get cleaned up by teardown_ec2_linux
set -exo pipefail
export container_name
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
-e SHA1 \
-e BRANCH \
-e SCCACHE_BUCKET \
-e SCCACHE_REGION \
-e SKIP_SCCACHE_INITIALIZATION=1 \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--detach \
--user jenkins \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
git submodule sync && git submodule update -q --init --recursive --depth 1
docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace"
(echo "sudo chown -R jenkins . && .ci/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete" | docker exec -u jenkins -i "${container_name}" bash) 2>&1
# Copy install binaries back
mkdir -p "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}"
docker cp "${container_name}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}"
echo "container_id=${container_name}" >> "${GITHUB_OUTPUT}"

@@ -70,7 +70,7 @@ runs:
set -eux
# PyYAML 6.0 doesn't work with MacOS x86 anymore
# This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2
python3 -m pip install requests==2.27.1 pyyaml==6.0.2
python3 -m pip install requests==2.27.1 pyyaml==6.0.1
- name: Parse ref
id: parse-ref

@@ -1 +1 @@
f6dfe1231dcdd221a68416e49ab85c2575cbb824
00b0c91db92c51a11356249262577b9fa26c18c5

@@ -1 +1 @@
7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8
5fb5024118e9bb9decf96c2b0b1a8f0010bf56be

@@ -1 +1 @@
8f605ee30912541126c0fe46d0c8c413101b600a
29d1ffc5b4c763ef76aff9e3f617fa60dd292418

@@ -1 +1 @@
29ae4c76c026185f417a25e841d2cd5e65f087a3
1c00dea2c9adb2137903c86b4191e8c247f8fda9

@@ -131,6 +131,21 @@
- Lint
- pull
- name: Mobile
patterns:
- ios/**
- android/**
- test/mobile/**
approved_by:
- linbinyu
- IvanKobzarev
- dreiss
- raziel
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: PrimTorch
patterns:
- torch/_meta_registrations.py
@@ -477,19 +492,6 @@
- srossross
- chillee
- zou3519
- guilhermeleobas
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: Dynamo
patterns:
- torch/_dynamo/**
- torch/csrc/dynamo/**
- test/dynamo/**
approved_by:
- guilhermeleobas
mandatory_checks_name:
- EasyCLA
- Lint

@@ -31,9 +31,7 @@ ciflow_push_tags:
- ciflow/pull
- ciflow/h100
- ciflow/h100-distributed
- ciflow/win-arm64
- ciflow/h100-symm-mem
- ciflow/h100-cutlass-backend
retryable_workflows:
- pull
- trunk

@@ -7,9 +7,9 @@
# .ci/docker/requirements-ci.txt
boto3==1.35.42
jinja2==3.1.6
lintrunner==0.12.7
lintrunner==0.10.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84
pyyaml==6.0.2
pyyaml==6.0
requests==2.32.4
rich==14.1.0
rich==10.9.0

@@ -2,7 +2,7 @@ boto3==1.35.42
cmake==3.27.*
expecttest==0.3.0
fbscribelogger==0.1.7
filelock==3.13.1
filelock==3.6.0
hypothesis==6.56.4
librosa>=0.6.2
mpmath==1.3.0
@@ -33,4 +33,4 @@ tensorboard==2.13.0
typing-extensions==4.12.2
unittest-xml-reporting<=3.2.0,>=2.0.0
xdoctest==1.1.0
z3-solver==4.15.1.0
z3-solver==4.12.2.0

@@ -2,7 +2,7 @@
set -ex
# Use uv to speed up lintrunner init
python3 -m pip install -U uv==0.8.* setuptools
python3 -m pip install uv==0.1.45 setuptools
CACHE_DIRECTORY="/tmp/.lintbin"
# Try to recover the cached binaries

@@ -27,7 +27,7 @@ jobs:
PR_NUMBER="${{ github.event.number }}"
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
CHANGED_FILES=$(gh pr view "$PR_NUMBER" --repo "${{ github.repository }}" --json files --jq '.files[].path' | tr '\n' ' ' | sed 's/ $//')
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
@ -40,4 +40,4 @@ jobs:
else
echo "Not in PR context, setting changed files to '*'"
echo "changed-files=*" >> "$GITHUB_OUTPUT"
fi
fi

View File

@ -269,8 +269,8 @@ jobs:
# Copying test results back to the mounted workspace needs sudo; the resulting permissions are correct
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
- name: Change permissions (only needed for MI300 and MI355 kubernetes runners for now)
if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'mi300') || contains(matrix.runner, 'mi355')) }}
- name: Change permissions (only needed for MI300 runners for now)
if: ${{ always() && steps.test.conclusion && contains(matrix.runner, 'mi300') }}
run: |
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"

View File

@ -56,7 +56,7 @@ jobs:
cache: pip
architecture: x64
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
shell: bash
- name: Verify mergeability

View File

@ -26,7 +26,7 @@ jobs:
cache: pip
# Not a direct dependency, but the script uses trymerge
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |

View File

@ -50,7 +50,6 @@ jobs:
runner: [linux.12xlarge]
docker-image-name: [
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,

View File

@ -144,7 +144,7 @@ jobs:
run: |
make -f docker.Makefile "${BUILD_IMAGE_TYPE}-image"
- name: Push nightly tags
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.platform == 'linux/amd64' }}
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.build_platforms == 'linux/amd64' }}
run: |
PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime"
CUDA_SUFFIX="-cu${CUDA_VERSION}"

View File

@ -1,58 +0,0 @@
name: Limited CI for CUTLASS backend on H100
on:
pull_request:
paths:
- .github/workflows/h100-cutlass-backend.yml
workflow_dispatch:
schedule:
- cron: 22 9 * * * # every 24 hours about 2:22am PDT
push:
tags:
- ciflow/h100-cutlass-backend/*
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
{ config: "h100_cutlass_backend", shard: 1, num_shards: 1, runner: "linux.aws.h100", owners: ["oncall:pt2"] },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm90-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend.outputs.test-matrix }}
secrets: inherit

View File

@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100
on:
schedule:
- cron: 15 0,12 * * 1-6
- cron: 15 0,4,8,12,16,20 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let's try to run torchao cudagraphs_low_precision as part of cudagraphs
@ -126,7 +126,7 @@ jobs:
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '15 0,12 * * 1-6'
if: github.event.schedule == '15 0,4,8,12,16,20 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true

View File

@ -315,6 +315,21 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3-clang18-mobile-build:
name: linux-jammy-py3-clang18-mobile-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-mobile-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
build-generates-artifacts: false
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
secrets: inherit
linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build:
name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12
uses: ./.github/workflows/_linux-build.yml

View File

@ -26,7 +26,7 @@ jobs:
architecture: x64
check-latest: false
cache: pip
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |

View File

@ -1,68 +0,0 @@
name: rocm-mi355
on:
workflow_dispatch:
schedule:
- cron: 30 9 * * * # about 2:30am PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
target-determination:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/target_determination.yml
permissions:
id-token: write
contents: read
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-noble-rocm-py3_12-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-noble-rocm-py3.12-mi355
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-noble-rocm-py3.12-mi355
docker-image-name: ci-image:pytorch-linux-noble-rocm-alpha-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
]}
secrets: inherit
linux-noble-rocm-py3_12-test:
permissions:
id-token: write
contents: read
name: linux-noble-rocm-py3.12-mi355
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-noble-rocm-py3_12-build
- target-determination
with:
build-environment: linux-noble-rocm-py3.12-mi355
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
secrets: inherit

View File

@ -28,7 +28,7 @@ jobs:
check-latest: false
cache: pip
architecture: x64
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |

View File

@ -25,7 +25,7 @@ jobs:
architecture: x64
check-latest: false
cache: pip
- run: pip install pyyaml==6.0.2
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |

View File

@ -14,7 +14,6 @@ on:
- inductor-periodic
- rocm
- rocm-mi300
- rocm-mi355
- inductor-micro-benchmark
- inductor-micro-benchmark-x86
- inductor-cu124

View File

@ -1,187 +0,0 @@
name: windows-arm64-build-test
on:
push:
tags:
- ciflow/win-arm64/*
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
PYTHON_VERSION: "3.12"
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
DOWNLOADS_DIR: c:\temp\downloads
DEPENDENCIES_DIR: c:\temp\dependencies
ENABLE_APL: 1
ENABLE_OPENBLAS: 0
BUILD_TYPE: release
permissions:
id-token: write
contents: read
jobs:
build:
# Don't run on forked repos.
if: github.repository_owner == 'pytorch'
runs-on: "windows-11-arm64-preview"
timeout-minutes: 240
steps:
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_sscache
aws-region: us-east-1
role-duration-seconds: 18000
- name: Enable long paths
shell: cmd
run: |
git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
git config --system core.longpaths true
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: pytorch
submodules: recursive
- name: Bootstrap Python
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
- name: Parse ref
id: parse-ref
shell: bash
run: python pytorch/.github/scripts/parse_ref.py
- name: Get workflow job id
shell: bash
id: get-job-id
run: |
set -eux
python pytorch/.github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Bootstrap APL
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
- name: Bootstrap Rust
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- name: Bootstrap sccache
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
- name: Bootstrap Libuv
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
- name: Build
id: build
shell: cmd
env:
PYTORCH_FINAL_PACKAGE_DIR: C:/${{ github.run_id }}/build-results/
BRANCH: ${{ steps.parse-ref.outputs.branch }}
BUILD_WHEEL: 1
MAX_JOBS: 8
PYTHON_VERSION: "3.12"
SCCACHE_BUCKET: "ossci-compiler-cache"
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SCCACHE_REGION: us-east-1
VC_PRODUCT: "BuildTools"
VC_VERSION: ""
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
AWS_DEFAULT_REGION: us-east-1
USE_CUDA: '0'
USE_XPU: '0'
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
run: |
cd pytorch
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
powershell -ExecutionPolicy Bypass -File ".ci/pytorch/win-arm64-build.ps1"
- name: Upload artifacts
uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: torch-wheel-win-arm64-py3-12
retention-days: 14
if-no-files-found: error
path: C:\${{ github.run_id }}\build-results
test:
if: github.repository_owner == 'pytorch'
strategy:
fail-fast: false
runs-on: "windows-11-arm64-preview"
needs: build
steps:
- name: Enable long paths
shell: cmd
run: |
git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
git config --system core.longpaths true
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: pytorch
submodules: recursive
- name: Bootstrap Python
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
- name: Bootstrap Rust
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- name: Get workflow job id
shell: bash
id: get-job-id
run: |
set -eux
python pytorch/.github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Download Build Artifacts
uses: actions/download-artifact@v4.1.7
with:
name: torch-wheel-win-arm64-py3-12
path: C:\${{ github.run_id }}\build-results
- name: Test
id: test
shell: cmd
env:
USE_CUDA: '0'
INSTALL_WINDOWS_SDK: 1
PYTHON_VERSION: "3.12"
VC_PRODUCT: "BuildTools"
AWS_DEFAULT_REGION: us-east-1
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_WORKFLOW: ${{ github.workflow }}
GITHUB_JOB: ${{ github.job }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
PYTORCH_FINAL_PACKAGE_DIR: C:/${{ github.run_id }}/build-results/
run: |
mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
call pytorch/.ci/pytorch/windows/arm64/bootstrap_tests.bat
set GIT_BASH=C:\Program Files\Git\usr\bin\bash.exe
"%GIT_BASH%" -c "bash --noprofile --norc .ci/pytorch/win-arm64-test.sh"

View File

@ -39,16 +39,16 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'flake8==7.3.0',
'flake8-bugbear==24.12.12',
'flake8-comprehensions==3.16.0',
'flake8==6.1.0',
'flake8-bugbear==23.3.23',
'flake8-comprehensions==3.15.0',
'flake8-executable==2.1.3',
'flake8-logging-format==2024.24.12',
'flake8-pyi==25.5.0',
'flake8-simplify==0.22.0',
'flake8-logging-format==0.9.0',
'flake8-pyi==23.3.1',
'flake8-simplify==0.19.3',
'mccabe==0.7.0',
'pycodestyle==2.14.0',
'pyflakes==3.4.0',
'pycodestyle==2.11.1',
'pyflakes==3.1.0',
'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
]
@ -158,7 +158,7 @@ init_command = [
'mypy==1.16.0',
'sympy==1.13.3',
'types-requests==2.27.25',
'types-pyyaml==6.0.2',
'types-pyyaml==6.0.1',
'types-tabulate==0.8.8',
'types-protobuf==5.29.1.20250403',
'types-setuptools==79.0.0.20250422',
@ -166,8 +166,8 @@ init_command = [
'types-colorama==0.4.6',
'filelock==3.13.1',
'junitparser==2.1.1',
'rich==14.1.0',
'pyyaml==6.0.2',
'rich==10.9.0',
'pyyaml==6.0.1',
'optree==0.13.0',
'dataclasses-json==0.6.7',
'pandas==2.2.3',
@ -1111,7 +1111,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'pyyaml==6.0.2',
'PyYAML==6.0.1',
]
[[linter]]
@ -1133,7 +1133,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'pyyaml==6.0.2',
'PyYAML==6.0.1',
]
[[linter]]
@ -1794,12 +1794,3 @@ include_patterns = [
'torch/header_only_apis.txt',
]
is_formatter = false
[[linter]]
code = "GB_REGISTRY"
include_patterns = ["torch/_dynamo/**/*.py"]
command = [
"python3",
"tools/linter/adapters/gb_registry_linter.py",
]

View File

@ -47,6 +47,18 @@ WORKDIR /opt/pytorch
COPY . .
RUN git submodule update --init --recursive
FROM conda as build
ARG CMAKE_VARS
WORKDIR /opt/pytorch
COPY --from=conda /opt/conda /opt/conda
COPY --from=submodule-update /opt/pytorch /opt/pytorch
RUN make triton
RUN --mount=type=cache,target=/opt/ccache \
export eval ${CMAKE_VARS} && \
TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
python -m pip install --no-build-isolation -v .
FROM conda as conda-installs
ARG PYTHON_VERSION=3.11
ARG CUDA_PATH=cu121
@ -97,5 +109,4 @@ WORKDIR /workspace
FROM official as dev
# Should override the already installed version from the official-image stage
COPY --from=conda /opt/conda /opt/conda
COPY --from=submodule-update /opt/pytorch /opt/pytorch
COPY --from=build /opt/conda /opt/conda

View File

@ -294,12 +294,14 @@ Install PyTorch
```bash
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}"
python -m pip install -r requirements-build.txt
python -m pip install --no-build-isolation -v -e .
```
**On macOS**
```bash
python -m pip install -r requirements-build.txt
python -m pip install --no-build-isolation -v -e .
```

View File

@ -586,10 +586,17 @@ if(USE_CUDA AND NOT USE_ROCM)
CUDA::cufft_static_nocallback
)
if(NOT BUILD_LAZY_CUDA_LINALG)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
CUDA::cusolver_static
${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static
)
if(CUDA_VERSION_MAJOR LESS_EQUAL 11)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
CUDA::cusolver_static
${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a # needed for libcusolver_static
)
elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12)
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
CUDA::cusolver_static
${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a # needed for libcusolver_static
)
endif()
endif()
else()
list(APPEND ATen_CUDA_DEPENDENCY_LIBS

View File

@ -14,9 +14,7 @@
#include <ATen/cpu/FlushDenormal.h>
#ifdef USE_FBGEMM
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
#include <fbgemm/Fbgemm.h>
C10_DIAGNOSTIC_POP()
#endif // USE_FBGEMM
#if defined(__aarch64__) && !defined(C10_MOBILE)
#include <cpuinfo.h>
@ -334,14 +332,6 @@ void Context::setBenchmarkLimitCuDNN(int b) {
benchmark_limit_cudnn = b;
}
bool Context::immediateMiopen() const {
return immediate_miopen;
}
void Context::setImmediateMiopen(bool b) {
immediate_miopen = b;
}
bool Context::allowTF32CuBLAS() const {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
@ -512,7 +502,7 @@ at::BlasBackend Context::blasPreferredBackend() {
static const std::vector<std::string> archs = {
"gfx90a", "gfx942",
#if ROCM_VERSION >= 60300
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
"gfx1100", "gfx1101", "gfx1200", "gfx1201",
#endif
#if ROCM_VERSION >= 60500
"gfx950"

View File

@ -205,8 +205,6 @@ class TORCH_API Context {
void setBenchmarkCuDNN(bool);
int benchmarkLimitCuDNN() const;
void setBenchmarkLimitCuDNN(int);
bool immediateMiopen() const;
void setImmediateMiopen(bool);
bool deterministicCuDNN() const;
void setDeterministicCuDNN(bool);
bool deterministicMkldnn() const;
@ -442,7 +440,6 @@ class TORCH_API Context {
bool enabled_overrideable = true;
bool allow_fp16_bf16_reduction_mathSDP = false;
bool benchmark_cudnn = false;
bool immediate_miopen = false;
Float32MatmulPrecision float32_matmul_precision =
c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true
? at::Float32MatmulPrecision::HIGH

View File

@ -132,9 +132,6 @@ DLDevice torchDeviceToDLDevice(at::Device device) {
case DeviceType::PrivateUse1:
ctx.device_type = DLDeviceType::kDLExtDev;
break;
case DeviceType::MPS:
ctx.device_type = DLDeviceType::kDLMetal;
break;
default:
TORCH_CHECK_BUFFER(false, "Cannot pack tensors on " + device.str());
}
@ -167,8 +164,6 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat
return at::Device(DeviceType::MAIA, index);
case DLDeviceType::kDLExtDev:
return at::Device(DeviceType::PrivateUse1, index);
case DLDeviceType::kDLMetal:
return at::Device(DeviceType::MPS, index);
default:
TORCH_CHECK_BUFFER(
false, "Unsupported device_type: ", std::to_string(type));

View File

@ -1,5 +1,6 @@
#pragma once
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>
@ -72,6 +73,27 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
// original device index that was active before the change.
TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);
TORCH_API inline void emptyCache() {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->emptyCache();
}
TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
}
TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
}
TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
}
} // namespace at::accelerator
namespace at {
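
The hunk above adds device-generic allocator helpers to `at::accelerator`: each resolves the current accelerator via `getAccelerator(true)` and forwards to its caching allocator. A hedged usage sketch, assuming an ATen build with an accelerator available; the header path and the `allocated_bytes[0].current` stats layout are assumptions based on `c10::CachingDeviceAllocator::DeviceStats`.

```cpp
#include <ATen/DeviceAccelerator.h>  // assumed location of at::accelerator
#include <iostream>

void report_and_trim(c10::DeviceIndex idx) {
  namespace acc = at::accelerator;
  // Stats for one device of whatever accelerator is active (CUDA, XPU, ...).
  auto stats = acc::getDeviceStats(idx);
  std::cout << "allocated bytes (current): "
            << stats.allocated_bytes[0].current << "\n";
  acc::resetPeakStats(idx);         // clear high-water marks
  acc::resetAccumulatedStats(idx);  // clear running counters
  acc::emptyCache();                // return cached blocks to the system
}
```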

View File

@ -9,36 +9,7 @@
namespace at {
/*
* Design:
* 1. ZeroTensors are regular tensors with TensorOptions, a storage
* pointing to nullptr and a ZeroTensor dispatch key set.
*
* 2. ZeroTensors are immutable. This is done to prevent data race in the case of multithreading
* (when two threads try to read the same zero tensor and materialize it in-place).
*
* 3. ZeroTensor has a boxed fallback that will be dispatched to any ops that don't
* have special ZeroTensor handling. This fallback materializes each ZeroTensor to
* `at::zeros({}, tensor.options()).expand(tensor.sizes())`.
* 4. ZeroTensors are handled above autograd. This is necessary because fallback
* operations are not differentiable.
* - Example: Consider add in the case it was using the fallback: zerotensor_a + b.
* zerotensor_a would be materialized to c=torch.zeros_like(zerotensor_a) after
* passing through the fallback. If this happens above the autograd, then the
* gradients would be populated on c instead of zerotensor_a.
*
* 5. The grad field is always populated with an honest to goodness tensor. This
* materialization of ZeroTensors will happen in:
* - AccumulateGrad for Backward Mode AD.
* - will never be required for ForwardMode AD.
* - This is because if all the tangents were undefined (efficient ZeroTensors),
* no computation will be performed (this is ensured via an existing pre-check).
*
* Today, ZeroTensors are primarily used to represent undefined gradients in forward AD.
* They do not perfectly handle NaNs and Infs: we don't check the actual values and
* assume that they are non-zero, non-inf, non-NaN, etc.
*/
// TODO: add a note explaining the design decisions
// ZeroTensors are designed to be immutable. Thus, we error out when an in-place operation is performed on ZeroTensors
static void zeroTensorFallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
const auto& arguments = op.schema().arguments();

View File

@ -1 +1,55 @@
#include <torch/headeronly/cpu/vec/intrinsics.h>
#pragma once
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC or clang-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#elif defined(__clang__) && (defined(__ARM_NEON__) || defined(__aarch64__))
/* Clang-compatible compiler, targeting arm neon */
#include <arm_neon.h>
#if defined(__ARM_FEATURE_SVE)
/* CLANG-compatible compiler, targeting ARM with SVE */
#include <arm_sve.h>
#endif
#elif defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#if _MSC_VER <= 1900
#define _mm256_extract_epi64(X, Y) \
(_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2))
#define _mm256_extract_epi32(X, Y) \
(_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4))
#define _mm256_extract_epi16(X, Y) \
(_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8))
#define _mm256_extract_epi8(X, Y) \
(_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16))
#endif
#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__))
/* GCC-compatible compiler, targeting ARM with NEON */
#include <arm_neon.h>
#if defined(__ARM_FEATURE_SVE)
/* GCC-compatible compiler, targeting ARM with SVE */
#include <arm_sve.h>
#endif
#if defined(MISSING_ARM_VLD1)
#include <ATen/cpu/vec/vec256/missing_vld1_neon.h>
#elif defined(MISSING_ARM_VST1)
#include <ATen/cpu/vec/vec256/missing_vst1_neon.h>
#endif
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif defined(__s390x__)
// targets Z/architecture
// we will include vecintrin later
#elif (defined(__GNUC__) || defined(__xlC__)) && \
(defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
/* We need to undef those tokens defined by <altivec.h> to avoid conflicts
with the C++ types. => Can still use __bool/__vector */
#undef bool
#undef vector
#undef pixel
#elif defined(__GNUC__) && defined(__SPE__)
/* GCC-compatible compiler, targeting PowerPC with SPE */
#include <spe.h>
#endif
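
The restored header above is a compiler/architecture dispatch ladder that pulls in the right intrinsics header per platform. A minimal sketch of the same technique, selecting a SIMD path from predefined macros with a scalar fallback; the AVX2 and NEON branches mirror two rungs of the ladder.

```cpp
#include <cstddef>
#include <iostream>

#if defined(__AVX2__)
#include <immintrin.h>
static void add4(const float* a, const float* b, float* out) {
  _mm_storeu_ps(out, _mm_add_ps(_mm_loadu_ps(a), _mm_loadu_ps(b)));
}
#elif defined(__ARM_NEON) || defined(__aarch64__)
#include <arm_neon.h>
static void add4(const float* a, const float* b, float* out) {
  vst1q_f32(out, vaddq_f32(vld1q_f32(a), vld1q_f32(b)));
}
#else
static void add4(const float* a, const float* b, float* out) {
  for (size_t i = 0; i < 4; ++i) out[i] = a[i] + b[i];  // portable fallback
}
#endif

int main() {
  float a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40}, c[4];
  add4(a, b, c);
  std::cout << c[0] << " " << c[3] << "\n";  // 11 44
}
```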

View File

@ -5,7 +5,6 @@
#include <ATen/cpu/vec/sve/vec_common_sve.h>
#include <ATen/cpu/vec/sve/vec_float.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/bit_cast.h>
#include <cmath>
namespace at {
namespace vec {
@ -37,7 +36,7 @@ class Vectorized<BFloat16> {
return VECTOR_WIDTH / sizeof(BFloat16);
}
Vectorized();
Vectorized() {}
Vectorized(svbfloat16_t v) : values(v) {}
Vectorized(int val);
Vectorized(BFloat16 val);
@ -307,11 +306,6 @@ Vectorized<c10::BFloat16> inline operator/(
return binary_operator_via_float(std::divides<Vectorized<float>>(), a, b);
}
inline Vectorized<BFloat16>::Vectorized() {
const short zero = 0;
values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero));
}
inline Vectorized<BFloat16>::Vectorized(int val) {
auto vals_f = svdup_n_f32(val);
values = convert_float_bfloat16(vals_f, vals_f);
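
This hunk (and the many like it below for the SVE, NEON, AVX2, and AVX-512 types) changes `Vectorized<T>`'s default constructor from an empty body to an explicit zero splat (`svdup_n_*`, `vmovq_n_f32`, `_mm256_setzero_*`, `_mm512_setzero_*`). A portable sketch of the difference, using hypothetical array-backed stand-ins rather than real vector registers:

```cpp
#include <iostream>

struct VecUninit {
  float values[4];
  VecUninit() {}            // empty body: values are indeterminate
};

struct VecZeroed {
  float values[4];
  VecZeroed() : values{} {} // value-initialized: a well-defined all-zeros vector
};

int main() {
  VecZeroed z;
  // Reading z.values is defined behavior; reading a VecUninit's values
  // before a store is UB, which is what the explicit splat avoids.
  std::cout << z.values[0] + z.values[3] << "\n";  // 0
}
```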

View File

@ -38,9 +38,7 @@ class Vectorized<double> {
static constexpr size_type size() {
return VECTOR_WIDTH / sizeof(double);
}
Vectorized() {
values = svdup_n_f64(0);
}
Vectorized() {}
Vectorized(svfloat64_t v) : values(v) {}
Vectorized(double val) {
values = svdup_n_f64(val);
@ -587,30 +585,6 @@ Vectorized<double> inline fmadd(
return svmad_f64_x(ptrue, a, b, c);
}
template <>
Vectorized<double> inline fnmadd(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return svmsb_f64_x(ptrue, a, b, c);
}
template <>
Vectorized<double> inline fmsub(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return svnmsb_f64_x(ptrue, a, b, c);
}
template <>
Vectorized<double> inline fnmsub(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return svnmad_f64_x(ptrue, a, b, c);
}
#endif // defined(CPU_CAPABILITY_SVE)
} // namespace CPU_CAPABILITY
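
The removed templates here (and their NEON/AVX2/AVX-512 twins below) fill out the four fused-multiply variants; the SVE names map as fnmadd→`svmsb`, fmsub→`svnmsb`, fnmsub→`svnmad`. Their sign conventions, in a scalar sketch built on `std::fma`, matching the generic fallbacks in `vec_base.h` further down:

```cpp
#include <cmath>
#include <iostream>

double fmadd (double a, double b, double c) { return std::fma( a, b,  c); } //  a*b + c
double fnmadd(double a, double b, double c) { return std::fma(-a, b,  c); } // -(a*b) + c
double fmsub (double a, double b, double c) { return std::fma( a, b, -c); } //  a*b - c
double fnmsub(double a, double b, double c) { return std::fma(-a, b, -c); } // -(a*b) - c

int main() {
  double a = 2, b = 3, c = 5;
  std::cout << fmadd(a, b, c)  << "\n"   //  11
            << fnmadd(a, b, c) << "\n"   //  -1
            << fmsub(a, b, c)  << "\n"   //   1
            << fnmsub(a, b, c) << "\n";  // -11
}
```

The vectorized versions apply the same sign pattern lane-wise, each fused into a single rounding step where the hardware supports it.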

View File

@ -38,9 +38,7 @@ class Vectorized<float> {
static constexpr size_type size() {
return VECTOR_WIDTH / sizeof(float);
}
Vectorized() {
values = svdup_n_f32(0);
}
Vectorized() {}
Vectorized(svfloat32_t v) : values(v) {}
Vectorized(float val) {
values = svdup_n_f32(val);
@ -758,30 +756,6 @@ Vectorized<float> inline fmadd(
return svmad_f32_x(ptrue, a, b, c);
}
template <>
Vectorized<float> inline fnmadd(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return svmsb_f32_x(ptrue, a, b, c);
}
template <>
Vectorized<float> inline fmsub(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return svnmsb_f32_x(ptrue, a, b, c);
}
template <>
Vectorized<float> inline fnmsub(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return svnmad_f32_x(ptrue, a, b, c);
}
#endif // defined(CPU_CAPABILITY_SVE)
} // namespace CPU_CAPABILITY

View File

@ -32,9 +32,7 @@ inline namespace CPU_CAPABILITY {
static constexpr size_type size() { \
return vl; \
} \
Vectorized() { \
values = svdup_n_s##bit(0); \
} \
Vectorized() {} \
Vectorized(svint##bit##_t v) : values(v) {} \
Vectorized(int##bit##_t val) { \
values = svdup_n_s##bit(val); \

View File

@ -552,15 +552,6 @@ Vectorized<c10::BFloat16> inline fmadd(
return a * b + c;
}
template <>
Vectorized<c10::BFloat16> inline fnmadd(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
const Vectorized<c10::BFloat16>& c) {
// See NOTE [BF16 FMA] above.
return -a * b + c;
}
template <>
Vectorized<c10::BFloat16> inline fmsub(
const Vectorized<c10::BFloat16>& a,
@ -570,15 +561,6 @@ Vectorized<c10::BFloat16> inline fmsub(
return a * b - c;
}
template <>
Vectorized<c10::BFloat16> inline fnmsub(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
const Vectorized<c10::BFloat16>& c) {
// See NOTE [BF16 FMA] above.
return -a * b - c;
}
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
} // namespace CPU_CAPABILITY

View File

@ -83,9 +83,7 @@ class Vectorized<float> {
static constexpr size_type size() {
return 4;
}
Vectorized() {
values = vmovq_n_f32(0);
}
Vectorized() {}
Vectorized(float32x4_t v) : values(v) {}
Vectorized(float val) : values{vdupq_n_f32(val)} {}
Vectorized(float val0, float val1, float val2, float val3)
@ -584,14 +582,6 @@ Vectorized<float> inline fmadd(
return Vectorized<float>(vfmaq_f32(c, a, b));
}
template <>
Vectorized<float> inline fnmadd(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return Vectorized<float>(vfmsq_f32(c, a, b));
}
template <>
Vectorized<float> inline fmsub(
const Vectorized<float>& a,
@ -600,14 +590,6 @@ Vectorized<float> inline fmsub(
return Vectorized<float>(vnegq_f32(vfmsq_f32(c, a, b)));
}
template <>
Vectorized<float> inline fnmsub(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return Vectorized<float>(vnegq_f32(vfmaq_f32(c, a, b)));
}
inline Vectorized<float> Vectorized<float>::erf() const {
// constants
const Vectorized<float> neg_zero_vec(-0.f);

View File

@ -621,18 +621,6 @@ Vectorized<c10::Half> inline fmadd(
#endif
}
template <>
Vectorized<c10::Half> inline fnmadd(
const Vectorized<c10::Half>& a,
const Vectorized<c10::Half>& b,
const Vectorized<c10::Half>& c) {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
return Vectorized<c10::Half>(vfmsq_f16(c, a, b));
#else
return -a * b + c;
#endif
}
template <>
Vectorized<c10::Half> inline fmsub(
const Vectorized<c10::Half>& a,
@ -644,18 +632,6 @@ Vectorized<c10::Half> inline fmsub(
return a * b - c;
#endif
}
template <>
Vectorized<c10::Half> inline fnmsub(
const Vectorized<c10::Half>& a,
const Vectorized<c10::Half>& b,
const Vectorized<c10::Half>& c) {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
return Vectorized<c10::Half>(vnegq_f16(vfmaq_f16(c, a, b)));
#else
return -a * b - c;
#endif
}
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
} // namespace CPU_CAPABILITY

View File

@ -1 +1,396 @@
#include <torch/headeronly/cpu/vec/vec256/missing_vld1_neon.h>
/* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7. */
__extension__ extern __inline uint8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u8_x2(const uint8_t* __a) {
uint8x8x2_t ret;
asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s8_x2(const int8_t* __a) {
int8x8x2_t ret;
asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u16_x2(const uint16_t* __a) {
uint16x4x2_t ret;
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s16_x2(const int16_t* __a) {
int16x4x2_t ret;
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u32_x2(const uint32_t* __a) {
uint32x2x2_t ret;
asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s32_x2(const int32_t* __a) {
int32x2x2_t ret;
asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_u64_x2(const uint64_t* __a) {
uint64x1x2_t ret;
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_s64_x2(const int64_t* __a) {
int64x1x2_t ret;
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f16_x2(const float16_t* __a) {
float16x4x2_t ret;
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float32x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f32_x2(const float32_t* __a) {
float32x2x2_t ret;
asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_f64_x2(const float64_t* __a) {
float64x1x2_t ret;
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly8x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p8_x2(const poly8_t* __a) {
poly8x8x2_t ret;
asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly16x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p16_x2(const poly16_t* __a) {
poly16x4x2_t ret;
asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly64x1x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1_p64_x2(const poly64_t* __a) {
poly64x1x2_t ret;
asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u8_x2(const uint8_t* __a) {
uint8x16x2_t ret;
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s8_x2(const int8_t* __a) {
int8x16x2_t ret;
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u16_x2(const uint16_t* __a) {
uint16x8x2_t ret;
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s16_x2(const int16_t* __a) {
int16x8x2_t ret;
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u32_x2(const uint32_t* __a) {
uint32x4x2_t ret;
asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s32_x2(const int32_t* __a) {
int32x4x2_t ret;
asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline uint64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u64_x2(const uint64_t* __a) {
uint64x2x2_t ret;
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline int64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_s64_x2(const int64_t* __a) {
int64x2x2_t ret;
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f16_x2(const float16_t* __a) {
float16x8x2_t ret;
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float32x4x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f32_x2(const float32_t* __a) {
float32x4x2_t ret;
asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline float64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_f64_x2(const float64_t* __a) {
float64x2x2_t ret;
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly8x16x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p8_x2(const poly8_t* __a) {
poly8x16x2_t ret;
asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly16x8x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p16_x2(const poly16_t* __a) {
poly16x8x2_t ret;
asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
__extension__ extern __inline poly64x2x2_t
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vld1q_p64_x2(const poly64_t* __a) {
poly64x2x2_t ret;
asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a));
return ret;
}
/* vst1x2 */
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s64_x2(int64_t* __a, int64x1x2_t val) {
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u64_x2(uint64_t* __a, uint64x1x2_t val) {
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f64_x2(float64_t* __a, float64x1x2_t val) {
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s8_x2(int8_t* __a, int8x8x2_t val) {
asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p8_x2(poly8_t* __a, poly8x8x2_t val) {
asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s16_x2(int16_t* __a, int16x4x2_t val) {
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p16_x2(poly16_t* __a, poly16x4x2_t val) {
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_s32_x2(int32_t* __a, int32x2x2_t val) {
asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u8_x2(uint8_t* __a, uint8x8x2_t val) {
asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u16_x2(uint16_t* __a, uint16x4x2_t val) {
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_u32_x2(uint32_t* __a, uint32x2x2_t val) {
asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f16_x2(float16_t* __a, float16x4x2_t val) {
asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_f32_x2(float32_t* __a, float32x2x2_t val) {
asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1_p64_x2(poly64_t* __a, poly64x1x2_t val) {
asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s8_x2(int8_t* __a, int8x16x2_t val) {
asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p8_x2(poly8_t* __a, poly8x16x2_t val) {
asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s16_x2(int16_t* __a, int16x8x2_t val) {
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p16_x2(poly16_t* __a, poly16x8x2_t val) {
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s32_x2(int32_t* __a, int32x4x2_t val) {
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s64_x2(int64_t* __a, int64x2x2_t val) {
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u8_x2(uint8_t* __a, uint8x16x2_t val) {
asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u16_x2(uint16_t* __a, uint16x8x2_t val) {
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u32_x2(uint32_t* __a, uint32x4x2_t val) {
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u64_x2(uint64_t* __a, uint64x2x2_t val) {
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f16_x2(float16_t* __a, float16x8x2_t val) {
asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t* __a, float32x4x2_t val) {
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f64_x2(float64_t* __a, float64x2x2_t val) {
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_p64_x2(poly64_t* __a, poly64x2x2_t val) {
asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val));
}
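
The restored header above backfills the `vld1*_x2`/`vst1*_x2` pair load/store intrinsics that gcc-7 lacks, as inline-asm shims with the standard signatures (a companion header below does the same for gcc-8's missing `vst1q_f32_x2`). A hedged sketch of how callers use the paired forms; `scale8` is a hypothetical helper and compiles only for AArch64:

```cpp
#include <arm_neon.h>

// Scale eight consecutive floats: one ld1 fills two q-registers,
// the work is done per register, one st1 writes both back.
void scale8(float* data, float s) {
  float32x4x2_t v = vld1q_f32_x2(data);  // load data[0..7] (shimmed on gcc-7)
  v.val[0] = vmulq_n_f32(v.val[0], s);   // first 4 lanes
  v.val[1] = vmulq_n_f32(v.val[1], s);   // last 4 lanes
  vst1q_f32_x2(data, v);                 // store data[0..7]
}
```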

View File

@ -1 +1,7 @@
#include <torch/headeronly/cpu/vec/vec256/missing_vst1_neon.h>
/* Workaround for missing vst1q_f32_x2 in gcc-8. */
__extension__ extern __inline void
__attribute__((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2(float32_t* __a, float32x4x2_t val) {
asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val));
}

View File

@ -34,9 +34,7 @@ class Vectorized<c10::complex<double>> {
static constexpr size_type size() {
return 2;
}
Vectorized() {
values = _mm256_setzero_pd();
}
Vectorized() {}
Vectorized(__m256d v) : values(v) {}
Vectorized(c10::complex<double> val) {
double real_value = val.real();

View File

@ -33,9 +33,7 @@ class Vectorized<c10::complex<float>> {
static constexpr size_type size() {
return 4;
}
Vectorized() {
values = _mm256_setzero_ps();
}
Vectorized() {}
Vectorized(__m256 v) : values(v) {}
Vectorized(c10::complex<float> val) {
float real_value = val.real();

View File

@ -31,9 +31,7 @@ class Vectorized<double> {
static constexpr size_type size() {
return 4;
}
Vectorized() {
values = _mm256_setzero_pd();
}
Vectorized() {}
Vectorized(__m256d v) : values(v) {}
Vectorized(double val) {
values = _mm256_set1_pd(val);
@ -495,14 +493,6 @@ Vectorized<double> inline fmadd(
return _mm256_fmadd_pd(a, b, c);
}
template <>
Vectorized<double> inline fnmadd(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return _mm256_fnmadd_pd(a, b, c);
}
template <>
Vectorized<double> inline fmsub(
const Vectorized<double>& a,
@ -510,14 +500,6 @@ Vectorized<double> inline fmsub(
const Vectorized<double>& c) {
return _mm256_fmsub_pd(a, b, c);
}
template <>
Vectorized<double> inline fnmsub(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return _mm256_fnmsub_pd(a, b, c);
}
#endif
#endif

View File

@ -30,9 +30,7 @@ class Vectorized<float> {
static constexpr size_type size() {
return 8;
}
Vectorized() {
values = _mm256_setzero_ps();
}
Vectorized() {}
Vectorized(__m256 v) : values(v) {}
Vectorized(float val) {
values = _mm256_set1_ps(val);
@ -696,14 +694,6 @@ Vectorized<float> inline fmadd(
return _mm256_fmadd_ps(a, b, c);
}
template <>
Vectorized<float> inline fnmadd(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return _mm256_fnmadd_ps(a, b, c);
}
template <>
Vectorized<float> inline fmsub(
const Vectorized<float>& a,
@ -712,14 +702,6 @@ Vectorized<float> inline fmsub(
return _mm256_fmsub_ps(a, b, c);
}
template <>
Vectorized<float> inline fnmsub(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return _mm256_fnmsub_ps(a, b, c);
}
// TODO: rewrite with ATEN vectorized (need to add unpack and shuffle)
// Used by Inductor CPP codegen for micro gemm
inline void transpose_block(at::vec::VectorizedN<float, 8>& input) {

View File

@ -23,9 +23,7 @@ struct Vectorizedi {
}
public:
Vectorizedi() {
values = _mm256_setzero_si256();
}
Vectorizedi() {}
Vectorizedi(__m256i v) : values(v) {}
operator __m256i() const {
return values;
@ -55,9 +53,7 @@ class Vectorized<int64_t> : public Vectorizedi {
return 4;
}
using Vectorizedi::Vectorizedi;
Vectorized() {
values = _mm256_setzero_si256();
}
Vectorized() {}
Vectorized(int64_t v) {
values = _mm256_set1_epi64x(v);
}

View File

@ -54,9 +54,7 @@ struct Vectorizedqi {
#endif
public:
Vectorizedqi() {
vals = _mm256_setzero_si256();
}
Vectorizedqi() {}
Vectorizedqi(__m256i v) : vals(v) {}
operator __m256i() const {
return vals;

View File

@ -192,9 +192,7 @@ class Vectorized16 {
static constexpr size_type size() {
return 32;
}
Vectorized16() {
values = _mm512_setzero_si512();
}
Vectorized16() {}
Vectorized16(__m512i v) : values(v) {}
Vectorized16(T val) {
value_type uw = val.x;

View File

@ -34,9 +34,7 @@ class Vectorized<c10::complex<double>> {
static constexpr size_type size() {
return 4;
}
Vectorized() {
values = _mm512_setzero_pd();
}
Vectorized() {}
Vectorized(__m512d v) : values(v) {}
Vectorized(c10::complex<double> val) {
double real_value = val.real();

View File

@ -34,9 +34,7 @@ class Vectorized<c10::complex<float>> {
static constexpr size_type size() {
return 8;
}
Vectorized() {
values = _mm512_setzero_ps();
}
Vectorized() {}
Vectorized(__m512 v) : values(v) {}
Vectorized(c10::complex<float> val) {
float real_value = val.real();

View File

@ -34,9 +34,7 @@ class Vectorized<double> {
static constexpr size_type size() {
return 8;
}
Vectorized() {
values = _mm512_setzero_pd();
}
Vectorized() {}
Vectorized(__m512d v) : values(v) {}
Vectorized(double val) {
values = _mm512_set1_pd(val);
@ -536,14 +534,6 @@ Vectorized<double> inline fmadd(
return _mm512_fmadd_pd(a, b, c);
}
template <>
Vectorized<double> inline fnmadd(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return _mm512_fnmadd_pd(a, b, c);
}
template <>
Vectorized<double> inline fmsub(
const Vectorized<double>& a,
@ -552,14 +542,6 @@ Vectorized<double> inline fmsub(
return _mm512_fmsub_pd(a, b, c);
}
template <>
Vectorized<double> inline fnmsub(
const Vectorized<double>& a,
const Vectorized<double>& b,
const Vectorized<double>& c) {
return _mm512_fnmsub_pd(a, b, c);
}
#endif
} // namespace CPU_CAPABILITY

View File

@ -32,9 +32,7 @@ class Vectorized<float> {
static constexpr size_type size() {
return 16;
}
Vectorized() {
values = _mm512_setzero_ps();
}
Vectorized() {}
Vectorized(__m512 v) : values(v) {}
Vectorized(float val) {
values = _mm512_set1_ps(val);
@ -749,14 +747,6 @@ Vectorized<float> inline fmadd(
return _mm512_fmadd_ps(a, b, c);
}
template <>
Vectorized<float> inline fnmadd(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return _mm512_fnmadd_ps(a, b, c);
}
template <>
Vectorized<float> inline fmsub(
const Vectorized<float>& a,
@ -765,14 +755,6 @@ Vectorized<float> inline fmsub(
return _mm512_fmsub_ps(a, b, c);
}
template <>
Vectorized<float> inline fnmsub(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
return _mm512_fnmsub_ps(a, b, c);
}
// TODO: rewrite with ATEN vectorized (need to add unpack and shuffle)
// Used by Inductor CPP codegen for micro gemm
// Code referred to FBGEMM:

View File

@ -53,9 +53,7 @@ class Vectorized<int64_t> : public Vectorizedi {
return 8;
}
using Vectorizedi::Vectorizedi;
Vectorized() {
values = _mm512_setzero_si512();
}
Vectorized() {}
Vectorized(int64_t v) {
values = _mm512_set1_epi64(v);
}

View File

@ -55,9 +55,7 @@ struct Vectorizedqi {
#endif
public:
Vectorizedqi() {
vals = _mm512_setzero_si512();
}
Vectorizedqi() {}
Vectorizedqi(__m512i v) : vals(v) {}
operator __m512i() const {
return vals;

View File

@ -1247,16 +1247,6 @@ inline Vectorized<T> fmadd(
VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmadd)
template <typename T>
inline Vectorized<T> fnmadd(
const Vectorized<T>& a,
const Vectorized<T>& b,
const Vectorized<T>& c) {
return -(a * b) + c;
}
VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fnmadd)
template <typename T>
inline Vectorized<T> fmsub(
const Vectorized<T>& a,
@ -1267,16 +1257,6 @@ inline Vectorized<T> fmsub(
VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmsub)
template <typename T>
inline Vectorized<T> fnmsub(
const Vectorized<T>& a,
const Vectorized<T>& b,
const Vectorized<T>& c) {
return -(a * b) - c;
}
VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fnmsub)
template <typename T>
Vectorized<T> inline operator&&(
const Vectorized<T>& a,

View File

@ -3,12 +3,50 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <c10/util/Exception.h>
#include <torch/headeronly/cpu/vec/vec_half.h>
namespace at::vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
!defined(__APPLE__)
static inline uint16_t float2half_scalar(float val) {
#if defined(CPU_CAPABILITY_AVX2)
#if defined(_MSC_VER)
__m256 v = _mm256_set1_ps(val);
__m128i o =
_mm256_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
return static_cast<std::uint16_t>(_mm_cvtsi128_si32(o));
#else
return _cvtss_sh(val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#endif
#elif defined(CPU_CAPABILITY_AVX512)
__m512 v = _mm512_set1_ps(val);
__m256i o =
_mm512_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
return static_cast<std::uint16_t>(
_mm_cvtsi128_si32(_mm256_castsi256_si128(o)));
#endif
}
static inline float half2float_scalar(uint16_t val) {
#if defined(CPU_CAPABILITY_AVX2)
#if defined(_MSC_VER)
__m128i v = _mm_cvtsi32_si128(val);
__m256 o = _mm256_cvtph_ps(v);
return _mm256_cvtss_f32(o);
#else
return _cvtsh_ss(val);
#endif
#elif defined(CPU_CAPABILITY_AVX512)
__m256i v =
_mm256_setr_epi16(val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 o = _mm512_cvtph_ps(v);
return _mm512_cvtss_f32(o);
#endif
}
#endif
// Transpose a [2, 32] matrix to [32, 2]
// Note: the output leading dimension should be 2,
// that is, the output must be contiguous
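
The hunk above moves scalar fp32↔fp16 converters into this header, using F16C's `_cvtss_sh`/`_cvtsh_ss` on AVX2 (with an `_mm256_cvtps_ph` detour for MSVC) and the AVX-512 equivalents otherwise. A hedged round-trip sketch on the GCC/Clang F16C path; build with `-mavx2 -mf16c`:

```cpp
#include <immintrin.h>
#include <cstdint>
#include <iostream>

int main() {
  float x = 0.3333f;
  // Round-to-nearest, exceptions suppressed: the same flags as the hunk.
  uint16_t h = _cvtss_sh(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  float back = _cvtsh_ss(h);  // half keeps roughly 3 significant decimal digits
  std::cout << "half bits=0x" << std::hex << h << std::dec
            << " back=" << back << "\n";
}
```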

View File

@ -1843,69 +1843,6 @@ template bool gemm_and_bias(
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation);
int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fast_accum) {
switch (scaling_type) {
case ScalingType::BlockWise1x32:
TORCH_CHECK(scale_dtype == kFloat8_e8m0fnu);
#if CUDA_VERSION >= 12080
return CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0;
#else
TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales of 1x32 blocks is only supported for CUDA 12.8 and above");
#endif // if CUDA_VERSION >= 12080
case ScalingType::BlockWise1x16:
TORCH_CHECK(scale_dtype == kFloat8_e4m3fn);
#if CUDA_VERSION >= 12080
return CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3;
#else
TORCH_CHECK(false, "scaled_gemm with `torch.float8_e4m3fn` scales of 1x16 blocks is only supported for CUDA 12.8 and above");
#endif // if CUDA_VERSION >= 12080
case ScalingType::RowWise:
TORCH_CHECK(scale_dtype == kFloat);
#if CUDA_VERSION >= 12090 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC))
return CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F;
#elif defined(USE_ROCM) && defined(HIPBLASLT_VEC_EXT)
// Return the default, since in old hipblaslt this is activated via
// the SCALE_POINTER_VEC_EXT attribute.
return 0;
#else
TORCH_CHECK(false, "scaled_gemm with rowwise scaling is only supported for CUDA 12.9 and above");
#endif // if CUDA_VERSION >= 12090
case ScalingType::BlockWise1x128:
TORCH_CHECK(scale_dtype == kFloat);
TORCH_CHECK(!use_fast_accum, "scaled_gemm doesn't support fast accum with 1x128 blockwise scaling")
#if CUDA_VERSION >= 12090
return CUBLASLT_MATMUL_MATRIX_SCALE_VEC128_32F;
#else
TORCH_CHECK(false, "scaled_gemm with 1x128 blockwise scaling is only supported for CUDA 12.9 and above");
#endif // if CUDA_VERSION >= 12090
case ScalingType::BlockWise128x128:
TORCH_CHECK(scale_dtype == kFloat);
TORCH_CHECK(!use_fast_accum, "scaled_gemm doesn't support fast accum with 128x128 blockwise scaling")
#if CUDA_VERSION >= 12090
return CUBLASLT_MATMUL_MATRIX_SCALE_BLK128x128_32F;
#else
TORCH_CHECK(false, "scaled_gemm with 128x128 blockwise scaling is only supported for CUDA 12.9 and above");
#endif // if CUDA_VERSION >= 12090
case ScalingType::TensorWise:
TORCH_CHECK(scale_dtype == kFloat);
#if CUDA_VERSION >= 12080
return CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F;
#else
// The macro isn't defined, thus we inline its value.
return 0;
#endif // if CUDA_VERSION >= 12080
default:
TORCH_CHECK(false);
return -1;
}
}
void scaled_gemm(
char transa,
char transb,
@ -1917,20 +1854,19 @@ void scaled_gemm(
int64_t mat1_ld,
ScalarType mat1_dtype,
ScalarType mat1_scale_dtype,
ScalingType mat1_scaling_type,
const void* mat2_ptr,
const void* mat2_scale_ptr,
int64_t mat2_ld,
ScalarType mat2_dtype,
ScalarType mat2_scale_dtype,
ScalingType mat2_scaling_type,
const void* bias_ptr,
ScalarType bias_dtype,
void* result_ptr,
const void *result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
bool use_fast_accum) {
bool use_fast_accum,
bool use_rowwise) {
// Note: see `cublasCommonArgs` for various non-intuitive manipulations
// of input arguments to this function.
#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
@ -1943,15 +1879,19 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER;
cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER;
// hipblaslt supported row-wise scaling before cublas, and did so in its own way (via
// the SCALE_POINTERs), but then migrated to match how cublas does it (via
// the SCALE_MODEs). Here we check for that early custom mode.
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
if (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise) {
#if defined(USE_ROCM)
#if defined(HIPBLASLT_OUTER_VEC)
// this case is handled later as hipified CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F
#elif defined(HIPBLASLT_VEC_EXT)
if (use_rowwise) {
matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
}
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
#else
// rowwise isn't supported using older hipblaslt
TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with older hipblaslt");
#endif
#endif // defined(USE_ROCM)
computeDesc.setAttribute(matmulDescA, mat1_scale_ptr);
computeDesc.setAttribute(matmulDescB, mat2_scale_ptr);
if (result_scale_ptr != nullptr) {
@ -1991,14 +1931,30 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
}
// The SCALE_MODE attrs only exist in cuBLAS 12.8+ or in recent hipblaslt,
// but we must invoke get_scale_mode anyway to trigger the version checks.
[[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum);
[[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum);
#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC))
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode);
#endif
if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
#if CUDA_VERSION >= 12080
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
#else
TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 and above");
#endif // if CUDA_VERSION >= 12080
} else if (mat1_scale_dtype == kFloat8_e4m3fn && mat2_scale_dtype == kFloat8_e4m3fn) {
#if CUDA_VERSION >= 12080
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3);
#else
TORCH_CHECK(false, "scaled_gemm with `torch.float8_e4m3fn` scales is only supported for CUDA 12.8 and above");
#endif // if CUDA_VERSION >= 12080
} else if (mat1_scale_dtype == kFloat && mat2_scale_dtype == kFloat && use_rowwise) {
#if CUDA_VERSION >= 12090 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC))
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F);
#elif defined(USE_ROCM) && defined(HIPBLASLT_VEC_EXT)
// no-op here for older hipblaslt ext enums, to avoid TORCH_CHECK below
#else
TORCH_CHECK(false, "scaled_gemm with `torch.float` outer vector scaling is only supported for CUDA 12.9 and above");
#endif // if CUDA_VERSION >= 12090
}
CuBlasLtMatmulPreference preference;
auto ltworkspace = CublasLtWorkspace();
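As a hedged aside on the `torch.float8_e8m0fnu` branch above: per the OCP Microscaling (MX) spec, an e8m0 scale byte is exponent-only (bias 127, no sign or mantissa bits), i.e. always a power of two, with 0xFF encoding NaN. A tiny decoder for illustration:
```
// Decode an e8m0 scale byte to float, under the OCP MX reading stated above.
#include <cmath>
#include <cstdint>
#include <limits>

float decode_e8m0(std::uint8_t s) {
  if (s == 0xFF) return std::numeric_limits<float>::quiet_NaN();  // NaN encoding
  return std::ldexp(1.0f, static_cast<int>(s) - 127);  // 2^(s - 127)
}
```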

View File

@ -136,15 +136,6 @@ void int8_gemm(
int32_t* result_ptr,
int64_t result_ld);
enum class ScalingType : std::uint8_t {
TensorWise, // fp32 scales
RowWise, // fp32 scales
BlockWise1x16, // fp8_e4m3fn scales
BlockWise1x32, // fp8_e8m0fnu scales
BlockWise1x128, // fp32 scales
BlockWise128x128, // fp32 scales
};
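A hedged aside, not part of the header: the dtype comments above imply the following scale-tensor sizes for an M x K operand, assuming the 1xN blocks tile the inner (K) dimension.
```
// Hypothetical helper: scale-element counts per scheme for an M x K operand.
#include <cstdint>

enum class ScalingType : std::uint8_t {
  TensorWise, RowWise, BlockWise1x16, BlockWise1x32,
  BlockWise1x128, BlockWise128x128,
};

std::int64_t num_scales(std::int64_t M, std::int64_t K, ScalingType t) {
  auto cdiv = [](std::int64_t a, std::int64_t b) { return (a + b - 1) / b; };
  switch (t) {
    case ScalingType::TensorWise:       return 1;                // one fp32 scalar
    case ScalingType::RowWise:          return M;                // one fp32 per row
    case ScalingType::BlockWise1x16:    return M * cdiv(K, 16);  // e4m3fn per block
    case ScalingType::BlockWise1x32:    return M * cdiv(K, 32);  // e8m0fnu per block
    case ScalingType::BlockWise1x128:   return M * cdiv(K, 128); // fp32 per block
    case ScalingType::BlockWise128x128: return cdiv(M, 128) * cdiv(K, 128); // fp32 per tile
  }
  return -1;
}
```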
void scaled_gemm(
char transa,
char transb,
@ -156,20 +147,19 @@ void scaled_gemm(
int64_t mat1_ld,
ScalarType mat1_dtype,
ScalarType mat1_scale_dtype,
ScalingType mat1_scaling_type,
const void* mat2_ptr,
const void* mat2_scale_ptr,
int64_t mat2_ld,
ScalarType mat2_dtype,
ScalarType mat2_scale_dtype,
ScalingType mat2_scaling_type,
const void* bias_ptr,
ScalarType bias_dtype,
void* result_ptr,
const void* result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
bool use_fast_accum);
bool use_fast_accum,
bool use_rowwise);
#define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype)

View File

@ -2,7 +2,6 @@
#include <ATen/cuda/CUDAGraph.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/Functions.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAFunctions.h>
#include <cstddef>

View File

@ -2,6 +2,7 @@
#include <ATen/Tensor.h>
#include <c10/core/Device.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/flat_hash_map.h>

View File

@ -258,7 +258,7 @@ DECLARE_HOST_ALLOCATOR(
CUDACachingHostAllocator,
CUDACachingHostAllocatorImpl,
raw_local_deleter,
caching_host_allocator)
caching_host_allocator);
REGISTER_HOST_ALLOCATOR(at::kCUDA, &caching_host_allocator)

View File

@ -29,8 +29,6 @@
namespace at::cuda::tunable {
using at::cuda::blas::ScalingType;
enum class BlasOp {
N = 0,
T = 1
@ -600,8 +598,7 @@ struct ScaledGemmParams : OpParams {
//
// In TunableOp, we must distinguish these two cases in the param signature: with and without a bias vector.
return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld_rw_%d_bias_%s",
transa, transb, m, n, k, lda, ldb, ldc,
a_scaling_type == ScalingType::RowWise && b_scaling_type == ScalingType::RowWise,
transa, transb, m, n, k, lda, ldb, ldc, use_rowwise,
bias_ptr == nullptr ? "None" : at::toString(bias_dtype));
}
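To make the format string concrete, here is an illustrative key such a signature would produce for a hypothetical row-wise-scaled, bias-free GEMM (all sizes made up):
```
// Reproduces the Signature() format string standalone; values are invented.
#include <fmt/printf.h>
#include <iostream>
#include <string>

int main() {
  std::string sig = fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld_rw_%d_bias_%s",
      'N', 'T', 64L, 32L, 128L, 64L, 128L, 64L, 1, "None");
  std::cout << sig << '\n';  // NT_64_32_128_ld_64_128_64_rw_1_bias_None
}
```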
@ -676,13 +673,11 @@ struct ScaledGemmParams : OpParams {
int64_t lda{};
ScalarType a_dtype{};
ScalarType a_scale_dtype{};
ScalingType a_scaling_type{};
const void* b{};
const void* b_scale_ptr{};
int64_t ldb{};
ScalarType b_dtype{};
ScalarType b_scale_dtype{};
ScalingType b_scaling_type{};
const void* bias_ptr{};
ScalarType bias_dtype{};
void* c{};
@ -691,6 +686,7 @@ struct ScaledGemmParams : OpParams {
ScalarType c_dtype{};
void* amax_ptr{};
bool use_fast_accum{};
bool use_rowwise{};
private:
bool duplicate_inputs_{false};
};

View File

@ -206,43 +206,23 @@ float GetBetaFromParams(const ScaledGemmParams<T>* params) {
}
template <typename T>
ScalingType GetAScalingTypeFromParams(const GemmParams<T>* params) {
return ScalingType::TensorWise;
bool GetUseRowwiseFromParams(const GemmParams<T>* params) {
return false;
}
template <typename T>
ScalingType GetBScalingTypeFromParams(const GemmParams<T>* params) {
return ScalingType::TensorWise;
bool GetUseRowwiseFromParams(const GemmAndBiasParams<T>* params) {
return false;
}
template <typename T>
ScalingType GetAScalingTypeFromParams(const GemmAndBiasParams<T>* params) {
return ScalingType::TensorWise;
bool GetUseRowwiseFromParams(const GemmStridedBatchedParams<T>* params) {
return false;
}
template <typename T>
ScalingType GetBScalingTypeFromParams(const GemmAndBiasParams<T>* params) {
return ScalingType::TensorWise;
}
template <typename T>
ScalingType GetAScalingTypeFromParams(const GemmStridedBatchedParams<T>* params) {
return ScalingType::TensorWise;
}
template <typename T>
ScalingType GetBScalingTypeFromParams(const GemmStridedBatchedParams<T>* params) {
return ScalingType::TensorWise;
}
template <typename T>
ScalingType GetAScalingTypeFromParams(const ScaledGemmParams<T>* params) {
return params->a_scaling_type;
}
template <typename T>
ScalingType GetBScalingTypeFromParams(const ScaledGemmParams<T>* params) {
return params->b_scaling_type;
bool GetUseRowwiseFromParams(const ScaledGemmParams<T>* params) {
return params->use_rowwise;
}
template <typename T>
@ -509,24 +489,23 @@ class HipblasltGemmOp : public Callable<ParamsT> {
const void* mat2_scale_ptr = GetBScalePointerFromParams<CT>(params);
const void* result_scale_ptr = GetDScalePointerFromParams<CT>(params);
if (mat1_scale_ptr && mat2_scale_ptr) {
hipblasLtMatmulDescAttributes_t a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER;
hipblasLtMatmulDescAttributes_t b_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER;
if (GetAScalingTypeFromParams<CT>(params) == ScalingType::RowWise) {
#if defined(HIPBLASLT_OUTER_VEC)
#ifdef HIPBLASLT_VEC_EXT
if (GetUseRowwiseFromParams<CT>(params)) {
matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT, mat1_scale_ptr);
matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT, mat2_scale_ptr);
}
else
#endif
{
matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
}
#ifdef HIPBLASLT_OUTER_VEC
if (GetUseRowwiseFromParams<CT>(params)) {
matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F);
#elif defined(HIPBLASLT_VEC_EXT)
a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
#endif
}
if (GetBScalingTypeFromParams<CT>(params) == ScalingType::RowWise) {
#if defined(HIPBLASLT_OUTER_VEC)
matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F);
#elif defined(HIPBLASLT_VEC_EXT)
b_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
#endif
}
matmul.setAttribute(a_scale_ptr_desc, mat1_scale_ptr);
matmul.setAttribute(b_scale_ptr_desc, mat2_scale_ptr);
#endif
}
if (result_scale_ptr) {
matmul.setAttribute(HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);

View File

@ -96,20 +96,19 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
params->lda,
params->a_dtype,
params->a_scale_dtype,
params->a_scaling_type,
params->b,
params->b_scale_ptr,
params->ldb,
params->b_dtype,
params->b_scale_dtype,
params->b_scaling_type,
params->bias_ptr,
params->bias_dtype,
params->c,
params->c_scale_ptr,
params->ldc,
params->c_dtype,
params->use_fast_accum);
params->use_fast_accum,
params->use_rowwise);
return OK;
}
};

View File

@ -158,7 +158,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
OP_DECOMPOSE(kron);
OP_DECOMPOSE(l1_loss);
m.impl("layer_norm", native::layer_norm_symint);
m.impl("_fused_rms_norm", native::rms_norm_composite);
OP_DECOMPOSE2(ldexp, Tensor);
OP_DECOMPOSE2(less_equal, Tensor );
OP_DECOMPOSE2(less, Tensor );

View File

@ -1,6 +1,6 @@
#pragma once
#include <c10/core/Allocator.h>
#include <c10/core/CachingDeviceAllocator.h>
#include <c10/core/DeviceType.h>
// Use of c10::hip namespace here makes hipification easier, because
@ -10,10 +10,10 @@ namespace c10::hip {
// Takes a valid HIPAllocator (of any sort) and turns it into
// an allocator pretending to be a CUDA allocator. See
// Note [Masquerading as CUDA]
class HIPAllocatorMasqueradingAsCUDA final : public Allocator {
Allocator* allocator_;
class HIPAllocatorMasqueradingAsCUDA final : public DeviceAllocator {
DeviceAllocator* allocator_;
public:
explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator)
explicit HIPAllocatorMasqueradingAsCUDA(DeviceAllocator* allocator)
: allocator_(allocator) {}
DataPtr allocate(size_t size) override {
DataPtr r = allocator_->allocate(size);
@ -26,6 +26,24 @@ public:
void copy_data(void* dest, const void* src, std::size_t count) const final {
allocator_->copy_data(dest, src, count);
}
bool initialized() override {
return allocator_->initialized();
}
void emptyCache(MempoolId_t mempool_id = {0, 0}) {
allocator_->emptyCache(mempool_id);
}
void recordStream(const DataPtr& ptr, c10::Stream stream) {
allocator_->recordStream(ptr, stream);
}
CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) {
return allocator_->getDeviceStats(device);
}
void resetAccumulatedStats(c10::DeviceIndex device) {
allocator_->resetAccumulatedStats(device);
}
void resetPeakStats(c10::DeviceIndex device) {
allocator_->resetPeakStats(device);
}
};
} // namespace c10::hip

View File

@ -4,8 +4,9 @@
namespace c10 { namespace hip {
namespace HIPCachingAllocatorMasqueradingAsCUDA {
static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
Allocator* get() {
static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
return &allocator;
}
@ -13,5 +14,9 @@ void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsC
HIPCachingAllocator::recordStream(ptr, stream.hip_stream());
}
// Register this HIP allocator as CUDA allocator to enable access through both
// c10::GetAllocator(kCUDA) and c10::getDeviceAllocator(kCUDA) APIs
REGISTER_ALLOCATOR(kCUDA, &allocator)
} // namespace HIPCachingAllocatorMasqueradingAsCUDA
}} // namespace c10::hip
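A hedged usage sketch of what this registration enables: generic callers asking for the CUDA allocator transparently receive HIP-backed memory on ROCm builds, via either lookup API named in the comment above.
```
// Illustrative only; `demo` is a hypothetical caller, not PyTorch code.
#include <cstddef>
#include <c10/core/Allocator.h>
#include <c10/core/DeviceType.h>

void demo(std::size_t nbytes) {
  c10::Allocator* alloc = c10::GetAllocator(c10::kCUDA);  // HIP in disguise on ROCm
  c10::DataPtr p = alloc->allocate(nbytes);
  // p.get() is usable while p is alive; memory returns to the cache on destruction.
}
```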

View File

@ -36,10 +36,8 @@
#endif
#ifdef USE_FBGEMM
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
#include <fbgemm/Fbgemm.h>
#include <fbgemm/FbgemmConvert.h>
C10_DIAGNOSTIC_POP()
#endif
namespace {

View File

@ -14,10 +14,8 @@
#include <c10/util/Half.h>
#ifdef USE_FBGEMM
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
#include <fbgemm/Fbgemm.h>
#include <fbgemm/FbgemmConvert.h>
C10_DIAGNOSTIC_POP()
#else
#include <caffe2/perfkernels/embedding_lookup_idx.h>
#endif

View File

@ -25,11 +25,9 @@
#include <c10/util/irange.h>
#ifdef USE_FBGEMM
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi")
#include <fbgemm/Fbgemm.h>
#include <fbgemm/FbgemmFP16.h>
#include <fbgemm/QuantUtils.h>
C10_DIAGNOSTIC_POP()
#endif // USE_FBGEMM
namespace caffe2 {
@ -411,7 +409,7 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) {
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
const std::optional<Tensor>& bias) {
const Tensor& bias) {
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated "
"and will be removed in a future PyTorch release.")
@ -432,6 +430,7 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
TORCH_CHECK(input.size(input.dim() - 1) == packed_weight_fp16.numRows())
TORCH_CHECK(input.dim() >= 2);
TORCH_CHECK(bias.dim() == 1);
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
@ -450,12 +449,7 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
output.data_ptr<float>());
// Add bias term
c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias);
const Tensor& bias_ = *bias_maybe_owned;
if (bias_.defined()) {
TORCH_CHECK(bias_.dim() == 1);
output.add_(bias_);
}
output.add_(bias);
return output;
}
@ -557,7 +551,7 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) {
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
const std::optional<Tensor>& bias) {
const Tensor& bias) {
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated "
"and will be removed in a future PyTorch release.")

View File

@ -71,8 +71,6 @@
#include <ATen/ops/exp.h>
#include <ATen/ops/gather.h>
#include <ATen/ops/gradient_native.h>
#include <ATen/ops/hash_tensor.h>
#include <ATen/ops/hash_tensor_native.h>
#include <ATen/ops/imag.h>
#include <ATen/ops/isnan_native.h>
#include <ATen/ops/linalg_vector_norm.h>
@ -400,19 +398,6 @@ TORCH_META_FUNC(amin)
resize_reduction(*this, self, dim, keepdim, out_dtype);
}
TORCH_META_FUNC(hash_tensor)
(const Tensor& self, IntArrayRef dim, bool keepdim, int64_t mode) {
auto maybe_result = maybe_get_output();
if (maybe_result.defined()){
TORCH_CHECK(maybe_result.scalar_type() == at::kUInt64, "Expected result to be of dtype uint64, but got ", maybe_result.scalar_type());
}
if (self.sym_numel() == 0) {
native::zero_numel_check_dims(self, dim, "hash_tensor");
}
resize_reduction(*this, self, dim, keepdim, at::kUInt64);
}
} // namespace at::meta
namespace at::native {
@ -456,7 +441,6 @@ DEFINE_DISPATCH(argmin_stub);
DEFINE_DISPATCH(cumsum_stub);
DEFINE_DISPATCH(cumprod_stub);
DEFINE_DISPATCH(logcumsumexp_stub);
DEFINE_DISPATCH(xor_sum_stub);
Tensor _logcumsumexp_cpu(const Tensor& self, int64_t dim) {
Tensor result = at::empty_like(self, MemoryFormat::Contiguous);
@ -2249,24 +2233,6 @@ Tensor dist(const Tensor &self, const Tensor& other, const Scalar& p){
return at::norm(self - other, p);
}
enum class HashMode { XOR_SUM = 0 };
TORCH_IMPL_FUNC(hash_tensor_out) (const Tensor& self, IntArrayRef dim, bool keepdim, int64_t mode, const Tensor& result) {
auto iter = meta::make_reduction(self, result, dim, keepdim, self.scalar_type());
switch (static_cast<HashMode>(mode)) {
case HashMode::XOR_SUM:
if (iter.numel() == 0) {
result.fill_(0);
} else {
xor_sum_stub(iter.device_type(), iter);
}
return;
default:
TORCH_CHECK(false, "Unknown hash_tensor mode: ", mode);
}
}
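For readers wondering what the removed mode computed: below is a rough, simplified sketch of the XOR_SUM semantics under the author's reading, namely XOR-reducing the raw bit patterns of the elements. The real stub also handled dim/keepdim and other dtypes.
```
// Simplified full-tensor XOR_SUM over contiguous fp32 data; an assumption-
// laden sketch, not the removed kernel itself.
#include <cstdint>
#include <cstring>
#include <vector>

std::uint64_t xor_sum_hash(const std::vector<float>& data) {
  std::uint64_t acc = 0;
  for (float v : data) {
    std::uint32_t bits;
    std::memcpy(&bits, &v, sizeof(bits));  // bit-cast without UB
    acc ^= bits;
  }
  return acc;
}
```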
bool cpu_equal(const Tensor& self, const Tensor& other) {
if (!at::namedinference::are_names_equal(
self.unsafeGetTensorImpl(), other.unsafeGetTensorImpl())) {

View File

@ -27,7 +27,6 @@ DECLARE_DISPATCH(reduce_fn, min_values_stub)
DECLARE_DISPATCH(reduce_fn, max_values_stub)
DECLARE_DISPATCH(reduce_fn, argmax_stub)
DECLARE_DISPATCH(reduce_fn, argmin_stub)
DECLARE_DISPATCH(reduce_fn, xor_sum_stub)
using reduce_std_var_function =
void (*)(TensorIterator&, double correction, bool take_sqrt);

View File

@ -229,20 +229,17 @@ void replication_pad3d_backward_out_cpu_template(
int pbottom = paddingSize[3];
int pfront = paddingSize[4];
int pback = paddingSize[5];
int dimc = 0;
int dimw = 3;
int dimh = 2;
int dimd = 1;
if (input.dim() == 5) {
dimc++;
dimw++;
dimh++;
dimd++;
}
/* sizes */
int64_t ichannel = input.size(dimc);
int64_t idepth = input.size(dimd);
int64_t iheight = input.size(dimh);
int64_t iwidth = input.size(dimw);
@ -252,9 +249,6 @@ void replication_pad3d_backward_out_cpu_template(
at::native::padding::check_valid_input<3>(input, paddingSize);
TORCH_CHECK(ichannel == gradOutput.size(dimc),
"gradOutput channels unexpected. Expected: ", ichannel, ", Got: ",
gradOutput.size(dimc));
TORCH_CHECK(owidth == gradOutput.size(dimw),
"gradOutput width unexpected. Expected: ", owidth, ", Got: ",
gradOutput.size(dimw));

Some files were not shown because too many files have changed in this diff.