Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-01 04:54:55 +08:00)

Compare commits (1 commit): newtest-ba...predispatc
| Author | SHA1 | Date |
|---|---|---|
|  | cf1a2abf35 |  |
| @ -12,7 +12,7 @@ fi | ||||
| SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" | ||||
| source $SCRIPTPATH/../manywheel/set_desired_python.sh | ||||
|  | ||||
| pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2 | ||||
| pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1.4 patchelf==0.17.2 | ||||
|  | ||||
| for tool in python python3 pip pip3 ninja scons patchelf; do | ||||
|     ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin; | ||||
|  | ||||
| @ -36,104 +36,3 @@ See `build.sh` for valid build environments (it's the giant switch). | ||||
| # Set flags (see build.sh) and build image | ||||
| sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest' | ||||
| ``` | ||||
|  | ||||
| ## [Guidance] Adding a New Base Docker Image | ||||
|  | ||||
| ### Background | ||||
|  | ||||
| The base Docker images under `.ci/docker/` are built by the `docker-builds.yml` workflow and are used throughout the PyTorch CI/CD pipeline. Create or modify a base Docker image only if you need specific environment changes or dependencies in place before PyTorch is built on CI. | ||||
|  | ||||
| 1. **Automatic Rebuilding**: | ||||
|    - The Docker image building process is triggered automatically when changes are made to files in the `.ci/docker/*` directory | ||||
|    - This ensures all images stay up-to-date with the latest dependencies and configurations | ||||
|  | ||||
| 2. **Image Reuse in PyTorch Build Workflows** (example: linux-build): | ||||
|    - The images generated by `docker-builds.yml` are reused in `_linux-build.yml` through the `calculate-docker-image` step | ||||
|    - The `_linux-build.yml` workflow: | ||||
|      - Pulls the Docker image determined by the `calculate-docker-image` step | ||||
|      - Runs a Docker container with that image | ||||
|      - Executes `.ci/pytorch/build.sh` inside the container to build PyTorch (this pull-and-build flow is sketched after the list below) | ||||
|  | ||||
| 3. **Usage in Test Workflows** (example: linux-test): | ||||
|    - The same Docker images are also used in `_linux-test.yml` for running tests | ||||
|    - The `_linux-test.yml` workflow follows a similar pattern: | ||||
|      - It uses the `calculate-docker-image` step to determine which Docker image to use | ||||
|      - It pulls the Docker image and runs a container with that image | ||||
|      - It installs the wheels from the artifacts generated by PyTorch build jobs | ||||
|      - It executes test scripts (like `.ci/pytorch/test.sh` or `.ci/pytorch/multigpu-test.sh`) inside the container | ||||
|  | ||||
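| The pull-and-build flow described in the list above can be summarized as a short shell sketch. This is illustrative only: the image tag, workspace path, and container handling below are assumptions, not the exact steps of `_linux-build.yml`. | ||||
|  | ||||
| ```bash | ||||
| # Hedged sketch of what a build job does after the calculate-docker-image step. | ||||
| DOCKER_IMAGE="<tag resolved by calculate-docker-image>"   # placeholder, not a real tag | ||||
| docker pull "${DOCKER_IMAGE}" | ||||
|  | ||||
| # Start a container from the pre-built base image and copy the checkout into it. | ||||
| container_id=$(docker run --detach --tty -w /workspace "${DOCKER_IMAGE}") | ||||
| docker cp . "${container_id}:/workspace" | ||||
|  | ||||
| # Build PyTorch inside the container; test jobs instead install the built wheels | ||||
| # and run a test script such as .ci/pytorch/test.sh. | ||||
| docker exec -t "${container_id}" bash -c "cd /workspace && .ci/pytorch/build.sh" | ||||
| ``` | ||||
|  | ||||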
| ### Understanding File Purposes | ||||
|  | ||||
| #### `.ci/docker/build.sh` vs `.ci/pytorch/build.sh` | ||||
| - **`.ci/docker/build.sh`**: | ||||
|   - Used for building base Docker images | ||||
|   - Executed by the `docker-builds.yml` workflow to pre-build Docker images for CI | ||||
|   - Contains configurations for different Docker build environments | ||||
|  | ||||
| - **`.ci/pytorch/build.sh`**: | ||||
|   - Used for building PyTorch inside a Docker container | ||||
|   - Called by workflows like `_linux-build.yml` after the Docker container is started | ||||
|   - Builds PyTorch wheels and other artifacts | ||||
|  | ||||
| #### `.ci/docker/ci_commit_pins/` vs `.github/ci_commit_pins` | ||||
| - **`.ci/docker/ci_commit_pins/`**: | ||||
|   - Used for pinning dependency versions during base Docker image building | ||||
|   - Ensures consistent environments for building PyTorch | ||||
|   - Changes here trigger base Docker image rebuilds | ||||
|  | ||||
| - **`.github/ci_commit_pins`**: | ||||
|   - Used for pinning dependency versions during PyTorch building and tests | ||||
|   - Ensures consistent dependencies for PyTorch across different builds | ||||
|   - Used by build scripts running inside Docker containers | ||||
|  | ||||
| ### Step-by-Step Guide for Adding a New Base Docker Image | ||||
|  | ||||
| #### 1. Add Pinned Commits (If Applicable) | ||||
|  | ||||
| We use pinned commits for build stability. The `nightly.yml` workflow checks and updates pinned commits for certain repository dependencies daily. | ||||
|  | ||||
| If your new Docker image needs a library installed from a specific pinned commit or built from source: | ||||
|  | ||||
| 1. Add the repository you want to track in `nightly.yml` and `merge-rules.yml` | ||||
| 2. Add the initial pinned commit in `.ci/docker/ci_commit_pins/`. The text filename should match the one defined in step 1 | ||||
|  | ||||
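| For illustration, an install script baked into the base image can consume such a pin as sketched below. The file and repository names here are hypothetical; only the pattern of reading the pinned SHA and checking it out is the point. | ||||
|  | ||||
| ```bash | ||||
| # Hypothetical pin: .ci/docker/ci_commit_pins/mylib.txt holding a single commit SHA. | ||||
| MYLIB_COMMIT=$(cat ci_commit_pins/mylib.txt)   # assumes the Dockerfile copied the pin next to this script | ||||
| git clone https://github.com/example/mylib.git | ||||
| pushd mylib | ||||
| git checkout "${MYLIB_COMMIT}" | ||||
| python -m pip install --no-build-isolation . | ||||
| popd | ||||
| rm -rf mylib | ||||
| ``` | ||||
|  | ||||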
| #### 2. Configure the Base Docker Image | ||||
| 1. **Add new Base Docker image configuration** (if applicable): | ||||
|  | ||||
|    Add the configuration in `.ci/docker/build.sh`. For example: | ||||
|    ```bash | ||||
|    pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1) | ||||
|      CUDA_VERSION=12.8.1 | ||||
|      ANACONDA_PYTHON_VERSION=3.12 | ||||
|      GCC_VERSION=11 | ||||
|      VISION=yes | ||||
|      KATEX=yes | ||||
|      UCX_COMMIT=${_UCX_COMMIT} | ||||
|      UCC_COMMIT=${_UCC_COMMIT} | ||||
|      TRITON=yes | ||||
|      NEW_ARG_1=yes | ||||
|      ;; | ||||
|    ``` | ||||
|  | ||||
| 2. **Add build arguments to Docker build command**: | ||||
|  | ||||
|    If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`: | ||||
|    ```bash | ||||
|    docker build \ | ||||
|       .... | ||||
|       --build-arg "NEW_ARG_1=${NEW_ARG_1}" | ||||
|    ``` | ||||
|  | ||||
| 3. **Update Dockerfile logic**: | ||||
|  | ||||
|    Update the Dockerfile to use the new argument. For example, in `ubuntu/Dockerfile`: | ||||
|    ```dockerfile | ||||
|    ARG NEW_ARG_1 | ||||
|    # Set up environment for NEW_ARG_1 | ||||
|    RUN if [ -n "${NEW_ARG_1}" ]; then bash ./do_something.sh; fi | ||||
|    ``` | ||||
|  | ||||
| 4. **Add the Docker configuration** in `.github/workflows/docker-builds.yml`: | ||||
|  | ||||
|    The `docker-builds.yml` workflow pre-builds the Docker images whenever changes occur in the `.ci/docker/` directory. This includes the | ||||
|    pinned commit updates. | ||||
|  | ||||
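| Finally, you can sanity-check a new configuration locally with the same entry point the workflow uses, mirroring the build example near the top of this README. The image name below is the hypothetical `-new1` configuration added above; substitute your own tag. | ||||
|  | ||||
| ```bash | ||||
| # Hedged local check, run from the directory that contains .ci/docker/build.sh. | ||||
| sudo bash -c './build.sh pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1 -t myimage:latest' | ||||
| ``` | ||||
|  | ||||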
| @ -93,6 +93,7 @@ tag=$(echo $image | awk -F':' '{print $2}') | ||||
| case "$tag" in | ||||
|   pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11) | ||||
|     CUDA_VERSION=12.4 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=11 | ||||
|     VISION=yes | ||||
| @ -103,6 +104,7 @@ case "$tag" in | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11) | ||||
|     CUDA_VERSION=12.8.1 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=11 | ||||
|     VISION=yes | ||||
| @ -113,6 +115,7 @@ case "$tag" in | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.8.1 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
|     VISION=yes | ||||
| @ -124,6 +127,7 @@ case "$tag" in | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.8.1 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.12 | ||||
|     GCC_VERSION=9 | ||||
|     VISION=yes | ||||
| @ -135,6 +139,7 @@ case "$tag" in | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.8.1 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.13 | ||||
|     GCC_VERSION=9 | ||||
|     VISION=yes | ||||
| @ -146,6 +151,7 @@ case "$tag" in | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9) | ||||
|     CUDA_VERSION=12.6.3 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
|     VISION=yes | ||||
| @ -154,18 +160,9 @@ case "$tag" in | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm) | ||||
|     CUDA_VERSION=12.8.1 | ||||
|     ANACONDA_PYTHON_VERSION=3.12 | ||||
|     GCC_VERSION=11 | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.6 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
|     VISION=yes | ||||
| @ -177,6 +174,7 @@ case "$tag" in | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.6 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.12 | ||||
|     GCC_VERSION=9 | ||||
|     VISION=yes | ||||
| @ -188,6 +186,7 @@ case "$tag" in | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.6 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.13 | ||||
|     GCC_VERSION=9 | ||||
|     VISION=yes | ||||
| @ -199,6 +198,7 @@ case "$tag" in | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9) | ||||
|     CUDA_VERSION=12.8.1 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
|     VISION=yes | ||||
| @ -276,7 +276,7 @@ case "$tag" in | ||||
|     NINJA_VERSION=1.9.0 | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) | ||||
|     pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     GCC_VERSION=11 | ||||
|     VISION=yes | ||||
| @ -288,6 +288,7 @@ case "$tag" in | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     CUDA_VERSION=12.8.1 | ||||
|     CUDNN_VERSION=9 | ||||
|     CLANG_VERSION=12 | ||||
|     VISION=yes | ||||
|     TRITON=yes | ||||
| @ -366,6 +367,7 @@ case "$tag" in | ||||
|     fi | ||||
|     if [[ "$image" == *cuda* ]]; then | ||||
|       extract_version_from_image_name cuda CUDA_VERSION | ||||
|       extract_version_from_image_name cudnn CUDNN_VERSION | ||||
|     fi | ||||
|     if [[ "$image" == *rocm* ]]; then | ||||
|       extract_version_from_image_name rocm ROCM_VERSION | ||||
| @ -417,6 +419,9 @@ docker build \ | ||||
|        --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ | ||||
|        --build-arg "GCC_VERSION=${GCC_VERSION}" \ | ||||
|        --build-arg "CUDA_VERSION=${CUDA_VERSION}" \ | ||||
|        --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \ | ||||
|        --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \ | ||||
|        --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \ | ||||
|        --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ | ||||
|        --build-arg "KATEX=${KATEX:-}" \ | ||||
|        --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| f7888497a1eb9e98d4c07537f0d0bcfe180d1363 | ||||
| 11ec6354315768a85da41032535e3b7b99c5f706 | ||||
|  | ||||

.ci/docker/common/install_cudnn.sh (new file, 26 lines)
							| @ -0,0 +1,26 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| if [[ -n "${CUDNN_VERSION}" ]]; then | ||||
|     # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement | ||||
|     mkdir tmp_cudnn | ||||
|     pushd tmp_cudnn | ||||
|     if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then | ||||
|         CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" | ||||
|     elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then | ||||
|         CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" | ||||
|     elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then | ||||
|         CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" | ||||
|     elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then | ||||
|         CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive" | ||||
|     else | ||||
|         echo "Unsupported CUDA version ${CUDA_VERSION}" | ||||
|         exit 1 | ||||
|     fi | ||||
|     curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz | ||||
|     tar xf ${CUDNN_NAME}.tar.xz | ||||
|     cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/ | ||||
|     cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/ | ||||
|     popd | ||||
|     rm -rf tmp_cudnn | ||||
|     ldconfig | ||||
| fi | ||||
| @ -30,7 +30,7 @@ EOF | ||||
|  | ||||
|     # we want the patch version of 6.4 instead | ||||
|     if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then | ||||
|         ROCM_VERSION="${ROCM_VERSION}.2" | ||||
|         ROCM_VERSION="${ROCM_VERSION}.1" | ||||
|     fi | ||||
|  | ||||
|     # Default url values | ||||
| @ -85,19 +85,16 @@ EOF | ||||
|     # CI no longer builds for ROCm 6.3, but | ||||
|     # ROCm 6.4 did not yet fix the regression, also HIP branch names are different | ||||
|     if [[ $(ver $ROCM_VERSION) -ge $(ver 6.4) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then | ||||
|         if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.2) ]]; then | ||||
|             HIP_TAG=rocm-6.4.2 | ||||
|             CLR_HASH=74d78ba3ac4bac235d02bcb48511c30b5cfdd457  # branch release/rocm-rel-6.4.2-statco-hotfix | ||||
|         elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then | ||||
|             HIP_TAG=rocm-6.4.1 | ||||
|             CLR_HASH=efe6c35790b9206923bfeed1209902feff37f386  # branch release/rocm-rel-6.4.1-statco-hotfix | ||||
|         if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then | ||||
|             HIP_BRANCH=release/rocm-rel-6.4 | ||||
|             CLR_HASH=ca18eb3f77fa09292fcda62bc60c3e565d752ada  # branch release/rocm-rel-6.4.1-statco-hotfix | ||||
|         elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then | ||||
|             HIP_TAG=rocm-6.4.0 | ||||
|             HIP_BRANCH=release/rocm-rel-6.4 | ||||
|             CLR_HASH=600f5b0d2baed94d5121e2174a9de0851b040b0c  # branch release/rocm-rel-6.4-statco-hotfix | ||||
|         fi | ||||
|         # clr build needs CppHeaderParser but can only find it using conda's python | ||||
|         python -m pip install CppHeaderParser | ||||
|         git clone https://github.com/ROCm/HIP -b $HIP_TAG | ||||
|         git clone https://github.com/ROCm/HIP -b $HIP_BRANCH | ||||
|         HIP_COMMON_DIR=$(readlink -f HIP) | ||||
|         git clone https://github.com/jeffdaily/clr | ||||
|         pushd clr | ||||
|  | ||||
| @ -41,7 +41,7 @@ case ${DOCKER_TAG_PREFIX} in | ||||
|     rocm*) | ||||
|         # we want the patch version of 6.4 instead | ||||
|         if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then | ||||
|             GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" | ||||
|             GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1" | ||||
|         fi | ||||
|         BASE_TARGET=rocm | ||||
|         GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete | ||||
|  | ||||
| @ -128,7 +128,7 @@ ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH | ||||
| ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH | ||||
| # Install setuptools and wheel for python 3.12/3.13 | ||||
| RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \ | ||||
|     /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \ | ||||
|     /opt/python/${cpython_version}/bin/python -m pip install "setuptools>=77.0.0" "packaging>=24.2" wheel; \ | ||||
|     done; | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -124,10 +124,9 @@ RUN python3 -mpip install cmake==3.28.0 | ||||
| # install newest flatbuffers version first: | ||||
| # for some reason old version is getting pulled in otherwise. | ||||
| # packaging package is required for onnxruntime wheel build. | ||||
| RUN pip3 install flatbuffers && \ | ||||
|   pip3 install cython 'pkgconfig>=1.5.5' 'setuptools>=77' 'numpy<2.3.0' && \ | ||||
| RUN pip3 install 'setuptools>=77.0' 'packaging>=24.2' && \ | ||||
|   pip3 install flatbuffers cython 'pkgconfig>=1.5.5' 'numpy<2.3.0' && \ | ||||
|   pip3 install --no-build-isolation h5py==3.11.0 && \ | ||||
|   pip3 install packaging && \ | ||||
|   git clone https://github.com/microsoft/onnxruntime && \ | ||||
|   cd onnxruntime && git checkout v1.21.0 && \ | ||||
|   git submodule update --init --recursive && \ | ||||
|  | ||||
| @ -77,7 +77,7 @@ case ${image} in | ||||
|     manylinux2_28-builder:rocm*) | ||||
|         # we want the patch version of 6.4 instead | ||||
|         if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then | ||||
|             GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" | ||||
|             GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1" | ||||
|         fi | ||||
|         TARGET=rocm_final | ||||
|         MANY_LINUX_VERSION="2_28" | ||||
|  | ||||
| @ -104,10 +104,10 @@ networkx==2.8.8 | ||||
| #Pinned versions: 2.8.8 | ||||
| #test that import: functorch | ||||
|  | ||||
| ninja==1.11.1.3 | ||||
| ninja==1.11.1.4 | ||||
| #Description: build system. Used in some tests. Used in build to generate build | ||||
| #time tracing information | ||||
| #Pinned versions: 1.11.1.3 | ||||
| #Pinned versions: 1.11.1.4 | ||||
| #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py | ||||
|  | ||||
| numba==0.49.0 ; python_version < "3.9" | ||||
| @ -221,9 +221,9 @@ pygments==2.15.0 | ||||
| #Pinned versions: 2.12.0 | ||||
| #test that import: the doctests | ||||
|  | ||||
| #pyyaml | ||||
| #PyYAML | ||||
| #Description: data serialization format | ||||
| #Pinned versions: 6.0.2 | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
|  | ||||
| #requests | ||||
| @ -233,7 +233,7 @@ pygments==2.15.0 | ||||
|  | ||||
| #rich | ||||
| #Description: rich text and beautiful formatting in the terminal | ||||
| #Pinned versions: 14.1.0 | ||||
| #Pinned versions: 10.9.0 | ||||
| #test that import: | ||||
|  | ||||
| scikit-image==0.19.3 ; python_version < "3.10" | ||||
| @ -363,9 +363,10 @@ pwlf==2.2.1 | ||||
|  | ||||
|  | ||||
| # To build PyTorch itself | ||||
| packaging>=24.2 | ||||
| pyyaml | ||||
| pyzstd | ||||
| setuptools>=70.1.0 | ||||
| setuptools>=77.0.0 | ||||
| six | ||||
|  | ||||
| scons==4.5.2 ; platform_machine == "aarch64" | ||||
|  | ||||
| @ -1,11 +1,11 @@ | ||||
| sphinx==5.3.0 | ||||
| #Description: This is used to generate PyTorch docs | ||||
| #Pinned versions: 5.3.0 | ||||
| -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2 | ||||
| -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2 | ||||
|  | ||||
| # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering | ||||
| # but it doesn't seem to work and hangs around idly. The initial thought that it is probably | ||||
| # something related to Docker setup. We can investigate this later. | ||||
| # but it doesn't seem to work and hangs around idly. The initial thought it is probably | ||||
| # something related to Docker setup. We can investigate this later | ||||
|  | ||||
| sphinxcontrib.katex==0.8.6 | ||||
| #Description: This is used to generate PyTorch docs | ||||
| @ -50,8 +50,8 @@ IPython==8.12.0 | ||||
| #Pinned versions: 8.12.0 | ||||
|  | ||||
| myst-nb==0.17.2 | ||||
| #Description: This is used to generate PyTorch functorch and torch.compile docs. | ||||
| #Pinned versions: 0.17.2 | ||||
| #Description: This is used to generate PyTorch functorch docs | ||||
| #Pinned versions: 0.13.2 | ||||
|  | ||||
| # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs | ||||
| python-etcd==0.4.5 | ||||
| @ -59,3 +59,4 @@ sphinx-copybutton==0.5.0 | ||||
| sphinx-design==0.4.0 | ||||
| sphinxcontrib-mermaid==1.0.0 | ||||
| myst-parser==0.18.1 | ||||
| myst-nb | ||||
|  | ||||
| @ -194,7 +194,7 @@ ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library | ||||
| ROCBLAS_LIB_DST=lib/rocblas/library | ||||
| ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH) | ||||
| ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx) | ||||
| ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $ROCBLAS_OTHER_FILES) | ||||
| ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES) | ||||
|  | ||||
| # hipblaslt library files | ||||
| HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library | ||||
|  | ||||

.ci/pytorch/build-mobile.sh (new executable file, 34 lines)
							| @ -0,0 +1,34 @@ | ||||
| #!/usr/bin/env bash | ||||
| # DO NOT ADD 'set -x' not to reveal CircleCI secret context environment variables | ||||
| set -eu -o pipefail | ||||
|  | ||||
| # This script uses linux host toolchain + mobile build options in order to | ||||
| # build & test mobile libtorch without having to setup Android/iOS | ||||
| # toolchain/simulator. | ||||
|  | ||||
| # shellcheck source=./common.sh | ||||
| source "$(dirname "${BASH_SOURCE[0]}")/common.sh" | ||||
| # shellcheck source=./common-build.sh | ||||
| source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" | ||||
|  | ||||
| # Install torch & torchvision - used to download & trace test model. | ||||
| # Ideally we should use the libtorch built on the PR so that backward | ||||
| # incompatible changes won't break this script - but it will significantly slow | ||||
| # down mobile CI jobs. | ||||
| # Here we install nightly instead of stable so that we have an option to | ||||
| # temporarily skip mobile CI jobs on BC-breaking PRs until they are in nightly. | ||||
| retry pip install --pre torch torchvision \ | ||||
|   -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html \ | ||||
|   --progress-bar off | ||||
|  | ||||
| # Run end-to-end process of building mobile library, linking into the predictor | ||||
| # binary, and running forward pass with a real model. | ||||
| if [[ "$BUILD_ENVIRONMENT" == *-mobile-custom-build-static* ]]; then | ||||
|   TEST_CUSTOM_BUILD_STATIC=1 test/mobile/custom_build/build.sh | ||||
| elif [[ "$BUILD_ENVIRONMENT" == *-mobile-lightweight-dispatch* ]]; then | ||||
|   test/mobile/lightweight_dispatch/build.sh | ||||
| else | ||||
|   TEST_DEFAULT_BUILD=1 test/mobile/custom_build/build.sh | ||||
| fi | ||||
|  | ||||
| print_sccache_stats | ||||
| @ -11,6 +11,10 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" | ||||
| # shellcheck source=./common-build.sh | ||||
| source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" | ||||
|  | ||||
| if [[ "$BUILD_ENVIRONMENT" == *-mobile-*build* ]]; then | ||||
|   exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile.sh" "$@" | ||||
| fi | ||||
|  | ||||
| echo "Python version:" | ||||
| python --version | ||||
|  | ||||
| @ -120,8 +124,26 @@ if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then | ||||
| fi | ||||
|  | ||||
| # Use special scripts for Android builds | ||||
| if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then | ||||
|   export ANDROID_NDK=/opt/ndk | ||||
|   build_args=() | ||||
|   if [[ "${BUILD_ENVIRONMENT}" == *-arm-v7a* ]]; then | ||||
|     build_args+=("-DANDROID_ABI=armeabi-v7a") | ||||
|   elif [[ "${BUILD_ENVIRONMENT}" == *-arm-v8a* ]]; then | ||||
|     build_args+=("-DANDROID_ABI=arm64-v8a") | ||||
|   elif [[ "${BUILD_ENVIRONMENT}" == *-x86_32* ]]; then | ||||
|     build_args+=("-DANDROID_ABI=x86") | ||||
|   elif [[ "${BUILD_ENVIRONMENT}" == *-x86_64* ]]; then | ||||
|     build_args+=("-DANDROID_ABI=x86_64") | ||||
|   fi | ||||
|   if [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then | ||||
|     build_args+=("-DUSE_VULKAN=ON") | ||||
|   fi | ||||
|   build_args+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF") | ||||
|   exec ./scripts/build_android.sh "${build_args[@]}" "$@" | ||||
| fi | ||||
|  | ||||
| if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then | ||||
| if [[ "$BUILD_ENVIRONMENT" != *android* && "$BUILD_ENVIRONMENT" == *vulkan* ]]; then | ||||
|   export USE_VULKAN=1 | ||||
|   # shellcheck disable=SC1091 | ||||
|   source /var/lib/jenkins/vulkansdk/setup-env.sh | ||||
| @ -203,7 +225,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then | ||||
|     export USE_PRECOMPILED_HEADERS=1 | ||||
| fi | ||||
|  | ||||
| if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then | ||||
| if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then | ||||
|   export BUILD_STATIC_RUNTIME_BENCHMARK=ON | ||||
| fi | ||||
|  | ||||
| @ -247,6 +269,9 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then | ||||
|     tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" //... | ||||
|   fi | ||||
| else | ||||
|   # install build-system requirements before running setup.py commands | ||||
|   python -m pip install -r requirements-build.txt | ||||
|  | ||||
|   # check that setup.py would fail with bad arguments | ||||
|   echo "The next three invocations are expected to fail with invalid command error messages." | ||||
|   ( ! get_exit_code python setup.py bad_argument ) | ||||
|  | ||||
| @ -204,32 +204,8 @@ function install_torchrec_and_fbgemm() { | ||||
|     pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec | ||||
|     pip_uninstall fbgemm-gpu-nightly | ||||
|  | ||||
|     # Set ROCM_HOME isn't available, use ROCM_PATH if set or /opt/rocm | ||||
|     ROCM_HOME="${ROCM_HOME:-${ROCM_PATH:-/opt/rocm}}" | ||||
|  | ||||
|     # Find rocm_version.h header file for ROCm version extract | ||||
|     rocm_version_h="${ROCM_HOME}/include/rocm-core/rocm_version.h" | ||||
|     if [ ! -f "$rocm_version_h" ]; then | ||||
|         rocm_version_h="${ROCM_HOME}/include/rocm_version.h" | ||||
|     fi | ||||
|  | ||||
|     # Error out if rocm_version.h not found | ||||
|     if [ ! -f "$rocm_version_h" ]; then | ||||
|         echo "Error: rocm_version.h not found in expected locations." >&2 | ||||
|         exit 1 | ||||
|     fi | ||||
|  | ||||
|     # Extract major, minor and patch ROCm version numbers | ||||
|     MAJOR_VERSION=$(grep 'ROCM_VERSION_MAJOR' "$rocm_version_h" | awk '{print $3}') | ||||
|     MINOR_VERSION=$(grep 'ROCM_VERSION_MINOR' "$rocm_version_h" | awk '{print $3}') | ||||
|     PATCH_VERSION=$(grep 'ROCM_VERSION_PATCH' "$rocm_version_h" | awk '{print $3}') | ||||
|     ROCM_INT=$((MAJOR_VERSION * 10000 + MINOR_VERSION * 100 + PATCH_VERSION)) | ||||
|     echo "ROCm version: $ROCM_INT" | ||||
|     export BUILD_ROCM_VERSION="$MAJOR_VERSION.$MINOR_VERSION" | ||||
|  | ||||
|     pip_install tabulate  # needed for newer fbgemm | ||||
|     pip_install patchelf  # needed for rocm fbgemm | ||||
|     pushd /tmp | ||||
|  | ||||
|     local wheel_dir=dist/fbgemm_gpu | ||||
|     local found_whl=0 | ||||
| @ -247,7 +223,7 @@ function install_torchrec_and_fbgemm() { | ||||
|       pushd fbgemm/fbgemm_gpu | ||||
|       git checkout "${fbgemm_commit}" | ||||
|       python setup.py bdist_wheel \ | ||||
|         --build-variant=rocm \ | ||||
|         --package_variant=rocm \ | ||||
|         -DHIP_ROOT_DIR="${ROCM_PATH}" \ | ||||
|         -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \ | ||||
|         -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA" | ||||
| @ -264,7 +240,6 @@ function install_torchrec_and_fbgemm() { | ||||
|     done | ||||
|  | ||||
|     rm -rf fbgemm | ||||
|     popd | ||||
|   else | ||||
|     pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec | ||||
|     pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu | ||||
|  | ||||

.ci/pytorch/create_test_cert.py (new file, 123 lines)
							| @ -0,0 +1,123 @@ | ||||
| from datetime import datetime, timedelta, timezone | ||||
| from tempfile import mkdtemp | ||||
|  | ||||
| from cryptography import x509 | ||||
| from cryptography.hazmat.primitives import hashes, serialization | ||||
| from cryptography.hazmat.primitives.asymmetric import rsa | ||||
| from cryptography.x509.oid import NameOID | ||||
|  | ||||
|  | ||||
| temp_dir = mkdtemp() | ||||
| print(temp_dir) | ||||
|  | ||||
|  | ||||
| def genrsa(path): | ||||
|     key = rsa.generate_private_key( | ||||
|         public_exponent=65537, | ||||
|         key_size=2048, | ||||
|     ) | ||||
|     with open(path, "wb") as f: | ||||
|         f.write( | ||||
|             key.private_bytes( | ||||
|                 encoding=serialization.Encoding.PEM, | ||||
|                 format=serialization.PrivateFormat.TraditionalOpenSSL, | ||||
|                 encryption_algorithm=serialization.NoEncryption(), | ||||
|             ) | ||||
|         ) | ||||
|     return key | ||||
|  | ||||
|  | ||||
| def create_cert(path, C, ST, L, O, key): | ||||
|     subject = issuer = x509.Name( | ||||
|         [ | ||||
|             x509.NameAttribute(NameOID.COUNTRY_NAME, C), | ||||
|             x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST), | ||||
|             x509.NameAttribute(NameOID.LOCALITY_NAME, L), | ||||
|             x509.NameAttribute(NameOID.ORGANIZATION_NAME, O), | ||||
|         ] | ||||
|     ) | ||||
|     cert = ( | ||||
|         x509.CertificateBuilder() | ||||
|         .subject_name(subject) | ||||
|         .issuer_name(issuer) | ||||
|         .public_key(key.public_key()) | ||||
|         .serial_number(x509.random_serial_number()) | ||||
|         .not_valid_before(datetime.now(timezone.utc)) | ||||
|         .not_valid_after( | ||||
|             # Our certificate will be valid for 10 days | ||||
|             datetime.now(timezone.utc) + timedelta(days=10) | ||||
|         ) | ||||
|         .add_extension( | ||||
|             x509.BasicConstraints(ca=True, path_length=None), | ||||
|             critical=True, | ||||
|         ) | ||||
|         .sign(key, hashes.SHA256()) | ||||
|     ) | ||||
|     # Write our certificate out to disk. | ||||
|     with open(path, "wb") as f: | ||||
|         f.write(cert.public_bytes(serialization.Encoding.PEM)) | ||||
|     return cert | ||||
|  | ||||
|  | ||||
| def create_req(path, C, ST, L, O, key): | ||||
|     csr = ( | ||||
|         x509.CertificateSigningRequestBuilder() | ||||
|         .subject_name( | ||||
|             x509.Name( | ||||
|                 [ | ||||
|                     # Provide various details about who we are. | ||||
|                     x509.NameAttribute(NameOID.COUNTRY_NAME, C), | ||||
|                     x509.NameAttribute(NameOID.STATE_OR_PROVINCE_NAME, ST), | ||||
|                     x509.NameAttribute(NameOID.LOCALITY_NAME, L), | ||||
|                     x509.NameAttribute(NameOID.ORGANIZATION_NAME, O), | ||||
|                 ] | ||||
|             ) | ||||
|         ) | ||||
|         .sign(key, hashes.SHA256()) | ||||
|     ) | ||||
|     with open(path, "wb") as f: | ||||
|         f.write(csr.public_bytes(serialization.Encoding.PEM)) | ||||
|     return csr | ||||
|  | ||||
|  | ||||
| def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key): | ||||
|     cert = ( | ||||
|         x509.CertificateBuilder() | ||||
|         .subject_name(csr_cert.subject) | ||||
|         .issuer_name(ca_cert.subject) | ||||
|         .public_key(csr_cert.public_key()) | ||||
|         .serial_number(x509.random_serial_number()) | ||||
|         .not_valid_before(datetime.now(timezone.utc)) | ||||
|         .not_valid_after( | ||||
|             # Our certificate will be valid for 10 days | ||||
|             datetime.now(timezone.utc) + timedelta(days=10) | ||||
|             # Sign our certificate with our private key | ||||
|         ) | ||||
|         .sign(private_ca_key, hashes.SHA256()) | ||||
|     ) | ||||
|     with open(path, "wb") as f: | ||||
|         f.write(cert.public_bytes(serialization.Encoding.PEM)) | ||||
|     return cert | ||||
|  | ||||
|  | ||||
| ca_key = genrsa(temp_dir + "/ca.key") | ||||
| ca_cert = create_cert( | ||||
|     temp_dir + "/ca.pem", | ||||
|     "US", | ||||
|     "New York", | ||||
|     "New York", | ||||
|     "Gloo Certificate Authority", | ||||
|     ca_key, | ||||
| ) | ||||
|  | ||||
| pkey = genrsa(temp_dir + "/pkey.key") | ||||
| csr = create_req( | ||||
|     temp_dir + "/csr.csr", | ||||
|     "US", | ||||
|     "California", | ||||
|     "San Francisco", | ||||
|     "Gloo Testing Company", | ||||
|     pkey, | ||||
| ) | ||||
|  | ||||
| cert = sign_certificate_request(temp_dir + "/cert.pem", csr, ca_cert, ca_key) | ||||

.ci/pytorch/run_glootls_test.sh (new executable file, 18 lines)
							| @ -0,0 +1,18 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| CREATE_TEST_CERT="$(dirname "${BASH_SOURCE[0]}")/create_test_cert.py" | ||||
| TMP_CERT_DIR=$(python "$CREATE_TEST_CERT") | ||||
|  | ||||
| openssl verify -CAfile "${TMP_CERT_DIR}/ca.pem" "${TMP_CERT_DIR}/cert.pem" | ||||
|  | ||||
| export GLOO_DEVICE_TRANSPORT=TCP_TLS | ||||
| export GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY=${TMP_CERT_DIR}/pkey.key | ||||
| export GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT=${TMP_CERT_DIR}/cert.pem | ||||
| export GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE=${TMP_CERT_DIR}/ca.pem | ||||
|  | ||||
| time python test/run_test.py --include distributed/test_c10d_gloo --verbose -- ProcessGroupGlooTest | ||||
|  | ||||
| unset GLOO_DEVICE_TRANSPORT | ||||
| unset GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY | ||||
| unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT | ||||
| unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE | ||||
| @ -385,29 +385,6 @@ def smoke_test_compile(device: str = "cpu") -> None: | ||||
|     x_pt2 = torch.compile(model, mode="max-autotune")(x) | ||||
|  | ||||
|  | ||||
| def smoke_test_nvshmem() -> None: | ||||
|     if not torch.cuda.is_available(): | ||||
|         print("CUDA is not available, skipping NVSHMEM test") | ||||
|         return | ||||
|  | ||||
|     # Check if NVSHMEM is compiled in current build | ||||
|     try: | ||||
|         from torch._C._distributed_c10d import _is_nvshmem_available | ||||
|     except ImportError: | ||||
|         # Not built with NVSHMEM support. | ||||
|         # torch is not compiled with NVSHMEM prior to 2.9 | ||||
|         if torch.__version__ < "2.9": | ||||
|             return | ||||
|         else: | ||||
|             # After 2.9: NVSHMEM is expected to be compiled in current build | ||||
|             raise RuntimeError("torch not compiled with NVSHMEM") from None | ||||
|  | ||||
|     print("torch compiled with NVSHMEM") | ||||
|  | ||||
|     # Check if NVSHMEM is available on current system. | ||||
|     print(f"NVSHMEM available at run time: {_is_nvshmem_available()}") | ||||
|  | ||||
|  | ||||
| def smoke_test_modules(): | ||||
|     cwd = os.getcwd() | ||||
|     for module in MODULES: | ||||
| @ -502,8 +479,6 @@ def main() -> None: | ||||
|         options.pypi_pkg_check, | ||||
|     ) | ||||
|  | ||||
|     smoke_test_nvshmem() | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
|  | ||||
| @ -201,7 +201,7 @@ fi | ||||
|  | ||||
| if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then | ||||
|   # JIT C++ extensions require ninja. | ||||
|   pip_install "ninja==1.10.2" | ||||
|   pip_install "ninja==1.11.1.4" | ||||
|   # ninja is installed in $HOME/.local/bin, e.g., /var/lib/jenkins/.local/bin for CI user jenkins | ||||
|   # but this script should be runnable by any user, including root | ||||
|   export PATH="$HOME/.local/bin:$PATH" | ||||
| @ -345,12 +345,6 @@ test_h100_symm_mem() { | ||||
|   assert_git_not_dirty | ||||
| } | ||||
|  | ||||
| test_h100_cutlass_backend() { | ||||
|   # cutlass backend tests for H100 | ||||
|   TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_backend -k "not addmm" $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running | ||||
|   TORCHINDUCTOR_CUTLASS_DIR=$(realpath "./third_party/cutlass") python test/run_test.py --include inductor/test_cutlass_evt $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running | ||||
| } | ||||
|  | ||||
| test_lazy_tensor_meta_reference_disabled() { | ||||
|   export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1 | ||||
|   echo "Testing lazy tensor operations without meta reference" | ||||
| @ -365,6 +359,7 @@ test_dynamo_wrapped_shard() { | ||||
|     exit 1 | ||||
|   fi | ||||
|   python tools/dynamo/verify_dynamo.py | ||||
|   python tools/dynamo/gb_id_mapping.py verify | ||||
|   # PLEASE DO NOT ADD ADDITIONAL EXCLUDES HERE. | ||||
|   # Instead, use @skipIfTorchDynamo on your tests. | ||||
|   time python test/run_test.py --dynamo \ | ||||
| @ -462,7 +457,7 @@ test_inductor_aoti() { | ||||
|   # rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled | ||||
|   /usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}" | ||||
|  | ||||
|   /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile | ||||
|   /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile | ||||
| } | ||||
|  | ||||
| test_inductor_cpp_wrapper_shard() { | ||||
| @ -627,8 +622,6 @@ test_perf_for_dashboard() { | ||||
|     device=cuda_a10g | ||||
|   elif [[ "${TEST_CONFIG}" == *h100* ]]; then | ||||
|     device=cuda_h100 | ||||
|   elif [[ "${TEST_CONFIG}" == *b200* ]]; then | ||||
|     device=cuda_b200 | ||||
|   elif [[ "${TEST_CONFIG}" == *rocm* ]]; then | ||||
|     device=rocm | ||||
|   fi | ||||
| @ -803,16 +796,6 @@ test_dynamo_benchmark() { | ||||
|   if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then | ||||
|     test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@" | ||||
|   elif [[ "${TEST_CONFIG}" == *perf* ]]; then | ||||
|     # TODO (huydhn): Just smoke test some sample models | ||||
|     if [[ "${TEST_CONFIG}" == *b200* ]]; then | ||||
|       if [[ "${suite}" == "huggingface" ]]; then | ||||
|         export TORCHBENCH_ONLY_MODELS="DistillGPT2" | ||||
|       elif [[ "${suite}" == "timm_models" ]]; then | ||||
|         export TORCHBENCH_ONLY_MODELS="inception_v3" | ||||
|       elif [[ "${suite}" == "torchbench" ]]; then | ||||
|         export TORCHBENCH_ONLY_MODELS="hf_Bert" | ||||
|       fi | ||||
|     fi | ||||
|     test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@" | ||||
|   else | ||||
|     if [[ "${TEST_CONFIG}" == *cpu* ]]; then | ||||
| @ -940,6 +923,12 @@ test_torchbench_gcp_smoketest(){ | ||||
|   popd | ||||
| } | ||||
|  | ||||
| test_python_gloo_with_tls() { | ||||
|   source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh" | ||||
|   assert_git_not_dirty | ||||
| } | ||||
|  | ||||
|  | ||||
| test_aten() { | ||||
|   # Test ATen | ||||
|   # The following test(s) of ATen have already been skipped by caffe2 in rocm environment: | ||||
| @ -986,8 +975,6 @@ test_without_numpy() { | ||||
|   if [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then | ||||
|     python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')" | ||||
|   fi | ||||
|   # Regression test for https://github.com/pytorch/pytorch/pull/157734 (torch.onnx should be importable without numpy) | ||||
|   python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch; import torch.onnx" | ||||
|   popd | ||||
| } | ||||
|  | ||||
| @ -1332,13 +1319,10 @@ EOF | ||||
|  | ||||
|   # Step 2. Make sure that the public API test "test_correct_module_names" fails when an existing | ||||
|   # file is modified to introduce an invalid public API function. | ||||
|   # The filepath here must not have __all__ defined in it, otherwise the test will pass. | ||||
|   # If your PR introduces __all__ to torch/cuda/streams.py please point this to another file | ||||
|   # that does not have __all__ defined. | ||||
|   EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/cuda/streams.py" | ||||
|   EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/nn/parameter.py" | ||||
|   cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig" | ||||
|   echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}" | ||||
|   invalid_api="torch.cuda.streams.new_public_func" | ||||
|   invalid_api="torch.nn.parameter.new_public_func" | ||||
|   echo "Appended an invalid public API function to existing file ${EXISTING_FILEPATH}..." | ||||
|  | ||||
|   check_public_api_test_fails \ | ||||
| @ -1572,7 +1556,7 @@ test_executorch() { | ||||
| test_linux_aarch64() { | ||||
|   python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ | ||||
|         test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ | ||||
|         test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \ | ||||
|         test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops test_cpp_extensions_open_device_registration \ | ||||
|         --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose | ||||
|  | ||||
|   # Dynamo tests | ||||
| @ -1785,8 +1769,6 @@ elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then | ||||
|   test_h100_distributed | ||||
| elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then | ||||
|   test_h100_symm_mem | ||||
| elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then | ||||
|   test_h100_cutlass_backend | ||||
| else | ||||
|   install_torchvision | ||||
|   install_monkeytype | ||||
|  | ||||
| @ -1,34 +0,0 @@ | ||||
| # If you want to rebuild, run this with $env:REBUILD=1 | ||||
| # If you want to build with CUDA, run this with $env:USE_CUDA=1 | ||||
| # If you want to build without CUDA, run this with $env:USE_CUDA=0 | ||||
|  | ||||
| # Check for setup.py in the current directory | ||||
| if (-not (Test-Path "setup.py")) { | ||||
|     Write-Host "ERROR: Please run this build script from PyTorch root directory." | ||||
|     exit 1 | ||||
| } | ||||
|  | ||||
| # Get the script's parent directory | ||||
| $ScriptParentDir = Split-Path -Parent $MyInvocation.MyCommand.Definition | ||||
|  | ||||
| # Set TMP_DIR and convert to Windows path | ||||
| $env:TMP_DIR = Join-Path (Get-Location) "build\win_tmp" | ||||
| $env:TMP_DIR_WIN = $env:TMP_DIR  # Already in Windows format, no cygpath needed | ||||
|  | ||||
| # Set final package directory with default fallback | ||||
| if (-not $env:PYTORCH_FINAL_PACKAGE_DIR) { | ||||
|     $env:PYTORCH_FINAL_PACKAGE_DIR = "C:\w\build-results" | ||||
| } | ||||
|  | ||||
| # Create the final package directory if it doesn't exist | ||||
| if (-not (Test-Path $env:PYTORCH_FINAL_PACKAGE_DIR)) { | ||||
|     New-Item -Path $env:PYTORCH_FINAL_PACKAGE_DIR -ItemType Directory -Force | Out-Null | ||||
| } | ||||
|  | ||||
| # Set script helpers directory | ||||
| $env:SCRIPT_HELPERS_DIR = Join-Path $ScriptParentDir "win-test-helpers\arm64" | ||||
|  | ||||
| # Run the main build script | ||||
| & "$env:SCRIPT_HELPERS_DIR\build_pytorch.ps1" | ||||
|  | ||||
| Write-Host "BUILD PASSED" | ||||
| @ -1,24 +0,0 @@ | ||||
| #!/bin/bash | ||||
| set -ex -o pipefail | ||||
|  | ||||
| SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) | ||||
| # shellcheck source=./common.sh | ||||
| source "$SCRIPT_PARENT_DIR/common.sh" | ||||
|  | ||||
| run_tests() { | ||||
|     echo Running smoke_test.py... | ||||
|     python ./.ci/pytorch/smoke_test/smoke_test.py --package torchonly | ||||
|  | ||||
|     echo Running test_autograd.oy, test_nn.py, test_torch.py... | ||||
|     cd test | ||||
|  | ||||
|     CORE_TEST_LIST=("test_autograd.py" "test_nn.py" "test_modules.py") | ||||
|  | ||||
|     for t in "${CORE_TEST_LIST[@]}"; do | ||||
|         echo "Running test: $t" | ||||
|         python "$t" --verbose --save-xml --use-pytest -vvvv -rfEsxXP -p no:xdist | ||||
|     done | ||||
| } | ||||
|  | ||||
| run_tests | ||||
| echo "TEST PASSED" | ||||
| @ -1,98 +0,0 @@ | ||||
| # TODO: we may can use existing build_pytorch.bat for arm64 | ||||
|  | ||||
| if ($env:DEBUG -eq "1") { | ||||
|     $env:BUILD_TYPE = "debug" | ||||
| } else { | ||||
|     $env:BUILD_TYPE = "release" | ||||
| } | ||||
|  | ||||
| # This inflates our log size slightly, but it is REALLY useful to be | ||||
| # able to see what our cl.exe commands are. (since you can actually | ||||
| # just copy-paste them into a local Windows setup to just rebuild a | ||||
| # single file.) | ||||
| # log sizes are too long, but leaving this here in case someone wants to use it locally | ||||
| # $env:CMAKE_VERBOSE_MAKEFILE = "1" | ||||
|  | ||||
| $env:INSTALLER_DIR = Join-Path $env:SCRIPT_HELPERS_DIR "installation-helpers" | ||||
|  | ||||
| cd .. | ||||
|  | ||||
| # Environment variables | ||||
| $env:SCCACHE_IDLE_TIMEOUT = "0" | ||||
| $env:SCCACHE_IGNORE_SERVER_IO_ERROR = "1" | ||||
| $env:CMAKE_BUILD_TYPE = $env:BUILD_TYPE | ||||
| $env:CMAKE_C_COMPILER_LAUNCHER = "sccache" | ||||
| $env:CMAKE_CXX_COMPILER_LAUNCHER = "sccache" | ||||
| $env:libuv_ROOT = Join-Path $env:DEPENDENCIES_DIR "libuv\install" | ||||
| $env:MSSdk = "1" | ||||
|  | ||||
| if ($env:PYTORCH_BUILD_VERSION) { | ||||
|     $env:PYTORCH_BUILD_VERSION = $env:PYTORCH_BUILD_VERSION | ||||
|     $env:PYTORCH_BUILD_NUMBER = "1" | ||||
| } | ||||
|  | ||||
| $env:CMAKE_POLICY_VERSION_MINIMUM = "3.5" | ||||
|  | ||||
| # Set BLAS type | ||||
| if ($env:ENABLE_APL -eq "1") { | ||||
|     $env:BLAS = "APL" | ||||
|     $env:USE_LAPACK = "1" | ||||
| } elseif ($env:ENABLE_OPENBLAS -eq "1") { | ||||
|     $env:BLAS = "OpenBLAS" | ||||
|     $env:OpenBLAS_HOME = Join-Path $env:DEPENDENCIES_DIR "OpenBLAS\install" | ||||
| } | ||||
|  | ||||
| # Change to source directory | ||||
| Set-Location $env:PYTORCH_ROOT | ||||
|  | ||||
| # Copy libuv.dll | ||||
| Copy-Item -Path (Join-Path $env:libuv_ROOT "lib\Release\uv.dll") -Destination "torch\lib\uv.dll" -Force | ||||
|  | ||||
| # Create virtual environment | ||||
| python -m venv .venv | ||||
| .\.venv\Scripts\Activate.ps1 | ||||
| where.exe python | ||||
|  | ||||
| # Python install dependencies | ||||
| python -m pip install --upgrade pip | ||||
| pip install setuptools pyyaml | ||||
| pip install -r requirements.txt | ||||
|  | ||||
| # Set after installing psutil | ||||
| $env:DISTUTILS_USE_SDK = "1" | ||||
|  | ||||
| # Print all environment variables | ||||
| Get-ChildItem Env: | ||||
|  | ||||
| # Start and inspect sccache | ||||
| sccache --start-server | ||||
| sccache --zero-stats | ||||
| sccache --show-stats | ||||
|  | ||||
| # Build the wheel | ||||
| python setup.py bdist_wheel | ||||
| if ($LASTEXITCODE -ne 0) { exit 1 } | ||||
|  | ||||
| # Install the wheel locally | ||||
| $whl = Get-ChildItem -Path "dist\*.whl" | Select-Object -First 1 | ||||
| if ($whl) { | ||||
|     python -mpip install --no-index --no-deps $whl.FullName | ||||
| } | ||||
|  | ||||
| # Copy final wheel | ||||
| robocopy "dist" "$env:PYTORCH_FINAL_PACKAGE_DIR" *.whl | ||||
|  | ||||
| # Export test times | ||||
| python tools/stats/export_test_times.py | ||||
|  | ||||
| # Copy additional CI files | ||||
| robocopy ".additional_ci_files" "$env:PYTORCH_FINAL_PACKAGE_DIR\.additional_ci_files" /E | ||||
|  | ||||
| # Save ninja log | ||||
| Copy-Item -Path "build\.ninja_log" -Destination $env:PYTORCH_FINAL_PACKAGE_DIR -Force | ||||
|  | ||||
| # Final sccache stats and stop | ||||
| sccache --show-stats | ||||
| sccache --stop-server | ||||
|  | ||||
| exit 0 | ||||
| @ -126,6 +126,11 @@ if "%USE_CUDA%"=="1" ( | ||||
|   set CMAKE_CUDA_COMPILER_LAUNCHER=%TMP_DIR%/bin/randomtemp.exe;%TMP_DIR%\bin\sccache.exe | ||||
| ) | ||||
|  | ||||
| :: Install build-system requirements before running setup.py commands | ||||
| python -m pip install -r requirements-build.txt | ||||
| if errorlevel 1 goto fail | ||||
| if not errorlevel 0 goto fail | ||||
|  | ||||
| :: Print all existing environment variable for debugging | ||||
| set | ||||
|  | ||||
|  | ||||
| @ -18,5 +18,5 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t | ||||
| if errorlevel 1 exit /b 1 | ||||
|  | ||||
| set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%" | ||||
| %PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel | ||||
| %PYTHON_EXEC% -m pip install --upgrade pip "setuptools>=77.0.0" "packaging>=24.2" wheel | ||||
| if errorlevel 1 exit /b 1 | ||||
|  | ||||
| @ -7,6 +7,9 @@ call "internal\install_python.bat" | ||||
|  | ||||
| %PYTHON_EXEC% --version | ||||
| set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%" | ||||
|  | ||||
| %PYTHON_EXEC% -m pip install "setuptools>=77.0.0" "packaging>=24.2" | ||||
|  | ||||
| if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake | ||||
| if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake | ||||
| if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake | ||||
| @ -16,7 +19,7 @@ if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake | ||||
|  | ||||
| %PYTHON_EXEC% -m pip install pyyaml | ||||
| %PYTHON_EXEC% -m pip install mkl-include mkl-static | ||||
| %PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0 | ||||
| %PYTHON_EXEC% -m pip install boto3 ninja typing-extensions | ||||
|  | ||||
| where cmake.exe | ||||
|  | ||||
|  | ||||
| @ -127,7 +127,7 @@ export INSTALL_TEST=0 # dont install test binaries into site-packages | ||||
| export MACOSX_DEPLOYMENT_TARGET=10.15 | ||||
| export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} | ||||
|  | ||||
| SETUPTOOLS_PINNED_VERSION="==70.1.0" | ||||
| SETUPTOOLS_PINNED_VERSION="==77.0.0" | ||||
| PYYAML_PINNED_VERSION="=5.3" | ||||
| EXTRA_CONDA_INSTALL_FLAGS="" | ||||
| CONDA_ENV_CREATE_FLAGS="" | ||||
| @ -135,7 +135,7 @@ RENAME_WHEEL=true | ||||
| case $desired_python in | ||||
|     3.13t) | ||||
|         echo "Using 3.13 deps" | ||||
|         SETUPTOOLS_PINNED_VERSION=">=70.1.0" | ||||
|         SETUPTOOLS_PINNED_VERSION=">=77.0.0" | ||||
|         PYYAML_PINNED_VERSION=">=6.0.1" | ||||
|         NUMPY_PINNED_VERSION="=2.1.0" | ||||
|         CONDA_ENV_CREATE_FLAGS="python-freethreading" | ||||
| @ -145,31 +145,31 @@ case $desired_python in | ||||
|         ;; | ||||
|     3.13) | ||||
|         echo "Using 3.13 deps" | ||||
|         SETUPTOOLS_PINNED_VERSION=">=70.1.0" | ||||
|         SETUPTOOLS_PINNED_VERSION=">=77.0.0" | ||||
|         PYYAML_PINNED_VERSION=">=6.0.1" | ||||
|         NUMPY_PINNED_VERSION="=2.1.0" | ||||
|         ;; | ||||
|     3.12) | ||||
|         echo "Using 3.12 deps" | ||||
|         SETUPTOOLS_PINNED_VERSION=">=70.1.0" | ||||
|         SETUPTOOLS_PINNED_VERSION=">=77.0.0" | ||||
|         PYYAML_PINNED_VERSION=">=6.0.1" | ||||
|         NUMPY_PINNED_VERSION="=2.0.2" | ||||
|         ;; | ||||
|     3.11) | ||||
|         echo "Using 3.11 deps" | ||||
|         SETUPTOOLS_PINNED_VERSION=">=70.1.0" | ||||
|         SETUPTOOLS_PINNED_VERSION=">=77.0.0" | ||||
|         PYYAML_PINNED_VERSION=">=5.3" | ||||
|         NUMPY_PINNED_VERSION="=2.0.2" | ||||
|         ;; | ||||
|     3.10) | ||||
|         echo "Using 3.10 deps" | ||||
|         SETUPTOOLS_PINNED_VERSION=">=70.1.0" | ||||
|         SETUPTOOLS_PINNED_VERSION=">=77.0.0" | ||||
|         PYYAML_PINNED_VERSION=">=5.3" | ||||
|         NUMPY_PINNED_VERSION="=2.0.2" | ||||
|         ;; | ||||
|     3.9) | ||||
|         echo "Using 3.9 deps" | ||||
|         SETUPTOOLS_PINNED_VERSION=">=70.1.0" | ||||
|         SETUPTOOLS_PINNED_VERSION=">=77.0.0" | ||||
|         PYYAML_PINNED_VERSION=">=5.3" | ||||
|         NUMPY_PINNED_VERSION="=2.0.2" | ||||
|         ;; | ||||
|  | ||||

.flake8 (4 changed lines)
							| @ -7,12 +7,12 @@ max-line-length = 120 | ||||
| # C408 ignored because we like the dict keyword argument syntax | ||||
| # E501 is not flexible enough, we're using B950 instead | ||||
| ignore = | ||||
|     E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824, | ||||
|     E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303, | ||||
|     # shebang has extra meaning in fbcode lints, so I think it's not worth trying | ||||
|     # to line this up with executable bit | ||||
|     EXE001, | ||||
|     # these ignores are from flake8-bugbear; please fix! | ||||
|     B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910 | ||||
|     B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907 | ||||
|     # these ignores are from flake8-comprehensions; please fix! | ||||
|     C407, | ||||
|     # these ignores are from flake8-logging-format; please fix! | ||||
|  | ||||

.github/actionlint.yaml (vendored, 10 changed lines)
							| @ -53,12 +53,16 @@ self-hosted-runner: | ||||
|     - linux.rocm.gpu.mi250 | ||||
|     - linux.rocm.gpu.2 | ||||
|     - linux.rocm.gpu.4 | ||||
|     # gfx942 runners | ||||
|     - linux.rocm.gpu.gfx942.2 | ||||
|     - linux.rocm.gpu.gfx942.4 | ||||
|     # MI300 runners | ||||
|     - linux.rocm.gpu.mi300.2 | ||||
|     - linux.rocm.gpu.mi300.4 | ||||
|     - rocm-docker | ||||
|     # Repo-specific Apple hosted  runners | ||||
|     - macos-m1-ultra | ||||
|     - macos-m2-14 | ||||
|     # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors) | ||||
|     - macos-m1-stable | ||||
|     - macos-m1-13 | ||||
|     - macos-m1-14 | ||||
|     # GitHub-hosted MacOS runners | ||||
|     - macos-latest-xlarge | ||||
|  | ||||

.github/actions/build-android/action.yml (new vendored file, 78 lines)
							| @ -0,0 +1,78 @@ | ||||
| name: build android | ||||
|  | ||||
| description: build android for a specific arch | ||||
|  | ||||
| inputs: | ||||
|   arch: | ||||
|     description: arch to build | ||||
|     required: true | ||||
|   arch-for-build-env: | ||||
|     description: | | ||||
|       arch to pass to build environment. | ||||
|       This is currently different than the arch name we use elsewhere, which | ||||
|       should be fixed. | ||||
|     required: true | ||||
|   github-secret: | ||||
|     description: github token | ||||
|     required: true | ||||
|   build-environment: | ||||
|     required: true | ||||
|     description: Top-level label for what's being built/tested. | ||||
|   docker-image: | ||||
|     required: true | ||||
|     description: Name of the base docker image to build with. | ||||
|   branch: | ||||
|     required: true | ||||
|     description: What branch we are building on. | ||||
| outputs: | ||||
|   container_id: | ||||
|     description: Docker container identifier used to build the artifacts | ||||
|     value: ${{ steps.build.outputs.container_id }} | ||||
|  | ||||
| runs: | ||||
|   using: composite | ||||
|   steps: | ||||
|     - name: Build-${{ inputs.arch }} | ||||
|       id: build | ||||
|       shell: bash | ||||
|       env: | ||||
|         BRANCH: ${{ inputs.branch }} | ||||
|         BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-${{ inputs.arch-for-build-env }}-build" | ||||
|         AWS_DEFAULT_REGION: us-east-1 | ||||
|         PR_NUMBER: ${{ github.event.pull_request.number }} | ||||
|         SHA1: ${{ github.event.pull_request.head.sha || github.sha }} | ||||
|         SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 | ||||
|         SCCACHE_REGION: us-east-1 | ||||
|         DOCKER_IMAGE: ${{ inputs.docker-image  }} | ||||
|         MATRIX_ARCH: ${{ inputs.arch }} | ||||
|       run: | | ||||
|         # detached container should get cleaned up by teardown_ec2_linux | ||||
|         set -exo pipefail | ||||
|         export container_name | ||||
|         container_name=$(docker run \ | ||||
|           -e BUILD_ENVIRONMENT \ | ||||
|           -e MAX_JOBS="$(nproc --ignore=2)" \ | ||||
|           -e AWS_DEFAULT_REGION \ | ||||
|           -e PR_NUMBER \ | ||||
|           -e SHA1 \ | ||||
|           -e BRANCH \ | ||||
|           -e SCCACHE_BUCKET \ | ||||
|           -e SCCACHE_REGION \ | ||||
|           -e SKIP_SCCACHE_INITIALIZATION=1 \ | ||||
|           --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ | ||||
|           --security-opt seccomp=unconfined \ | ||||
|           --cap-add=SYS_PTRACE \ | ||||
|           --tty \ | ||||
|           --detach \ | ||||
|           --user jenkins \ | ||||
|           -w /var/lib/jenkins/workspace \ | ||||
|           "${DOCKER_IMAGE}" | ||||
|         ) | ||||
|         git submodule sync && git submodule update -q --init --recursive --depth 1 | ||||
|         docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" | ||||
|         (echo "sudo chown -R jenkins . && .ci/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete" | docker exec -u jenkins -i "${container_name}" bash) 2>&1 | ||||
|  | ||||
|         # Copy install binaries back | ||||
|         mkdir -p "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}" | ||||
|         docker cp "${container_name}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}" | ||||
|         echo "container_id=${container_name}" >> "${GITHUB_OUTPUT}" | ||||
| @ -70,7 +70,7 @@ runs: | ||||
|           set -eux | ||||
|           # PyYAML 6.0 doesn't work with MacOS x86 anymore | ||||
|           # This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2 | ||||
|           python3 -m pip install requests==2.27.1 pyyaml==6.0.2 | ||||
|           python3 -m pip install requests==2.27.1 pyyaml==6.0.1 | ||||
|  | ||||
|     - name: Parse ref | ||||
|       id: parse-ref | ||||
|  | ||||

.github/ci_commit_pins/audio.txt
| @ -1 +1 @@ | ||||
| 9b57c7bd5ad4db093c5bb31c802df9f04d933ac9 | ||||
| 00b0c91db92c51a11356249262577b9fa26c18c5 | ||||
|  | ||||

.github/ci_commit_pins/fbgemm_rocm.txt
| @ -1 +1 @@ | ||||
| 7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8 | ||||
| 5fb5024118e9bb9decf96c2b0b1a8f0010bf56be | ||||
|  | ||||

.github/ci_commit_pins/vllm.txt
| @ -1 +0,0 @@ | ||||
| 6a39ba85fe0f2fff9494b5eccea717c93510c230 | ||||

.github/ci_commit_pins/xla.txt
| @ -1 +1 @@ | ||||
| b6a5b82b9948b610fa4c304d0d869c82b8f17db1 | ||||
| 1c00dea2c9adb2137903c86b4191e8c247f8fda9 | ||||
|  | ||||

.github/merge_rules.yaml
| @ -76,8 +76,8 @@ | ||||
|   - .github/ci_commit_pins/audio.txt | ||||
|   - .github/ci_commit_pins/vision.txt | ||||
|   - .github/ci_commit_pins/torchdynamo.txt | ||||
|   - .github/ci_commit_pins/vllm.txt | ||||
|   - .ci/docker/ci_commit_pins/triton.txt | ||||
|   - .ci/docker/ci_commit_pins/vllm.txt | ||||
|   approved_by: | ||||
|   - pytorchbot | ||||
|   mandatory_checks_name: | ||||
| @ -131,6 +131,21 @@ | ||||
|   - Lint | ||||
|   - pull | ||||
|  | ||||
| - name: Mobile | ||||
|   patterns: | ||||
|   - ios/** | ||||
|   - android/** | ||||
|   - test/mobile/** | ||||
|   approved_by: | ||||
|   - linbinyu | ||||
|   - IvanKobzarev | ||||
|   - dreiss | ||||
|   - raziel | ||||
|   mandatory_checks_name: | ||||
|   - EasyCLA | ||||
|   - Lint | ||||
|   - pull | ||||
|  | ||||
| - name: PrimTorch | ||||
|   patterns: | ||||
|   - torch/_meta_registrations.py | ||||
| @ -477,23 +492,6 @@ | ||||
|   - srossross | ||||
|   - chillee | ||||
|   - zou3519 | ||||
|   - guilhermeleobas | ||||
|   mandatory_checks_name: | ||||
|   - EasyCLA | ||||
|   - Lint | ||||
|   - pull | ||||
|  | ||||
| - name: Dynamo | ||||
|   patterns: | ||||
|   - torch/_dynamo/** | ||||
|   - torch/csrc/dynamo/** | ||||
|   - test/dynamo/** | ||||
|   - test/dynamo_expected_failures/** | ||||
|   - test/dynamo_skips/** | ||||
|   - test/inductor_expected_failures/** | ||||
|   - test/inductor_skips/** | ||||
|   approved_by: | ||||
|   - guilhermeleobas | ||||
|   mandatory_checks_name: | ||||
|   - EasyCLA | ||||
|   - Lint | ||||
|  | ||||

.github/pytorch-probot.yml
| @ -31,9 +31,7 @@ ciflow_push_tags: | ||||
| - ciflow/pull | ||||
| - ciflow/h100 | ||||
| - ciflow/h100-distributed | ||||
| - ciflow/win-arm64 | ||||
| - ciflow/h100-symm-mem | ||||
| - ciflow/h100-cutlass-backend | ||||
| retryable_workflows: | ||||
| - pull | ||||
| - trunk | ||||
|  | ||||

.github/requirements-gha-cache.txt
| @ -7,9 +7,9 @@ | ||||
| #   .ci/docker/requirements-ci.txt | ||||
| boto3==1.35.42 | ||||
| jinja2==3.1.6 | ||||
| lintrunner==0.12.7 | ||||
| ninja==1.10.0.post1 | ||||
| lintrunner==0.10.7 | ||||
| ninja==1.11.1.4 | ||||
| nvidia-ml-py==11.525.84 | ||||
| pyyaml==6.0.2 | ||||
| pyyaml==6.0 | ||||
| requests==2.32.4 | ||||
| rich==14.1.0 | ||||
| rich==10.9.0 | ||||
|  | ||||
| @ -2,17 +2,17 @@ boto3==1.35.42 | ||||
| cmake==3.27.* | ||||
| expecttest==0.3.0 | ||||
| fbscribelogger==0.1.7 | ||||
| filelock==3.18.0 | ||||
| filelock==3.6.0 | ||||
| hypothesis==6.56.4 | ||||
| librosa>=0.6.2 | ||||
| mpmath==1.3.0 | ||||
| networkx==2.8.7 | ||||
| ninja==1.10.2.4 | ||||
| ninja==1.11.1.4 | ||||
| numba==0.59.0 | ||||
| numpy==1.26.4 | ||||
| opt-einsum>=3.3 | ||||
| optree==0.13.0 | ||||
| packaging==23.1 | ||||
| packaging==25.0 | ||||
| parameterized==0.8.1 | ||||
| pillow==10.3.0 | ||||
| protobuf==5.29.4 | ||||
| @ -26,7 +26,7 @@ pytest-xdist==3.3.1 | ||||
| pytest==7.3.2 | ||||
| pyyaml==6.0.2 | ||||
| scipy==1.12.0 | ||||
| setuptools==72.1.0 | ||||
| setuptools==80.9.0 | ||||
| sympy==1.13.3 | ||||
| tlparse==0.3.30 | ||||
| tensorboard==2.13.0 | ||||
|  | ||||
| @ -193,7 +193,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = { | ||||
|     "cpu": "libtorch-cxx11-builder:cpu", | ||||
| } | ||||
|  | ||||
| FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"] | ||||
| FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"] | ||||
|  | ||||
|  | ||||
| def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str: | ||||
| @ -315,11 +315,6 @@ def generate_wheels_matrix( | ||||
|             # TODO: Enable python 3.13t on cpu-s390x | ||||
|             if gpu_arch_type == "cpu-s390x" and python_version == "3.13t": | ||||
|                 continue | ||||
|             # TODO: Enable python 3.14 on non linux OSes | ||||
|             if os != "linux" and ( | ||||
|                 python_version == "3.14" or python_version == "3.14t" | ||||
|             ): | ||||
|                 continue | ||||
|  | ||||
|             if use_split_build and ( | ||||
|                 arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux" | ||||
|  | ||||

.github/scripts/lintrunner.sh
| @ -2,7 +2,7 @@ | ||||
| set -ex | ||||
|  | ||||
| # Use uv to speed up lintrunner init | ||||
| python3 -m pip install -U uv==0.8.* setuptools | ||||
| python3 -m pip install -U uv setuptools | ||||
|  | ||||
| CACHE_DIRECTORY="/tmp/.lintbin" | ||||
| # Try to recover the cached binaries | ||||
|  | ||||

.github/scripts/trymerge.py
| @ -1891,9 +1891,7 @@ def validate_revert( | ||||
|         else pr.get_comment_by_id(comment_id) | ||||
|     ) | ||||
|     if comment.editor_login is not None: | ||||
|         raise PostCommentError( | ||||
|             "Halting the revert as the revert comment has been edited." | ||||
|         ) | ||||
|         raise PostCommentError("Don't want to revert based on edited command") | ||||
|     author_association = comment.author_association | ||||
|     author_login = comment.author_login | ||||
|     allowed_reverters = ["COLLABORATOR", "MEMBER", "OWNER"] | ||||
|  | ||||

.github/scripts/windows/build_triton.bat
| @ -10,7 +10,7 @@ if "%PY_VERS%" == "3.13t" ( | ||||
|     call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS% | ||||
| ) | ||||
| :: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480 | ||||
| call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja | ||||
| call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==78.1.1 ninja | ||||
|  | ||||
| dir "%VC_INSTALL_PATH%" | ||||
|  | ||||
|  | ||||

.github/workflows/_get-changed-files.yml
| @ -27,7 +27,7 @@ jobs: | ||||
|             PR_NUMBER="${{ github.event.number }}" | ||||
|  | ||||
|             # Use gh CLI to get changed files in the PR with explicit repo | ||||
|             CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') | ||||
|             CHANGED_FILES=$(gh pr view "$PR_NUMBER" --repo "${{ github.repository }}" --json files --jq '.files[].path' | tr '\n' ' ' | sed 's/ $//') | ||||
|  | ||||
|             if [ -z "$CHANGED_FILES" ]; then | ||||
|               echo "No changed files found, setting to '*'" | ||||
| @ -40,4 +40,4 @@ jobs: | ||||
|           else | ||||
|             echo "Not in PR context, setting changed files to '*'" | ||||
|             echo "changed-files=*" >> "$GITHUB_OUTPUT" | ||||
|           fi | ||||
|           fi | ||||

.github/workflows/_linux-test.yml
| @ -96,7 +96,7 @@ jobs: | ||||
|     steps: | ||||
|       - name: Setup SSH (Click me for login details) | ||||
|         uses: pytorch/test-infra/.github/actions/setup-ssh@main | ||||
|         if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }} | ||||
|         if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} | ||||
|         with: | ||||
|           github-secret: ${{ secrets.GITHUB_TOKEN }} | ||||
|           instructions: | | ||||
| @ -109,7 +109,7 @@ jobs: | ||||
|           no-sudo: true | ||||
|  | ||||
|       - name: Setup Python | ||||
|         if: contains(matrix.runner, 'b200') | ||||
|         if: matrix.runner == 'B200' | ||||
|         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 | ||||
|         with: | ||||
|           python-version: '3.12' | ||||
| @ -117,7 +117,7 @@ jobs: | ||||
|  | ||||
|       - name: Setup Linux | ||||
|         uses: ./.github/actions/setup-linux | ||||
|         if: inputs.build-environment != 'linux-s390x-binary-manywheel' && !contains(matrix.runner, 'b200') | ||||
|         if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200' | ||||
|  | ||||
|       - name: configure aws credentials | ||||
|         if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} | ||||
| @ -128,7 +128,7 @@ jobs: | ||||
|           aws-region: us-east-1 | ||||
|  | ||||
|       - name: Login to Amazon ECR | ||||
|         if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }} | ||||
|         if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }} | ||||
|         id: login-ecr | ||||
|         continue-on-error: true | ||||
|         uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 | ||||
| @ -166,17 +166,17 @@ jobs: | ||||
|         uses: pytorch/test-infra/.github/actions/setup-nvidia@main | ||||
|         with: | ||||
|           driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }} | ||||
|         if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }} | ||||
|         if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }} | ||||
|  | ||||
|       - name: Setup GPU_FLAG for docker run | ||||
|         id: setup-gpu-flag | ||||
|         run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" | ||||
|         if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }} | ||||
|         if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }} | ||||
|  | ||||
|       - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container | ||||
|         id: setup-sscache-port-flag | ||||
|         run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" | ||||
|         if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }} | ||||
|         if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }} | ||||
|  | ||||
|       - name: Lock NVIDIA A100 40GB Frequency | ||||
|         run: | | ||||
| @ -277,8 +277,8 @@ jobs: | ||||
|           NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} | ||||
|           TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }} | ||||
|           # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs | ||||
|           SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }} | ||||
|           SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }} | ||||
|           SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }} | ||||
|           SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }} | ||||
|           SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} | ||||
|           DOCKER_IMAGE: ${{ inputs.docker-image }} | ||||
|           XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} | ||||
| @ -403,7 +403,7 @@ jobs: | ||||
|           job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} | ||||
|  | ||||
|       - name: Authenticate with AWS | ||||
|         if: ${{ contains(matrix.runner, 'b200') }} | ||||
|         if: ${{ matrix.runner == 'B200' }} | ||||
|         uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 | ||||
|         with: | ||||
|           role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results | ||||
|  | ||||

.github/workflows/_mac-test.yml
| @ -80,6 +80,11 @@ jobs: | ||||
|         run: | | ||||
|           sysctl machdep.cpu.brand_string kern.osproductversion | ||||
|  | ||||
|       - name: Install build toolchain | ||||
|         run: | | ||||
|           brew update --quiet | ||||
|           brew install --formula cmake ninja | ||||
|  | ||||
|       - name: Clean up leftover processes on MacOS pet runner | ||||
|         continue-on-error: true | ||||
|         run: | | ||||
|  | ||||

.github/workflows/_rocm-test.yml
| @ -269,8 +269,8 @@ jobs: | ||||
|           # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct | ||||
|           docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" | ||||
|  | ||||
|       - name: Change permissions (only needed for kubernetes runners for now) | ||||
|         if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'gfx942') || contains(matrix.runner, 'mi355')) }} | ||||
|       - name: Change permissions (only needed for MI300 runners for now) | ||||
|         if: ${{ always() && steps.test.conclusion && contains(matrix.runner, 'mi300') }} | ||||
|         run: | | ||||
|           docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" | ||||
|  | ||||
|  | ||||

.github/workflows/build-triton-wheel.yml
| @ -50,7 +50,7 @@ jobs: | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] | ||||
|         py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ] | ||||
|         device: ["cuda", "rocm", "xpu", "aarch64"] | ||||
|         docker-image: ["pytorch/manylinux2_28-builder:cpu"] | ||||
|         include: | ||||
| @ -126,12 +126,6 @@ jobs: | ||||
|           3.13t) | ||||
|             PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python | ||||
|             ;; | ||||
|           3.14) | ||||
|             PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python | ||||
|             ;; | ||||
|           3.14t) | ||||
|             PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python | ||||
|             ;; | ||||
|           *) | ||||
|             echo "Unsupported python version ${PY_VERS}" | ||||
|             exit 1 | ||||
|  | ||||
| @ -56,7 +56,7 @@ jobs: | ||||
|           cache: pip | ||||
|           architecture: x64 | ||||
|  | ||||
|       - run: pip install pyyaml==6.0.2 | ||||
|       - run: pip install pyyaml==6.0 | ||||
|         shell: bash | ||||
|  | ||||
|       - name: Verify mergeability | ||||
|  | ||||

.github/workflows/cherry-pick.yml
| @ -26,7 +26,7 @@ jobs: | ||||
|           cache: pip | ||||
|  | ||||
|       # Not the direct dependencies but the script uses trymerge | ||||
|       - run: pip install pyyaml==6.0.2 | ||||
|       - run: pip install pyyaml==6.0 | ||||
|  | ||||
|       - name: Setup committer id | ||||
|         run: | | ||||
|  | ||||

.github/workflows/docker-builds.yml
| @ -50,7 +50,6 @@ jobs: | ||||
|         runner: [linux.12xlarge] | ||||
|         docker-image-name: [ | ||||
|           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11, | ||||
|           pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm, | ||||
|           pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks, | ||||
|           pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks, | ||||
|           pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks, | ||||
| @ -76,8 +75,7 @@ jobs: | ||||
|           pytorch-linux-jammy-py3-clang12-onnx, | ||||
|           pytorch-linux-jammy-linter, | ||||
|           pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, | ||||
|           # Executorch pin needs update | ||||
|           # pytorch-linux-jammy-py3-clang12-executorch, | ||||
|           pytorch-linux-jammy-py3-clang12-executorch, | ||||
|           pytorch-linux-jammy-py3.12-triton-cpu | ||||
|         ] | ||||
|         include: | ||||
|  | ||||

.github/workflows/docker-release.yml
| @ -144,7 +144,7 @@ jobs: | ||||
|         run: | | ||||
|           make -f docker.Makefile "${BUILD_IMAGE_TYPE}-image" | ||||
|       - name: Push nightly tags | ||||
|         if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.platform == 'linux/amd4' }} | ||||
|         if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' && matrix.build_platforms == 'linux/amd4' }} | ||||
|         run: | | ||||
|           PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime" | ||||
|           CUDA_SUFFIX="-cu${CUDA_VERSION}" | ||||
|  | ||||

.github/workflows/generated-linux-binary-manywheel-nightly.yml (generated; diff not shown because it is too large)

.github/workflows/h100-cutlass-backend.yml
| @ -1,58 +0,0 @@ | ||||
| name: Limited CI for CUTLASS backend on H100 | ||||
|  | ||||
| on: | ||||
|   pull_request: | ||||
|     paths: | ||||
|       - .github/workflows/h100-cutlass-backend.yml | ||||
|   workflow_dispatch: | ||||
|   schedule: | ||||
|     - cron: 22 9 * * *  # every 24 hours about 2:22am PDT | ||||
|   push: | ||||
|     tags: | ||||
|       - ciflow/h100-cutlass-backend/* | ||||
|  | ||||
| concurrency: | ||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} | ||||
|   cancel-in-progress: true | ||||
|  | ||||
| permissions: | ||||
|   id-token: write | ||||
|   contents: read | ||||
|  | ||||
| jobs: | ||||
|  | ||||
|   get-label-type: | ||||
|     if: github.repository_owner == 'pytorch' | ||||
|     name: get-label-type | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main | ||||
|     with: | ||||
|       triggering_actor: ${{ github.triggering_actor }} | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||
|       curr_ref_type: ${{ github.ref_type }} | ||||
|  | ||||
|   linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend: | ||||
|     name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 | ||||
|       cuda-arch-list: '9.0' | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "h100_cutlass_backend", shard: 1, num_shards: 1, runner: "linux.aws.h100", owners: ["oncall:pt2"] }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-cuda12_8-py3_10-gcc11-sm90-test: | ||||
|     name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: | ||||
|       - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend | ||||
|     with: | ||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-cutlass-backend | ||||
|       docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-cutlass-backend.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||

.github/workflows/inductor-perf-test-b200.yml
| @ -1,154 +0,0 @@ | ||||
| name: inductor-perf-b200 | ||||
|  | ||||
| on: | ||||
|   schedule: | ||||
|     - cron: 0 7 * * 1-6 | ||||
|     - cron: 0 7 * * 0 | ||||
|   # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it | ||||
|   # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs | ||||
|   workflow_dispatch: | ||||
|     inputs: | ||||
|       training: | ||||
|         description: Run training (on by default)? | ||||
|         required: false | ||||
|         type: boolean | ||||
|         default: true | ||||
|       inference: | ||||
|         description: Run inference (on by default)? | ||||
|         required: false | ||||
|         type: boolean | ||||
|         default: true | ||||
|       default: | ||||
|         description: Run inductor_default? | ||||
|         required: false | ||||
|         type: boolean | ||||
|         default: false | ||||
|       dynamic: | ||||
|         description: Run inductor_dynamic_shapes? | ||||
|         required: false | ||||
|         type: boolean | ||||
|         default: false | ||||
|       cppwrapper: | ||||
|         description: Run inductor_cpp_wrapper? | ||||
|         required: false | ||||
|         type: boolean | ||||
|         default: false | ||||
|       cudagraphs: | ||||
|         description: Run inductor_cudagraphs? | ||||
|         required: false | ||||
|         type: boolean | ||||
|         default: true | ||||
|       freezing_cudagraphs: | ||||
|         description: Run inductor_cudagraphs with freezing for inference? | ||||
|         required: false | ||||
|         type: boolean | ||||
|         default: false | ||||
|       aotinductor: | ||||
|         description: Run aot_inductor for inference? | ||||
|         required: false | ||||
|         type: boolean | ||||
|         default: false | ||||
|       maxautotune: | ||||
|         description: Run inductor_max_autotune? | ||||
|         required: false | ||||
|         type: boolean | ||||
|         default: false | ||||
|       benchmark_configs: | ||||
|         description: The list of configs used the benchmark | ||||
|         required: false | ||||
|         type: string | ||||
|         default: inductor_huggingface_perf_cuda_b200,inductor_timm_perf_cuda_b200,inductor_torchbench_perf_cuda_b200 | ||||
|  | ||||
| concurrency: | ||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} | ||||
|   cancel-in-progress: true | ||||
|  | ||||
| permissions: | ||||
|   id-token: write | ||||
|   contents: read | ||||
|  | ||||
| jobs: | ||||
|   get-label-type: | ||||
|     name: get-label-type | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main | ||||
|     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} | ||||
|     with: | ||||
|       triggering_actor: ${{ github.triggering_actor }} | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||
|       curr_ref_type: ${{ github.ref_type }} | ||||
|       opt_out_experiments: lf | ||||
|  | ||||
|   build: | ||||
|     name: cuda12.8-py3.10-gcc9-sm100 | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       # Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100 | ||||
|       # or newer GPUs, so it doesn't benefit much from existing compiler cache | ||||
|       # from trunk. Also use a memory-intensive runner here because memory is | ||||
|       # usually the bottleneck | ||||
|       runner: linux.12xlarge.memory | ||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks | ||||
|       cuda-arch-list: '10.0' | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "inductor_huggingface_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, | ||||
|           { config: "inductor_timm_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, | ||||
|           { config: "inductor_torchbench_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, | ||||
|         ]} | ||||
|       selected-test-configs: ${{ inputs.benchmark_configs }} | ||||
|       build-additional-packages: "vision audio fbgemm torchao" | ||||
|     secrets: inherit | ||||
|  | ||||
|   test-periodically: | ||||
|     name: cuda12.8-py3.10-gcc9-sm100 | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: build | ||||
|     if: github.event.schedule == '0 7 * * 1-6' | ||||
|     with: | ||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 | ||||
|       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true | ||||
|       docker-image: ${{ needs.build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.build.outputs.test-matrix }} | ||||
|       aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only | ||||
|       timeout-minutes: 720 | ||||
|       disable-monitor: false | ||||
|       monitor-log-interval: 15 | ||||
|       monitor-data-collect-interval: 4 | ||||
|     secrets: inherit | ||||
|  | ||||
|   test-weekly: | ||||
|     name: cuda12.8-py3.10-gcc9-sm100 | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: build | ||||
|     if: github.event.schedule == '0 7 * * 0' | ||||
|     with: | ||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 | ||||
|       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true | ||||
|       docker-image: ${{ needs.build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.build.outputs.test-matrix }} | ||||
|       timeout-minutes: 1440 | ||||
|       aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only | ||||
|       disable-monitor: false | ||||
|       monitor-log-interval: 15 | ||||
|       monitor-data-collect-interval: 4 | ||||
|     secrets: inherit | ||||
|  | ||||
|   test: | ||||
|     name: cuda12.8-py3.10-gcc9-sm100 | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: build | ||||
|     with: | ||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 | ||||
|       dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} | ||||
|       docker-image: ${{ needs.build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.build.outputs.test-matrix }} | ||||
|       aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only | ||||
|       timeout-minutes: 720 | ||||
|       disable-monitor: false | ||||
|       monitor-log-interval: 15 | ||||
|       monitor-data-collect-interval: 4 | ||||
|     secrets: inherit | ||||
| @ -2,7 +2,7 @@ name: inductor-perf-nightly-h100 | ||||
|  | ||||
| on: | ||||
|   schedule: | ||||
|     - cron: 15 0,12 * * 1-6 | ||||
|     - cron: 15 0,4,8,12,16,20 * * 1-6 | ||||
|     - cron: 0 7 * * 0 | ||||
|   # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it | ||||
|   # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs | ||||
| @ -126,7 +126,7 @@ jobs: | ||||
|     name: cuda12.8-py3.10-gcc9-sm90 | ||||
|     uses: ./.github/workflows/_linux-test.yml | ||||
|     needs: build | ||||
|     if: github.event.schedule == '15 0,12 * * 1-6' | ||||
|     if: github.event.schedule == '15 0,4,8,12,16,20 * * 1-6' | ||||
|     with: | ||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 | ||||
|       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true | ||||
|  | ||||
| @ -88,23 +88,23 @@ jobs: | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|  | ||||

.github/workflows/inductor-periodic.yml
| @ -81,21 +81,21 @@ jobs: | ||||
|       sync-tag: rocm-build | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|  | ||||

.github/workflows/inductor-rocm-mi300.yml
| @ -47,8 +47,8 @@ jobs: | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|  | ||||

.github/workflows/mac-mps.yml
| @ -28,6 +28,7 @@ jobs: | ||||
|       # than our AWS macos-m1-14 runners | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-13" }, | ||||
|           { config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m1-14" }, | ||||
|           { config: "test_mps", shard: 1, num_shards: 1, runner: "macos-m2-15" }, | ||||
|         ]} | ||||
|  | ||||

.github/workflows/nightly.yml
| @ -75,11 +75,10 @@ jobs: | ||||
|             repo-owner: pytorch | ||||
|             branch: main | ||||
|             pin-folder: .github/ci_commit_pins | ||||
|           # executorch jobs are disabled since it needs some manual work for the hash update | ||||
|           # - repo-name: executorch | ||||
|           #   repo-owner: pytorch | ||||
|           #   branch: main | ||||
|           #   pin-folder: .ci/docker/ci_commit_pins | ||||
|           - repo-name: executorch | ||||
|             repo-owner: pytorch | ||||
|             branch: main | ||||
|             pin-folder: .ci/docker/ci_commit_pins | ||||
|           - repo-name: triton | ||||
|             repo-owner: triton-lang | ||||
|             branch: main | ||||
| @ -87,7 +86,7 @@ jobs: | ||||
|           - repo-name: vllm | ||||
|             repo-owner: vllm-project | ||||
|             branch: main | ||||
|             pin-folder: .github/ci_commit_pins | ||||
|             pin-folder: .ci/docker/ci_commit_pins | ||||
|     # Allow this to be triggered on either a schedule or on workflow_dispatch to allow for easier testing | ||||
|     if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') | ||||
|     steps: | ||||
|  | ||||

.github/workflows/periodic-rocm-mi300.yml
| @ -59,9 +59,9 @@ jobs: | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] }, | ||||
|           { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] }, | ||||
|           { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] }, | ||||
|           { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, | ||||
|           { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, | ||||
|           { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|  | ||||

.github/workflows/pull.yml
| @ -315,6 +315,21 @@ jobs: | ||||
|       test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-py3-clang18-mobile-build: | ||||
|     name: linux-jammy-py3-clang18-mobile-build | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-jammy-py3-clang12-mobile-build | ||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan | ||||
|       build-generates-artifacts: false | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "default", shard: 1, num_shards: 1 }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build: | ||||
|     name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
| @ -434,7 +449,6 @@ jobs: | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-jammy-py3-clang12-executorch-build: | ||||
|     if: false  # Docker build needs pin update | ||||
|     name: linux-jammy-py3-clang12-executorch | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|  | ||||

.github/workflows/revert.yml
| @ -26,7 +26,7 @@ jobs: | ||||
|           architecture: x64 | ||||
|           check-latest: false | ||||
|           cache: pip | ||||
|       - run: pip install pyyaml==6.0.2 | ||||
|       - run: pip install pyyaml==6.0 | ||||
|  | ||||
|       - name: Setup committer id | ||||
|         run: | | ||||
|  | ||||

.github/workflows/rocm-mi300.yml
| @ -48,12 +48,12 @@ jobs: | ||||
|       sync-tag: rocm-build | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, | ||||
|           { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|           { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|  | ||||

68  .github/workflows/rocm-mi355.yml (vendored)

							| @ -1,68 +0,0 @@ | ||||
| name: rocm-mi355 | ||||
|  | ||||
| on: | ||||
|   workflow_dispatch: | ||||
|   schedule: | ||||
|     - cron: 30 11,1 * * *  # about 4:30am PDT and 6:30pm PDT | ||||
|  | ||||
| concurrency: | ||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} | ||||
|   cancel-in-progress: true | ||||
|  | ||||
| permissions: read-all | ||||
|  | ||||
| jobs: | ||||
|   target-determination: | ||||
|     if: github.repository_owner == 'pytorch' | ||||
|     name: before-test | ||||
|     uses: ./.github/workflows/target_determination.yml | ||||
|     permissions: | ||||
|       id-token: write | ||||
|       contents: read | ||||
|  | ||||
|   get-label-type: | ||||
|     name: get-label-type | ||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main | ||||
|     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} | ||||
|     with: | ||||
|       triggering_actor: ${{ github.triggering_actor }} | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||
|       curr_ref_type: ${{ github.ref_type }} | ||||
|  | ||||
|   linux-noble-rocm-py3_12-build: | ||||
|     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} | ||||
|     name: linux-noble-rocm-py3.12-mi355 | ||||
|     uses: ./.github/workflows/_linux-build.yml | ||||
|     needs: get-label-type | ||||
|     with: | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       build-environment: linux-noble-rocm-py3.12-mi355 | ||||
|       docker-image-name: ci-image:pytorch-linux-noble-rocm-alpha-py3 | ||||
|       sync-tag: rocm-build | ||||
|       test-matrix: | | ||||
|         { include: [ | ||||
|           { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, | ||||
|           { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, | ||||
|           { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, | ||||
|           { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, | ||||
|           { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, | ||||
|           { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, | ||||
|         ]} | ||||
|     secrets: inherit | ||||
|  | ||||
|   linux-noble-rocm-py3_12-test: | ||||
|     permissions: | ||||
|       id-token: write | ||||
|       contents: read | ||||
|     name: linux-noble-rocm-py3.12-mi355 | ||||
|     uses: ./.github/workflows/_rocm-test.yml | ||||
|     needs: | ||||
|       - linux-noble-rocm-py3_12-build | ||||
|       - target-determination | ||||
|     with: | ||||
|       build-environment: linux-noble-rocm-py3.12-mi355 | ||||
|       docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }} | ||||
|       test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }} | ||||
|       tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" | ||||
|     secrets: inherit | ||||

1  .github/workflows/trunk.yml (vendored)

							| @ -94,6 +94,7 @@ jobs: | ||||
|           { config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" }, | ||||
|           { config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" }, | ||||
|           { config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" }, | ||||
|           { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" }, | ||||
|           { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" }, | ||||
|           { config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-15" }, | ||||
|         ]} | ||||
|  | ||||

2  .github/workflows/trymerge.yml (vendored)

							| @ -28,7 +28,7 @@ jobs: | ||||
|           check-latest: false | ||||
|           cache: pip | ||||
|           architecture: x64 | ||||
|       - run: pip install pyyaml==6.0.2 | ||||
|       - run: pip install pyyaml==6.0 | ||||
|  | ||||
|       - name: Setup committer id | ||||
|         run: | | ||||
|  | ||||

2  .github/workflows/tryrebase.yml (vendored)

							| @ -25,7 +25,7 @@ jobs: | ||||
|           architecture: x64 | ||||
|           check-latest: false | ||||
|           cache: pip | ||||
|       - run: pip install pyyaml==6.0.2 | ||||
|       - run: pip install pyyaml==6.0 | ||||
|  | ||||
|       - name: Setup committer id | ||||
|         run: | | ||||
|  | ||||

2  .github/workflows/update-viablestrict.yml (vendored)

							| @ -23,7 +23,7 @@ jobs: | ||||
|         with: | ||||
|           repository: pytorch/pytorch | ||||
|           stable-branch: viable/strict | ||||
|           requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\", \"linux-aarch64\"]' | ||||
|           requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]' | ||||
|           secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }} | ||||
|           clickhouse-url: ${{ secrets.CLICKHOUSE_URL }} | ||||
|           clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }} | ||||
|  | ||||

1  .github/workflows/upload-test-stats.yml (vendored)

							| @ -14,7 +14,6 @@ on: | ||||
|       - inductor-periodic | ||||
|       - rocm | ||||
|       - rocm-mi300 | ||||
|       - rocm-mi355 | ||||
|       - inductor-micro-benchmark | ||||
|       - inductor-micro-benchmark-x86 | ||||
|       - inductor-cu124 | ||||
|  | ||||

187  .github/workflows/win-arm64-build-test.yml (vendored)

							| @ -1,187 +0,0 @@ | ||||
| name: windows-arm64-build-test | ||||
|  | ||||
| on: | ||||
|   push: | ||||
|     tags: | ||||
|       - ciflow/win-arm64/* | ||||
|  | ||||
| env: | ||||
|   GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} | ||||
|   PYTHON_VERSION: "3.12" | ||||
|   PYTORCH_ROOT: ${{ github.workspace }}/pytorch | ||||
|   DOWNLOADS_DIR: c:\temp\downloads | ||||
|   DEPENDENCIES_DIR: c:\temp\dependencies | ||||
|   ENABLE_APL: 1 | ||||
|   ENABLE_OPENBLAS: 0 | ||||
|   BUILD_TYPE: release | ||||
|  | ||||
| permissions: | ||||
|   id-token: write | ||||
|   contents: read | ||||
|  | ||||
| jobs: | ||||
|   build: | ||||
|     # Don't run on forked repos. | ||||
|     if: github.repository_owner == 'pytorch' | ||||
|     runs-on: "windows-11-arm64-preview" | ||||
|     timeout-minutes: 240 | ||||
|     steps: | ||||
|       - name: configure aws credentials | ||||
|         id: aws_creds | ||||
|         uses: aws-actions/configure-aws-credentials@v4 | ||||
|         with: | ||||
|           role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_sscache | ||||
|           aws-region: us-east-1 | ||||
|           role-duration-seconds: 18000 | ||||
|  | ||||
|       - name: Enable long paths | ||||
|         shell: cmd | ||||
|         run: | | ||||
|           git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" | ||||
|           git config --system core.longpaths true | ||||
|  | ||||
|       - name: Git checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
|           path: pytorch | ||||
|           submodules: recursive | ||||
|  | ||||
|       - name: Bootstrap Python | ||||
|         shell: cmd | ||||
|         run: | | ||||
|           "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" | ||||
|  | ||||
|       - name: Parse ref | ||||
|         id: parse-ref | ||||
|         shell: bash | ||||
|         run: python pytorch/.github/scripts/parse_ref.py | ||||
|  | ||||
|       - name: Get workflow job id | ||||
|         shell: bash | ||||
|         id: get-job-id | ||||
|         run: | | ||||
|           set -eux | ||||
|           python pytorch/.github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}" | ||||
|         env: | ||||
|           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||||
|  | ||||
|       - name: Bootstrap APL | ||||
|         shell: cmd | ||||
|         run: | | ||||
|           "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" | ||||
|  | ||||
|       - name: Bootstrap Rust | ||||
|         shell: cmd | ||||
|         run: | | ||||
|           "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" | ||||
|  | ||||
|       - name: Bootstrap sccache | ||||
|         shell: cmd | ||||
|         run: | | ||||
|           "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat" | ||||
|  | ||||
|       - name: Bootstrap Libuv | ||||
|         shell: cmd | ||||
|         run: | | ||||
|           "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat" | ||||
|  | ||||
|       - name: Build | ||||
|         id: build | ||||
|         shell: cmd | ||||
|         env: | ||||
|           PYTORCH_FINAL_PACKAGE_DIR: C:/${{ github.run_id }}/build-results/ | ||||
|           BRANCH: ${{ steps.parse-ref.outputs.branch }} | ||||
|           BUILD_WHEEL: 1 | ||||
|           MAX_JOBS: 8 | ||||
|           PYTHON_VERSION: "3.12" | ||||
|           SCCACHE_BUCKET: "ossci-compiler-cache" | ||||
|           SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} | ||||
|           SCCACHE_REGION: us-east-1 | ||||
|           VC_PRODUCT: "BuildTools" | ||||
|           VC_VERSION: "" | ||||
|           ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" | ||||
|           AWS_DEFAULT_REGION: us-east-1 | ||||
|           USE_CUDA: '0' | ||||
|           USE_XPU: '0' | ||||
|           OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} | ||||
|         run: | | ||||
|           cd pytorch | ||||
|           call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 | ||||
|           powershell -ExecutionPolicy Bypass -File ".ci/pytorch/win-arm64-build.ps1" | ||||
|  | ||||
|       - name: Upload artifacts | ||||
|         uses: actions/upload-artifact@v4.4.0 | ||||
|         if: always() | ||||
|         with: | ||||
|           name: torch-wheel-win-arm64-py3-12 | ||||
|           retention-days: 14 | ||||
|           if-no-files-found: error | ||||
|           path: C:\${{ github.run_id }}\build-results | ||||
|  | ||||
|   test: | ||||
|     if: github.repository_owner == 'pytorch' | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|     runs-on: "windows-11-arm64-preview" | ||||
|     needs: build | ||||
|     steps: | ||||
|       - name: Enable long paths | ||||
|         shell: cmd | ||||
|         run: | | ||||
|           git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" | ||||
|           git config --system core.longpaths true | ||||
|  | ||||
|       - name: Git checkout PyTorch | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
|           path: pytorch | ||||
|           submodules: recursive | ||||
|  | ||||
|       - name: Bootstrap Python | ||||
|         shell: cmd | ||||
|         run: | | ||||
|           "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" | ||||
|  | ||||
|       - name: Bootstrap Rust | ||||
|         shell: cmd | ||||
|         run: | | ||||
|           "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" | ||||
|  | ||||
|       - name: Get workflow job id | ||||
|         shell: bash | ||||
|         id: get-job-id | ||||
|         run: | | ||||
|           set -eux | ||||
|           python pytorch/.github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}" | ||||
|         env: | ||||
|           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||||
|  | ||||
|       - name: Download Build Artifacts | ||||
|         uses: actions/download-artifact@v4.1.7 | ||||
|         with: | ||||
|           name: torch-wheel-win-arm64-py3-12 | ||||
|           path: C:\${{ github.run_id }}\build-results | ||||
|  | ||||
|       - name: Test | ||||
|         id: test | ||||
|         shell: cmd | ||||
|         env: | ||||
|           USE_CUDA: '0' | ||||
|           INSTALL_WINDOWS_SDK: 1 | ||||
|           PYTHON_VERSION: "3.12" | ||||
|           VC_PRODUCT: "BuildTools" | ||||
|           AWS_DEFAULT_REGION: us-east-1 | ||||
|           GITHUB_REPOSITORY: ${{ github.repository }} | ||||
|           GITHUB_WORKFLOW: ${{ github.workflow }} | ||||
|           GITHUB_JOB: ${{ github.job }} | ||||
|           GITHUB_RUN_ID: ${{ github.run_id }} | ||||
|           GITHUB_RUN_NUMBER: ${{ github.run_number }} | ||||
|           GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} | ||||
|           JOB_ID: ${{ steps.get-job-id.outputs.job-id }} | ||||
|           JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} | ||||
|           PYTORCH_FINAL_PACKAGE_DIR: C:/${{ github.run_id }}/build-results/ | ||||
|         run: | | ||||
|           mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" | ||||
|           call pytorch/.ci/pytorch/windows/arm64/bootstrap_tests.bat | ||||
|           set GIT_BASH=C:\Program Files\Git\usr\bin\bash.exe | ||||
|           "%GIT_BASH%" -c "bash --noprofile --norc .ci/pytorch/win-arm64-test.sh" | ||||
| @ -39,16 +39,16 @@ init_command = [ | ||||
|     'python3', | ||||
|     'tools/linter/adapters/pip_init.py', | ||||
|     '--dry-run={{DRYRUN}}', | ||||
|     'flake8==7.3.0', | ||||
|     'flake8-bugbear==24.12.12', | ||||
|     'flake8-comprehensions==3.16.0', | ||||
|     'flake8==6.1.0', | ||||
|     'flake8-bugbear==23.3.23', | ||||
|     'flake8-comprehensions==3.15.0', | ||||
|     'flake8-executable==2.1.3', | ||||
|     'flake8-logging-format==2024.24.12', | ||||
|     'flake8-pyi==25.5.0', | ||||
|     'flake8-simplify==0.22.0', | ||||
|     'flake8-logging-format==0.9.0', | ||||
|     'flake8-pyi==23.3.1', | ||||
|     'flake8-simplify==0.19.3', | ||||
|     'mccabe==0.7.0', | ||||
|     'pycodestyle==2.14.0', | ||||
|     'pyflakes==3.4.0', | ||||
|     'pycodestyle==2.11.1', | ||||
|     'pyflakes==3.1.0', | ||||
|     'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"', | ||||
| ] | ||||
|  | ||||
| @ -158,16 +158,16 @@ init_command = [ | ||||
|     'mypy==1.16.0', | ||||
|     'sympy==1.13.3', | ||||
|     'types-requests==2.27.25', | ||||
|     'types-pyyaml==6.0.2', | ||||
|     'types-pyyaml==6.0.1', | ||||
|     'types-tabulate==0.8.8', | ||||
|     'types-protobuf==5.29.1.20250403', | ||||
|     'types-setuptools==79.0.0.20250422', | ||||
|     'types-jinja2==2.11.9', | ||||
|     'types-colorama==0.4.6', | ||||
|     'filelock==3.18.0', | ||||
|     'filelock==3.13.1', | ||||
|     'junitparser==2.1.1', | ||||
|     'rich==14.1.0', | ||||
|     'pyyaml==6.0.2', | ||||
|     'rich==10.9.0', | ||||
|     'pyyaml==6.0.1', | ||||
|     'optree==0.13.0', | ||||
|     'dataclasses-json==0.6.7', | ||||
|     'pandas==2.2.3', | ||||
| @ -1111,7 +1111,7 @@ init_command = [ | ||||
|     'python3', | ||||
|     'tools/linter/adapters/pip_init.py', | ||||
|     '--dry-run={{DRYRUN}}', | ||||
|     'pyyaml==6.0.2', | ||||
|     'PyYAML==6.0.1', | ||||
| ] | ||||
|  | ||||
| [[linter]] | ||||
| @ -1133,7 +1133,7 @@ init_command = [ | ||||
|     'python3', | ||||
|     'tools/linter/adapters/pip_init.py', | ||||
|     '--dry-run={{DRYRUN}}', | ||||
|     'pyyaml==6.0.2', | ||||
|     'PyYAML==6.0.1', | ||||
| ] | ||||
|  | ||||
| [[linter]] | ||||
| @ -1794,12 +1794,3 @@ include_patterns = [ | ||||
|     'torch/header_only_apis.txt', | ||||
| ] | ||||
| is_formatter = false | ||||
|  | ||||
|  | ||||
| [[linter]] | ||||
| code = "GB_REGISTRY" | ||||
| include_patterns = ["torch/_dynamo/**/*.py"] | ||||
| command = [ | ||||
|   "python3", | ||||
|   "tools/linter/adapters/gb_registry_linter.py", | ||||
| ] | ||||
|  | ||||
| @ -679,7 +679,6 @@ cc_library( | ||||
|         [ | ||||
|             "torch/*.h", | ||||
|             "torch/csrc/**/*.h", | ||||
|             "torch/nativert/**/*.h", | ||||
|             "torch/csrc/distributed/c10d/**/*.hpp", | ||||
|             "torch/lib/libshm/*.h", | ||||
|         ], | ||||
|  | ||||
| @ -564,7 +564,7 @@ if(MSVC) | ||||
|   set(CMAKE_NINJA_CMCLDEPS_RC OFF) | ||||
|   if(MSVC_Z7_OVERRIDE) | ||||
|     # CMake set debug flags to use /Z7 | ||||
|     set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>") | ||||
|     set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded) | ||||
|   endif() | ||||
|   foreach( | ||||
|     flag_var | ||||
| @ -872,14 +872,6 @@ cmake_dependent_option( | ||||
|   "USE_CUDA OR USE_ROCM;NOT MSVC" | ||||
|   OFF) | ||||
|  | ||||
| cmake_dependent_option( | ||||
|   USE_FBGEMM_GENAI | ||||
|   "Whether to build FBGEMM GenAI quantized GEMM kernels.\ | ||||
|   Will be disabled if not supported by the platform" | ||||
|   OFF | ||||
|   "USE_CUDA OR USE_ROCM" | ||||
|   OFF) | ||||
|  | ||||
| # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem | ||||
| # Eff Attention won't | ||||
| cmake_dependent_option( | ||||
| @ -913,10 +905,6 @@ if(USE_FBGEMM) | ||||
|   string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM") | ||||
| endif() | ||||
|  | ||||
| if(USE_FBGEMM_GENAI) | ||||
|   string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM_GENAI") | ||||
| endif() | ||||
|  | ||||
| if(USE_PYTORCH_QNNPACK) | ||||
|   string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK") | ||||
| endif() | ||||
|  | ||||

18  CODEOWNERS

							| @ -14,6 +14,7 @@ | ||||
| /torch/csrc/autograd/ @albanD @soulitzer | ||||
| /torch/autograd/ @albanD @soulitzer | ||||
| /tools/autograd/ @albanD @soulitzer | ||||
| /torch/header_only_apis.txt @janeyx99 | ||||
| /torch/nn/ @albanD @jbschlosser @mikaylagawarecki | ||||
| /torch/optim/ @albanD @janeyx99 | ||||
| /test/test_public_bindings.py @albanD | ||||
| @ -50,12 +51,12 @@ nn/qat/ @jerryzh168 | ||||
| /torch/csrc/distributed/c10d/Ops.* @kwen2501 | ||||
|  | ||||
| # ONNX Export | ||||
| /torch/_dynamo/backends/onnxrt.py @titaiwangms @xadupre @justinchuby | ||||
| /torch/csrc/jit/passes/onnx.h @titaiwangms @xadupre | ||||
| /torch/csrc/jit/passes/onnx.cpp @titaiwangms @xadupre | ||||
| /torch/csrc/jit/passes/onnx/ @titaiwangms @xadupre | ||||
| /torch/onnx/ @titaiwangms @xadupre @justinchuby | ||||
| /test/onnx/  @titaiwangms @xadupre @justinchuby | ||||
| /torch/_dynamo/backends/onnxrt.py @wschin | ||||
| /torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1 | ||||
| /torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1 | ||||
| /torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1 | ||||
| /torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin | ||||
| /test/onnx/  @titaiwangms @shubhambhokare1 @justinchuby @wschin | ||||
|  | ||||
| # CI | ||||
| /.ci  @pytorch/pytorch-dev-infra | ||||
| @ -195,8 +196,3 @@ torch/backends/cudnn/ @eqy @syed-ahmed | ||||
| /torch/utils/_cxx_pytree.py @XuehaiPan | ||||
| /torch/utils/pytree/ @XuehaiPan | ||||
| /torch/_dynamo/polyfills/pytree.py @XuehaiPan | ||||
|  | ||||
| # Relating to libtorch ABI | ||||
| /torch/csrc/stable/ @janeyx99 @mikaylagawarecki | ||||
| /torch/headeronly/ @janeyx99 | ||||
| /torch/header_only_apis.txt @janeyx99 | ||||
|  | ||||

15  Dockerfile

							| @ -47,6 +47,18 @@ WORKDIR /opt/pytorch | ||||
| COPY . . | ||||
| RUN git submodule update --init --recursive | ||||
|  | ||||
| FROM conda as build | ||||
| ARG CMAKE_VARS | ||||
| WORKDIR /opt/pytorch | ||||
| COPY --from=conda /opt/conda /opt/conda | ||||
| COPY --from=submodule-update /opt/pytorch /opt/pytorch | ||||
| RUN make triton | ||||
| RUN --mount=type=cache,target=/opt/ccache \ | ||||
|     export eval ${CMAKE_VARS} && \ | ||||
|     TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ | ||||
|     CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ | ||||
|     python -m pip install --no-build-isolation -v . | ||||
|  | ||||
| FROM conda as conda-installs | ||||
| ARG PYTHON_VERSION=3.11 | ||||
| ARG CUDA_PATH=cu121 | ||||
| @ -97,5 +109,4 @@ WORKDIR /workspace | ||||
|  | ||||
| FROM official as dev | ||||
| # Should override the already installed version from the official-image stage | ||||
| COPY --from=conda /opt/conda /opt/conda | ||||
| COPY --from=submodule-update /opt/pytorch /opt/pytorch | ||||
| COPY --from=build /opt/conda /opt/conda | ||||
|  | ||||
| @ -294,12 +294,14 @@ Install PyTorch | ||||
|  | ||||
| ```bash | ||||
| export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" | ||||
| python -m pip install -r requirements-build.txt | ||||
| python -m pip install --no-build-isolation -v -e . | ||||
| ``` | ||||
|  | ||||
| **On macOS** | ||||
|  | ||||
| ```bash | ||||
| python -m pip install -r requirements-build.txt | ||||
| python -m pip install --no-build-isolation -v -e . | ||||
| ``` | ||||
|  | ||||
|  | ||||
| @ -247,50 +247,6 @@ if(USE_MEM_EFF_ATTENTION) | ||||
|   list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu}) | ||||
| endif() | ||||
|  | ||||
| IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) | ||||
|   message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF") | ||||
|   set(USE_FBGEMM_GENAI off) | ||||
| endif() | ||||
|  | ||||
| # FBGEMM GenAI | ||||
| IF(USE_FBGEMM_GENAI) | ||||
|   set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/) | ||||
|   set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize) | ||||
|  | ||||
|   if(USE_ROCM) | ||||
|     # Only include the kernels we want to build to avoid increasing binary size. | ||||
|     file(GLOB_RECURSE fbgemm_genai_native_rocm_hip | ||||
|       "${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" | ||||
|       "${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") | ||||
|     set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) | ||||
|  | ||||
|     # Add additional HIPCC compiler flags for performance | ||||
|     set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS | ||||
|       -mllvm | ||||
|       -amdgpu-coerce-illegal-types=1 | ||||
|       -mllvm | ||||
|       -enable-post-misched=0 | ||||
|       -mllvm | ||||
|       -greedy-reverse-local-assignment=1 | ||||
|       -fhip-new-launch-api) | ||||
|  | ||||
|     hip_add_library( | ||||
|       fbgemm_genai STATIC | ||||
|       ${fbgemm_genai_native_rocm_hip} | ||||
|       HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) | ||||
|     set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) | ||||
|     target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) | ||||
|  | ||||
|     target_include_directories(fbgemm_genai PUBLIC | ||||
|       # FBGEMM version of Composable Kernel is used due to some customizations | ||||
|       ${FBGEMM_THIRD_PARTY}/composable_kernel/include | ||||
|       ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include | ||||
|       ${FBGEMM_GENAI_DIR}/include/ | ||||
|       ${FBGEMM_GENAI_DIR}/common/include/ | ||||
|     ) | ||||
|   endif() | ||||
| endif() | ||||
|  | ||||
| # XNNPACK | ||||
| file(GLOB native_xnnpack "native/xnnpack/*.cpp") | ||||
|  | ||||
| @ -439,7 +395,6 @@ if(USE_ROCM) | ||||
|   list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) | ||||
|   list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) | ||||
|   list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) | ||||
|   list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/example/ck_tile/01_fmha) | ||||
|   list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) | ||||
|   list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include) | ||||
|   _pytorch_rocm_generate_ck_conf() | ||||
| @ -631,10 +586,17 @@ if(USE_CUDA AND NOT USE_ROCM) | ||||
|       CUDA::cufft_static_nocallback | ||||
|     ) | ||||
|    if(NOT BUILD_LAZY_CUDA_LINALG) | ||||
|      list(APPEND ATen_CUDA_DEPENDENCY_LIBS | ||||
|        CUDA::cusolver_static | ||||
|        ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a     # needed for libcusolver_static | ||||
|      ) | ||||
|      if(CUDA_VERSION_MAJOR LESS_EQUAL 11) | ||||
|        list(APPEND ATen_CUDA_DEPENDENCY_LIBS | ||||
|          CUDA::cusolver_static | ||||
|          ${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a     # needed for libcusolver_static | ||||
|        ) | ||||
|      elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 12) | ||||
|        list(APPEND ATen_CUDA_DEPENDENCY_LIBS | ||||
|          CUDA::cusolver_static | ||||
|          ${CUDAToolkit_LIBRARY_DIR}/libcusolver_lapack_static.a     # needed for libcusolver_static | ||||
|        ) | ||||
|      endif() | ||||
|    endif() | ||||
|   else() | ||||
|     list(APPEND ATen_CUDA_DEPENDENCY_LIBS | ||||
| @ -704,17 +666,21 @@ if(USE_MPS) | ||||
|     if(CAN_COMPILE_METAL) | ||||
|         foreach(SHADER ${native_mps_metal}) | ||||
|             cmake_path(GET SHADER STEM TGT_STEM) | ||||
|             string(CONCAT TGT_BASIC ${TGT_STEM} "_31.air") | ||||
|             string(CONCAT TGT_BASIC ${TGT_STEM} "_30.air") | ||||
|             string(CONCAT TGT_BFLOAT ${TGT_STEM} "_31.air") | ||||
|             list(APPEND AIR_BASIC ${TGT_BASIC}) | ||||
|             metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.1") | ||||
|             list(APPEND AIR_BFLOAT ${TGT_BFLOAT}) | ||||
|             metal_to_air(${SHADER} ${TGT_BASIC} "-std=metal3.0") | ||||
|             metal_to_air(${SHADER} ${TGT_BFLOAT} "-std=metal3.1") | ||||
|         endforeach() | ||||
|         air_to_metallib(kernels_basic.metallib ${AIR_BASIC}) | ||||
|         air_to_metallib(kernels_bfloat.metallib ${AIR_BFLOAT}) | ||||
|         add_custom_command( | ||||
|                           COMMAND echo "// $$(date)" > metallib_dummy.cpp | ||||
|                           DEPENDS kernels_basic.metallib | ||||
|                           DEPENDS kernels_basic.metallib kernels_bfloat.metallib | ||||
|                           OUTPUT metallib_dummy.cpp | ||||
|                           COMMENT "Updating metallibs timestamp") | ||||
|         add_custom_target(metallibs DEPENDS kernels_basic.metallib metallib_dummy.cpp) | ||||
|         add_custom_target(metallibs DEPENDS kernels_basic.metallib kernels_bfloat.metallib metallib_dummy.cpp) | ||||
|     else() | ||||
|         file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/native/mps") | ||||
|         foreach(SHADER ${native_mps_metal}) | ||||
|  | ||||
| @ -14,9 +14,7 @@ | ||||
| #include <ATen/cpu/FlushDenormal.h> | ||||
|  | ||||
| #ifdef USE_FBGEMM | ||||
| C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") | ||||
| #include <fbgemm/Fbgemm.h> | ||||
| C10_DIAGNOSTIC_POP() | ||||
| #endif // USE_FBGEMM | ||||
| #if defined(__aarch64__) && !defined(C10_MOBILE) | ||||
| #include <cpuinfo.h> | ||||
| @ -334,14 +332,6 @@ void Context::setBenchmarkLimitCuDNN(int b) { | ||||
|   benchmark_limit_cudnn = b; | ||||
| } | ||||
|  | ||||
| bool Context::immediateMiopen() const { | ||||
|   return immediate_miopen; | ||||
| } | ||||
|  | ||||
| void Context::setImmediateMiopen(bool b) { | ||||
|   immediate_miopen = b; | ||||
| } | ||||
|  | ||||
| bool Context::allowTF32CuBLAS() const { | ||||
| #ifdef USE_ROCM | ||||
|     const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); | ||||
| @ -512,7 +502,7 @@ at::BlasBackend Context::blasPreferredBackend() { | ||||
|       static const std::vector<std::string> archs = { | ||||
|           "gfx90a", "gfx942", | ||||
| #if ROCM_VERSION >= 60300 | ||||
|           "gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908", | ||||
|           "gfx1100", "gfx1101", "gfx1200", "gfx1201", | ||||
| #endif | ||||
| #if ROCM_VERSION >= 60500 | ||||
|           "gfx950" | ||||
|  | ||||
| @ -205,8 +205,6 @@ class TORCH_API Context { | ||||
|   void setBenchmarkCuDNN(bool); | ||||
|   int benchmarkLimitCuDNN() const; | ||||
|   void setBenchmarkLimitCuDNN(int); | ||||
|   bool immediateMiopen() const; | ||||
|   void setImmediateMiopen(bool); | ||||
|   bool deterministicCuDNN() const; | ||||
|   void setDeterministicCuDNN(bool); | ||||
|   bool deterministicMkldnn() const; | ||||
| @ -442,7 +440,6 @@ class TORCH_API Context { | ||||
|   bool enabled_overrideable = true; | ||||
|   bool allow_fp16_bf16_reduction_mathSDP = false; | ||||
|   bool benchmark_cudnn = false; | ||||
|   bool immediate_miopen = false; | ||||
|   Float32MatmulPrecision float32_matmul_precision = | ||||
|       c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true | ||||
|       ? at::Float32MatmulPrecision::HIGH | ||||
|  | ||||
| @ -132,9 +132,6 @@ DLDevice torchDeviceToDLDevice(at::Device device) { | ||||
|     case DeviceType::PrivateUse1: | ||||
|       ctx.device_type = DLDeviceType::kDLExtDev; | ||||
|       break; | ||||
|     case DeviceType::MPS: | ||||
|       ctx.device_type = DLDeviceType::kDLMetal; | ||||
|       break; | ||||
|     default: | ||||
|       TORCH_CHECK_BUFFER(false, "Cannot pack tensors on " + device.str()); | ||||
|   } | ||||
| @ -167,8 +164,6 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat | ||||
|       return at::Device(DeviceType::MAIA, index); | ||||
|     case DLDeviceType::kDLExtDev: | ||||
|       return at::Device(DeviceType::PrivateUse1, index); | ||||
|     case DLDeviceType::kDLMetal: | ||||
|       return at::Device(DeviceType::MPS, index); | ||||
|     default: | ||||
|       TORCH_CHECK_BUFFER( | ||||
|           false, "Unsupported device_type: ", std::to_string(type)); | ||||
|  | ||||
| @ -1,5 +1,6 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include <c10/core/CachingDeviceAllocator.h> | ||||
| #include <c10/core/DeviceType.h> | ||||
| #include <c10/macros/Macros.h> | ||||
|  | ||||
| @ -72,6 +73,27 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index); | ||||
| // original device index that was active before the change. | ||||
| TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index); | ||||
|  | ||||
| TORCH_API inline void emptyCache() { | ||||
|   const auto device_type = getAccelerator(true).value(); | ||||
|   at::getDeviceAllocator(device_type)->emptyCache(); | ||||
| } | ||||
|  | ||||
| TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats( | ||||
|     c10::DeviceIndex device_index) { | ||||
|   const auto device_type = getAccelerator(true).value(); | ||||
|   return at::getDeviceAllocator(device_type)->getDeviceStats(device_index); | ||||
| } | ||||
|  | ||||
| TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) { | ||||
|   const auto device_type = getAccelerator(true).value(); | ||||
|   at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index); | ||||
| } | ||||
|  | ||||
| TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) { | ||||
|   const auto device_type = getAccelerator(true).value(); | ||||
|   at::getDeviceAllocator(device_type)->resetPeakStats(device_index); | ||||
| } | ||||
|  | ||||
| } // namespace at::accelerator | ||||
|  | ||||
| namespace at { | ||||
|  | ||||
| @ -9,36 +9,7 @@ | ||||
|  | ||||
| namespace at { | ||||
|  | ||||
|  /* | ||||
|   * Design: | ||||
|   * 1. ZeroTensors are regular tensors with TensorOptions, a storage | ||||
|   *    pointing to nullptr and a ZeroTensor dispatch key set. | ||||
|   * | ||||
|   * 2. ZeroTensors are immutable. This is done to prevent data race in the case of multithreading | ||||
|   *    (when two threads try to read the same zero tensor and materialize it in-place). | ||||
|   * | ||||
|   * 3. ZeroTensor has a boxed fallback that will be dispatched to any ops that don't | ||||
|   *    have special ZeroTensor handling. This fallback materializes each ZeroTensor to | ||||
|   *    `at::zeros({}, tensor.options()).expand(tensor.sizes())`. | ||||
|  | ||||
|   * 4. ZeroTensors are handled above autograd. This is necessary because fallback | ||||
|   *    operations are not differentiable. | ||||
|   *     - Example: Consider add in the case it was using the fallback: zerotensor_a + b. | ||||
|   *       zerotensor_a would be materialized to c=torch.zeros_like(zerotensor_a) after | ||||
|   *       passing through the fallback. If this happens above the autograd, then the | ||||
|   *       gradients would be populated on c instead of zerotensor_a. | ||||
|   * | ||||
|   * 5. The grad field is always populated with an honest to goodness tensor. This | ||||
|   *    materialization of ZeroTensors will happen in: | ||||
|   *     - AccumulateGrad for Backward Mode AD. | ||||
|   *     - will never be required for ForwardMode AD. | ||||
|   *       - This is because if all the tangents were undefined (efficient ZeroTensors), | ||||
|   *         no computation will be performed (this is ensured via an existing pre-check). | ||||
|   * | ||||
|   * Today ZeroTensors are primarily used to represent undefined gradients in forward AD, | ||||
|   * it does not perfectly handle NaNs and Infs as we don't check the actual values | ||||
|   * and assume that they are non-zero, non-inf, non-NaN etc. | ||||
|   */ | ||||
|   // TODO: add a note explaining the design decisions | ||||
|   // ZeroTensors are designed to be immutable. Thus, we error out when an in-place operation is performed on ZeroTensors | ||||
|   static void zeroTensorFallback(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { | ||||
|     const auto& arguments = op.schema().arguments(); | ||||
|  | ||||
| @ -1 +1,55 @@ | ||||
| #include <torch/headeronly/cpu/vec/intrinsics.h> | ||||
| #pragma once | ||||
| #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) | ||||
| /* GCC or clang-compatible compiler, targeting x86/x86-64 */ | ||||
| #include <x86intrin.h> | ||||
| #elif defined(__clang__) && (defined(__ARM_NEON__) || defined(__aarch64__)) | ||||
| /* Clang-compatible compiler, targeting arm neon */ | ||||
| #include <arm_neon.h> | ||||
| #if defined(__ARM_FEATURE_SVE) | ||||
| /* CLANG-compatible compiler, targeting ARM with SVE */ | ||||
| #include <arm_sve.h> | ||||
| #endif | ||||
| #elif defined(_MSC_VER) | ||||
| /* Microsoft C/C++-compatible compiler */ | ||||
| #include <intrin.h> | ||||
| #if _MSC_VER <= 1900 | ||||
| #define _mm256_extract_epi64(X, Y) \ | ||||
|   (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2)) | ||||
| #define _mm256_extract_epi32(X, Y) \ | ||||
|   (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4)) | ||||
| #define _mm256_extract_epi16(X, Y) \ | ||||
|   (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8)) | ||||
| #define _mm256_extract_epi8(X, Y) \ | ||||
|   (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16)) | ||||
| #endif | ||||
| #elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__)) | ||||
| /* GCC-compatible compiler, targeting ARM with NEON */ | ||||
| #include <arm_neon.h> | ||||
| #if defined(__ARM_FEATURE_SVE) | ||||
| /* GCC-compatible compiler, targeting ARM with SVE */ | ||||
| #include <arm_sve.h> | ||||
| #endif | ||||
| #if defined(MISSING_ARM_VLD1) | ||||
| #include <ATen/cpu/vec/vec256/missing_vld1_neon.h> | ||||
| #elif defined(MISSING_ARM_VST1) | ||||
| #include <ATen/cpu/vec/vec256/missing_vst1_neon.h> | ||||
| #endif | ||||
| #elif defined(__GNUC__) && defined(__IWMMXT__) | ||||
| /* GCC-compatible compiler, targeting ARM with WMMX */ | ||||
| #include <mmintrin.h> | ||||
| #elif defined(__s390x__) | ||||
| // targets Z/architecture | ||||
| // we will include vecintrin later | ||||
| #elif (defined(__GNUC__) || defined(__xlC__)) && \ | ||||
|     (defined(__VEC__) || defined(__ALTIVEC__)) | ||||
| /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ | ||||
| #include <altivec.h> | ||||
| /* We need to undef those tokens defined by <altivec.h> to avoid conflicts | ||||
|    with the C++ types. => Can still use __bool/__vector */ | ||||
| #undef bool | ||||
| #undef vector | ||||
| #undef pixel | ||||
| #elif defined(__GNUC__) && defined(__SPE__) | ||||
| /* GCC-compatible compiler, targeting PowerPC with SPE */ | ||||
| #include <spe.h> | ||||
| #endif | ||||
|  | ||||
| @ -5,7 +5,6 @@ | ||||
| #include <ATen/cpu/vec/sve/vec_common_sve.h> | ||||
| #include <ATen/cpu/vec/sve/vec_float.h> | ||||
| #include <ATen/cpu/vec/vec_base.h> | ||||
| #include <c10/util/bit_cast.h> | ||||
| #include <cmath> | ||||
| namespace at { | ||||
| namespace vec { | ||||
| @ -37,7 +36,7 @@ class Vectorized<BFloat16> { | ||||
|     return VECTOR_WIDTH / sizeof(BFloat16); | ||||
|   } | ||||
|  | ||||
|   Vectorized(); | ||||
|   Vectorized() {} | ||||
|   Vectorized(svbfloat16_t v) : values(v) {} | ||||
|   Vectorized(int val); | ||||
|   Vectorized(BFloat16 val); | ||||
| @ -307,11 +306,6 @@ Vectorized<c10::BFloat16> inline operator/( | ||||
|   return binary_operator_via_float(std::divides<Vectorized<float>>(), a, b); | ||||
| } | ||||
|  | ||||
| inline Vectorized<BFloat16>::Vectorized() { | ||||
|   const short zero = 0; | ||||
|   values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero)); | ||||
| } | ||||
|  | ||||
| inline Vectorized<BFloat16>::Vectorized(int val) { | ||||
|   auto vals_f = svdup_n_f32(val); | ||||
|   values = convert_float_bfloat16(vals_f, vals_f); | ||||
|  | ||||
| @ -38,9 +38,7 @@ class Vectorized<double> { | ||||
|   static constexpr size_type size() { | ||||
|     return VECTOR_WIDTH / sizeof(double); | ||||
|   } | ||||
|   Vectorized() { | ||||
|     values = svdup_n_f64(0); | ||||
|   } | ||||
|   Vectorized() {} | ||||
|   Vectorized(svfloat64_t v) : values(v) {} | ||||
|   Vectorized(double val) { | ||||
|     values = svdup_n_f64(val); | ||||
| @ -587,30 +585,6 @@ Vectorized<double> inline fmadd( | ||||
|   return svmad_f64_x(ptrue, a, b, c); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline fnmadd( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b, | ||||
|     const Vectorized<double>& c) { | ||||
|   return svmsb_f64_x(ptrue, a, b, c); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline fmsub( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b, | ||||
|     const Vectorized<double>& c) { | ||||
|   return svnmsb_f64_x(ptrue, a, b, c); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline fnmsub( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b, | ||||
|     const Vectorized<double>& c) { | ||||
|   return svnmad_f64_x(ptrue, a, b, c); | ||||
| } | ||||
|  | ||||
| #endif // defined(CPU_CAPABILITY_SVE) | ||||
|  | ||||
| } // namespace CPU_CAPABILITY | ||||
|  | ||||
| @ -38,9 +38,7 @@ class Vectorized<float> { | ||||
|   static constexpr size_type size() { | ||||
|     return VECTOR_WIDTH / sizeof(float); | ||||
|   } | ||||
|   Vectorized() { | ||||
|     values = svdup_n_f32(0); | ||||
|   } | ||||
|   Vectorized() {} | ||||
|   Vectorized(svfloat32_t v) : values(v) {} | ||||
|   Vectorized(float val) { | ||||
|     values = svdup_n_f32(val); | ||||
| @ -758,30 +756,6 @@ Vectorized<float> inline fmadd( | ||||
|   return svmad_f32_x(ptrue, a, b, c); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<float> inline fnmadd( | ||||
|     const Vectorized<float>& a, | ||||
|     const Vectorized<float>& b, | ||||
|     const Vectorized<float>& c) { | ||||
|   return svmsb_f32_x(ptrue, a, b, c); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<float> inline fmsub( | ||||
|     const Vectorized<float>& a, | ||||
|     const Vectorized<float>& b, | ||||
|     const Vectorized<float>& c) { | ||||
|   return svnmsb_f32_x(ptrue, a, b, c); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<float> inline fnmsub( | ||||
|     const Vectorized<float>& a, | ||||
|     const Vectorized<float>& b, | ||||
|     const Vectorized<float>& c) { | ||||
|   return svnmad_f32_x(ptrue, a, b, c); | ||||
| } | ||||
|  | ||||
| #endif // defined(CPU_CAPABILITY_SVE) | ||||
|  | ||||
| } // namespace CPU_CAPABILITY | ||||
|  | ||||
| @ -32,9 +32,7 @@ inline namespace CPU_CAPABILITY { | ||||
|     static constexpr size_type size() {                                       \ | ||||
|       return vl;                                                              \ | ||||
|     }                                                                         \ | ||||
|     Vectorized() {                                                            \ | ||||
|       values = svdup_n_s##bit(0);                                             \ | ||||
|     }                                                                         \ | ||||
|     Vectorized() {}                                                           \ | ||||
|     Vectorized(svint##bit##_t v) : values(v) {}                               \ | ||||
|     Vectorized(int##bit##_t val) {                                            \ | ||||
|       values = svdup_n_s##bit(val);                                           \ | ||||
|  | ||||
| @ -552,15 +552,6 @@ Vectorized<c10::BFloat16> inline fmadd( | ||||
|   return a * b + c; | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<c10::BFloat16> inline fnmadd( | ||||
|     const Vectorized<c10::BFloat16>& a, | ||||
|     const Vectorized<c10::BFloat16>& b, | ||||
|     const Vectorized<c10::BFloat16>& c) { | ||||
|   // See NOTE [BF16 FMA] above. | ||||
|   return -a * b + c; | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<c10::BFloat16> inline fmsub( | ||||
|     const Vectorized<c10::BFloat16>& a, | ||||
| @ -570,15 +561,6 @@ Vectorized<c10::BFloat16> inline fmsub( | ||||
|   return a * b - c; | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<c10::BFloat16> inline fnmsub( | ||||
|     const Vectorized<c10::BFloat16>& a, | ||||
|     const Vectorized<c10::BFloat16>& b, | ||||
|     const Vectorized<c10::BFloat16>& c) { | ||||
|   // See NOTE [BF16 FMA] above. | ||||
|   return -a * b - c; | ||||
| } | ||||
|  | ||||
| #endif // !defined(C10_MOBILE) && defined(__aarch64__) | ||||
|  | ||||
| } // namespace CPU_CAPABILITY | ||||
|  | ||||
| @ -83,9 +83,7 @@ class Vectorized<float> { | ||||
|   static constexpr size_type size() { | ||||
|     return 4; | ||||
|   } | ||||
|   Vectorized() { | ||||
|     values = vmovq_n_f32(0); | ||||
|   } | ||||
|   Vectorized() {} | ||||
|   Vectorized(float32x4_t v) : values(v) {} | ||||
|   Vectorized(float val) : values{vdupq_n_f32(val)} {} | ||||
|   Vectorized(float val0, float val1, float val2, float val3) | ||||
| @ -584,14 +582,6 @@ Vectorized<float> inline fmadd( | ||||
|   return Vectorized<float>(vfmaq_f32(c, a, b)); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<float> inline fnmadd( | ||||
|     const Vectorized<float>& a, | ||||
|     const Vectorized<float>& b, | ||||
|     const Vectorized<float>& c) { | ||||
|   return Vectorized<float>(vfmsq_f32(c, a, b)); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<float> inline fmsub( | ||||
|     const Vectorized<float>& a, | ||||
| @ -600,14 +590,6 @@ Vectorized<float> inline fmsub( | ||||
|   return Vectorized<float>(vnegq_f32(vfmsq_f32(c, a, b))); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<float> inline fnmsub( | ||||
|     const Vectorized<float>& a, | ||||
|     const Vectorized<float>& b, | ||||
|     const Vectorized<float>& c) { | ||||
|   return Vectorized<float>(vnegq_f32(vfmaq_f32(c, a, b))); | ||||
| } | ||||
|  | ||||
| inline Vectorized<float> Vectorized<float>::erf() const { | ||||
|   // constants | ||||
|   const Vectorized<float> neg_zero_vec(-0.f); | ||||
|  | ||||
| @ -621,18 +621,6 @@ Vectorized<c10::Half> inline fmadd( | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<c10::Half> inline fnmadd( | ||||
|     const Vectorized<c10::Half>& a, | ||||
|     const Vectorized<c10::Half>& b, | ||||
|     const Vectorized<c10::Half>& c) { | ||||
| #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | ||||
|   return Vectorized<c10::Half>(vfmsq_f16(c, a, b)); | ||||
| #else | ||||
|   return -a * b + c; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<c10::Half> inline fmsub( | ||||
|     const Vectorized<c10::Half>& a, | ||||
| @ -644,18 +632,6 @@ Vectorized<c10::Half> inline fmsub( | ||||
|   return a * b - c; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<c10::Half> inline fnmsub( | ||||
|     const Vectorized<c10::Half>& a, | ||||
|     const Vectorized<c10::Half>& b, | ||||
|     const Vectorized<c10::Half>& c) { | ||||
| #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | ||||
|   return Vectorized<c10::Half>(vnegq_f16(vfmaq_f16(c, a, b))); | ||||
| #else | ||||
|   return -a * b - c; | ||||
| #endif | ||||
| } | ||||
| #endif // !defined(C10_MOBILE) && defined(__aarch64__) | ||||
|  | ||||
| } // namespace CPU_CAPABILITY | ||||
|  | ||||
| @ -1 +1,396 @@ | ||||
| #include <torch/headeronly/cpu/vec/vec256/missing_vld1_neon.h> | ||||
| /* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7.  */ | ||||
|  | ||||
| __extension__ extern __inline uint8x8x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_u8_x2(const uint8_t* __a) { | ||||
|   uint8x8x2_t ret; | ||||
|   asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline int8x8x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_s8_x2(const int8_t* __a) { | ||||
|   int8x8x2_t ret; | ||||
|   asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline uint16x4x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_u16_x2(const uint16_t* __a) { | ||||
|   uint16x4x2_t ret; | ||||
|   asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline int16x4x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_s16_x2(const int16_t* __a) { | ||||
|   int16x4x2_t ret; | ||||
|   asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline uint32x2x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_u32_x2(const uint32_t* __a) { | ||||
|   uint32x2x2_t ret; | ||||
|   asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline int32x2x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_s32_x2(const int32_t* __a) { | ||||
|   int32x2x2_t ret; | ||||
|   asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline uint64x1x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_u64_x2(const uint64_t* __a) { | ||||
|   uint64x1x2_t ret; | ||||
|   asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline int64x1x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_s64_x2(const int64_t* __a) { | ||||
|   int64x1x2_t ret; | ||||
|   __builtin_aarch64_simd_oi __o; | ||||
|   asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline float16x4x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_f16_x2(const float16_t* __a) { | ||||
|   float16x4x2_t ret; | ||||
|   asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline float32x2x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_f32_x2(const float32_t* __a) { | ||||
|   float32x2x2_t ret; | ||||
|   asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline float64x1x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_f64_x2(const float64_t* __a) { | ||||
|   float64x1x2_t ret; | ||||
|   asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline poly8x8x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_p8_x2(const poly8_t* __a) { | ||||
|   poly8x8x2_t ret; | ||||
|   asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline poly16x4x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_p16_x2(const poly16_t* __a) { | ||||
|   poly16x4x2_t ret; | ||||
|   asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline poly64x1x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1_p64_x2(const poly64_t* __a) { | ||||
|   poly64x1x2_t ret; | ||||
|   asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline uint8x16x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_u8_x2(const uint8_t* __a) { | ||||
|   uint8x16x2_t ret; | ||||
|   asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline int8x16x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_s8_x2(const int8_t* __a) { | ||||
|   int8x16x2_t ret; | ||||
|   asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline uint16x8x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_u16_x2(const uint16_t* __a) { | ||||
|   uint16x8x2_t ret; | ||||
|   asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline int16x8x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_s16_x2(const int16_t* __a) { | ||||
|   int16x8x2_t ret; | ||||
|   asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline uint32x4x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_u32_x2(const uint32_t* __a) { | ||||
|   uint32x4x2_t ret; | ||||
|   asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline int32x4x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_s32_x2(const int32_t* __a) { | ||||
|   int32x4x2_t ret; | ||||
|   asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline uint64x2x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_u64_x2(const uint64_t* __a) { | ||||
|   uint64x2x2_t ret; | ||||
|   asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline int64x2x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_s64_x2(const int64_t* __a) { | ||||
|   int64x2x2_t ret; | ||||
|   asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline float16x8x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_f16_x2(const float16_t* __a) { | ||||
|   float16x8x2_t ret; | ||||
|   asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline float32x4x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_f32_x2(const float32_t* __a) { | ||||
|   float32x4x2_t ret; | ||||
|   asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline float64x2x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_f64_x2(const float64_t* __a) { | ||||
|   float64x2x2_t ret; | ||||
|   asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline poly8x16x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_p8_x2(const poly8_t* __a) { | ||||
|   poly8x16x2_t ret; | ||||
|   asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline poly16x8x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_p16_x2(const poly16_t* __a) { | ||||
|   poly16x8x2_t ret; | ||||
|   asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline poly64x2x2_t | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vld1q_p64_x2(const poly64_t* __a) { | ||||
|   poly64x2x2_t ret; | ||||
|   asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| /* vst1x2 */ | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_s64_x2(int64_t* __a, int64x1x2_t val) { | ||||
|   asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_u64_x2(uint64_t* __a, uint64x1x2_t val) { | ||||
|   asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_f64_x2(float64_t* __a, float64x1x2_t val) { | ||||
|   asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_s8_x2(int8_t* __a, int8x8x2_t val) { | ||||
|   asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_p8_x2(poly8_t* __a, poly8x8x2_t val) { | ||||
|   asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_s16_x2(int16_t* __a, int16x4x2_t val) { | ||||
|   asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_p16_x2(poly16_t* __a, poly16x4x2_t val) { | ||||
|   asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_s32_x2(int32_t* __a, int32x2x2_t val) { | ||||
|   asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_u8_x2(uint8_t* __a, uint8x8x2_t val) { | ||||
|   asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_u16_x2(uint16_t* __a, uint16x4x2_t val) { | ||||
|   asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_u32_x2(uint32_t* __a, uint32x2x2_t val) { | ||||
|   asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_f16_x2(float16_t* __a, float16x4x2_t val) { | ||||
|   asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_f32_x2(float32_t* __a, float32x2x2_t val) { | ||||
|   asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1_p64_x2(poly64_t* __a, poly64x1x2_t val) { | ||||
|   asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_s8_x2(int8_t* __a, int8x16x2_t val) { | ||||
|   asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_p8_x2(poly8_t* __a, poly8x16x2_t val) { | ||||
|   asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_s16_x2(int16_t* __a, int16x8x2_t val) { | ||||
|   asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_p16_x2(poly16_t* __a, poly16x8x2_t val) { | ||||
|   asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_s32_x2(int32_t* __a, int32x4x2_t val) { | ||||
|   asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_s64_x2(int64_t* __a, int64x2x2_t val) { | ||||
|   asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_u8_x2(uint8_t* __a, uint8x16x2_t val) { | ||||
|   asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_u16_x2(uint16_t* __a, uint16x8x2_t val) { | ||||
|   asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_u32_x2(uint32_t* __a, uint32x4x2_t val) { | ||||
|   asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_u64_x2(uint64_t* __a, uint64x2x2_t val) { | ||||
|   asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_f16_x2(float16_t* __a, float16x8x2_t val) { | ||||
|   asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_f32_x2(float32_t* __a, float32x4x2_t val) { | ||||
|   asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_f64_x2(float64_t* __a, float64x2x2_t val) { | ||||
|   asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_p64_x2(poly64_t* __a, poly64x2x2_t val) { | ||||
|   asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
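A minimal usage sketch (not part of this diff; the helper name `copy8f` is hypothetical and it assumes an AArch64 build where the fallback intrinsics above are in scope): the paired `_x2` forms move two q-registers with a single ld1/st1, e.g. copying eight floats at once.

#include <arm_neon.h>

static inline void copy8f(const float32_t* src, float32_t* dst) {
  float32x4x2_t v = vld1q_f32_x2(src);  // one ld1 loading two q-registers
  vst1q_f32_x2(dst, v);                 // one st1 storing both halves
}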
|  | ||||
| @ -1 +1,7 @@ | ||||
| #include <torch/headeronly/cpu/vec/vec256/missing_vst1_neon.h> | ||||
| /* Workaround for missing vst1q_f32_x2 in gcc-8.  */ | ||||
|  | ||||
| __extension__ extern __inline void | ||||
|     __attribute__((__always_inline__, __gnu_inline__, __artificial__)) | ||||
|     vst1q_f32_x2(float32_t* __a, float32x4x2_t val) { | ||||
|   asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); | ||||
| } | ||||
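Where the gcc-8 inline-asm fallback above is unavailable or unwanted, an equivalent portable form (a hypothetical sketch, not taken from this diff) issues two ordinary stores instead of the single paired st1:

#include <arm_neon.h>

static inline void vst1q_f32_x2_fallback(float32_t* a, float32x4x2_t v) {
  vst1q_f32(a, v.val[0]);      // store low half
  vst1q_f32(a + 4, v.val[1]);  // store high half
}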
|  | ||||
| @ -34,9 +34,7 @@ class Vectorized<c10::complex<double>> { | ||||
|   static constexpr size_type size() { | ||||
|     return 2; | ||||
|   } | ||||
|   Vectorized() { | ||||
|     values = _mm256_setzero_pd(); | ||||
|   } | ||||
|   Vectorized() {} | ||||
|   Vectorized(__m256d v) : values(v) {} | ||||
|   Vectorized(c10::complex<double> val) { | ||||
|     double real_value = val.real(); | ||||
|  | ||||
| @ -33,9 +33,7 @@ class Vectorized<c10::complex<float>> { | ||||
|   static constexpr size_type size() { | ||||
|     return 4; | ||||
|   } | ||||
|   Vectorized() { | ||||
|     values = _mm256_setzero_ps(); | ||||
|   } | ||||
|   Vectorized() {} | ||||
|   Vectorized(__m256 v) : values(v) {} | ||||
|   Vectorized(c10::complex<float> val) { | ||||
|     float real_value = val.real(); | ||||
|  | ||||
| @ -31,9 +31,7 @@ class Vectorized<double> { | ||||
|   static constexpr size_type size() { | ||||
|     return 4; | ||||
|   } | ||||
|   Vectorized() { | ||||
|     values = _mm256_setzero_pd(); | ||||
|   } | ||||
|   Vectorized() {} | ||||
|   Vectorized(__m256d v) : values(v) {} | ||||
|   Vectorized(double val) { | ||||
|     values = _mm256_set1_pd(val); | ||||
| @ -495,14 +493,6 @@ Vectorized<double> inline fmadd( | ||||
|   return _mm256_fmadd_pd(a, b, c); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline fnmadd( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b, | ||||
|     const Vectorized<double>& c) { | ||||
|   return _mm256_fnmadd_pd(a, b, c); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline fmsub( | ||||
|     const Vectorized<double>& a, | ||||
| @ -510,14 +500,6 @@ Vectorized<double> inline fmsub( | ||||
|     const Vectorized<double>& c) { | ||||
|   return _mm256_fmsub_pd(a, b, c); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<double> inline fnmsub( | ||||
|     const Vectorized<double>& a, | ||||
|     const Vectorized<double>& b, | ||||
|     const Vectorized<double>& c) { | ||||
|   return _mm256_fnmsub_pd(a, b, c); | ||||
| } | ||||
| #endif | ||||
|  | ||||
| #endif | ||||
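The hunk above appears to drop the fnmadd/fnmsub specializations for double (the +/- diff markers are not preserved in this mirror). As a hedged sketch only, the same arithmetic can be reproduced with mul/sub, matching the FMA intrinsic semantics -(a*b)+c and -(a*b)-c; the fallback names below are assumptions, not code from the diff.

#include <immintrin.h>

static inline __m256d fnmadd_pd_fallback(__m256d a, __m256d b, __m256d c) {
  return _mm256_sub_pd(c, _mm256_mul_pd(a, b));  // -(a*b) + c
}

static inline __m256d fnmsub_pd_fallback(__m256d a, __m256d b, __m256d c) {
  // (0 - a*b) - c  ==  -(a*b) - c
  return _mm256_sub_pd(_mm256_sub_pd(_mm256_setzero_pd(), _mm256_mul_pd(a, b)), c);
}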
|  | ||||
| @ -30,9 +30,7 @@ class Vectorized<float> { | ||||
|   static constexpr size_type size() { | ||||
|     return 8; | ||||
|   } | ||||
|   Vectorized() { | ||||
|     values = _mm256_setzero_ps(); | ||||
|   } | ||||
|   Vectorized() {} | ||||
|   Vectorized(__m256 v) : values(v) {} | ||||
|   Vectorized(float val) { | ||||
|     values = _mm256_set1_ps(val); | ||||
| @ -696,14 +694,6 @@ Vectorized<float> inline fmadd( | ||||
|   return _mm256_fmadd_ps(a, b, c); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<float> inline fnmadd( | ||||
|     const Vectorized<float>& a, | ||||
|     const Vectorized<float>& b, | ||||
|     const Vectorized<float>& c) { | ||||
|   return _mm256_fnmadd_ps(a, b, c); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<float> inline fmsub( | ||||
|     const Vectorized<float>& a, | ||||
| @ -712,14 +702,6 @@ Vectorized<float> inline fmsub( | ||||
|   return _mm256_fmsub_ps(a, b, c); | ||||
| } | ||||
|  | ||||
| template <> | ||||
| Vectorized<float> inline fnmsub( | ||||
|     const Vectorized<float>& a, | ||||
|     const Vectorized<float>& b, | ||||
|     const Vectorized<float>& c) { | ||||
|   return _mm256_fnmsub_ps(a, b, c); | ||||
| } | ||||
|  | ||||
| // TODO: rewrite with ATEN vectorized (need to add unpack and shuffle) | ||||
| // Used by Inductor CPP codegen for micro gemm | ||||
| inline void transpose_block(at::vec::VectorizedN<float, 8>& input) { | ||||
|  | ||||
| @ -23,9 +23,7 @@ struct Vectorizedi { | ||||
|   } | ||||
|  | ||||
|  public: | ||||
|   Vectorizedi() { | ||||
|     values = _mm256_setzero_si256(); | ||||
|   } | ||||
|   Vectorizedi() {} | ||||
|   Vectorizedi(__m256i v) : values(v) {} | ||||
|   operator __m256i() const { | ||||
|     return values; | ||||
| @ -55,9 +53,7 @@ class Vectorized<int64_t> : public Vectorizedi { | ||||
|     return 4; | ||||
|   } | ||||
|   using Vectorizedi::Vectorizedi; | ||||
|   Vectorized() { | ||||
|     values = _mm256_setzero_si256(); | ||||
|   } | ||||
|   Vectorized() {} | ||||
|   Vectorized(int64_t v) { | ||||
|     values = _mm256_set1_epi64x(v); | ||||
|   } | ||||
|  | ||||
| @ -54,9 +54,7 @@ struct Vectorizedqi { | ||||
| #endif | ||||
|  | ||||
|  public: | ||||
|   Vectorizedqi() { | ||||
|     vals = _mm256_setzero_si256(); | ||||
|   } | ||||
|   Vectorizedqi() {} | ||||
|   Vectorizedqi(__m256i v) : vals(v) {} | ||||
|   operator __m256i() const { | ||||
|     return vals; | ||||
|  | ||||
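The constructor hunks above change the default Vectorized/Vectorizedi/Vectorizedqi constructors from zero-filling to trivial. A hedged illustration of the consequence (the include path and the example function are assumptions, not taken from the diff): a default-constructed vector is now indeterminate, so a zero fill must be requested explicitly.

#include <ATen/cpu/vec/vec.h>  // assumed ATen vectorization header

void constructor_example() {
  at::vec::Vectorized<float> zeroed(0.0f);  // explicitly zero-filled lanes
  at::vec::Vectorized<float> scratch;       // lanes are now indeterminate
  scratch = zeroed;                         // initialize before first read
}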
Some files were not shown because too many files have changed in this diff.