Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-31 12:15:03 +08:00

Compare commits: yolo-llama...eager_mode (1 commit)
| Author | SHA1 | Date |
|---|---|---|
|  | c56b575e61 |  |
| @ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5): | |||||||
|         try: |         try: | ||||||
|             with socket.create_connection((addr, port), timeout=timeout): |             with socket.create_connection((addr, port), timeout=timeout): | ||||||
|                 return |                 return | ||||||
|         except (ConnectionRefusedError, TimeoutError):  # noqa: PERF203 |         except (ConnectionRefusedError, socket.timeout):  # noqa: PERF203 | ||||||
|             if i == attempt_cnt - 1: |             if i == attempt_cnt - 1: | ||||||
|                 raise |                 raise | ||||||
|             time.sleep(timeout) |             time.sleep(timeout) | ||||||
| @ -1004,7 +1004,7 @@ if __name__ == "__main__": | |||||||
|         install_condaforge_python(host, args.python_version) |         install_condaforge_python(host, args.python_version) | ||||||
|         sys.exit(0) |         sys.exit(0) | ||||||
|  |  | ||||||
|     python_version = args.python_version if args.python_version is not None else "3.10" |     python_version = args.python_version if args.python_version is not None else "3.9" | ||||||
|  |  | ||||||
|     if args.use_torch_from_pypi: |     if args.use_torch_from_pypi: | ||||||
|         configure_system(host, compiler=args.compiler, python_version=python_version) |         configure_system(host, compiler=args.compiler, python_version=python_version) | ||||||
|  | |||||||
| @ -69,8 +69,7 @@ RUN bash ./install_cuda.sh 13.0 | |||||||
| ENV DESIRED_CUDA=13.0 | ENV DESIRED_CUDA=13.0 | ||||||
|  |  | ||||||
| FROM ${ROCM_IMAGE} as rocm | FROM ${ROCM_IMAGE} as rocm | ||||||
| ARG PYTORCH_ROCM_ARCH | ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" | ||||||
| ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} |  | ||||||
| ADD ./common/install_mkl.sh install_mkl.sh | ADD ./common/install_mkl.sh install_mkl.sh | ||||||
| RUN bash ./install_mkl.sh && rm install_mkl.sh | RUN bash ./install_mkl.sh && rm install_mkl.sh | ||||||
| ENV MKLROOT /opt/intel | ENV MKLROOT /opt/intel | ||||||
|  | |||||||
| @ -36,12 +36,6 @@ case ${DOCKER_TAG_PREFIX} in | |||||||
|     ;; |     ;; | ||||||
|   rocm*) |   rocm*) | ||||||
|     BASE_TARGET=rocm |     BASE_TARGET=rocm | ||||||
|     PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" |  | ||||||
|     # add gfx950 conditionally starting in ROCm 7.0 |  | ||||||
|     if [[ "$ROCM_VERSION" == *"7.0"* ]]; then |  | ||||||
|         PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950" |  | ||||||
|     fi |  | ||||||
|     EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" |  | ||||||
|     ;; |     ;; | ||||||
|   *) |   *) | ||||||
|     echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}" |     echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}" | ||||||
|  | |||||||
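These two hunks are two halves of the same plumbing: the Dockerfile side either declares `ARG PYTORCH_ROCM_ARCH` (filled in from outside) or hard-codes the list in an `ENV`, and the shell side either passes the list through `--build-arg` or drops that logic. A minimal sketch of how the `ARG`/`--build-arg` pair fits together; the image tag and the shortened arch list below are illustrative, not taken from the diff:

```bash
# The value only reaches the build if the Dockerfile declares a matching
# `ARG PYTORCH_ROCM_ARCH`; an `ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}`
# line then persists it for later RUN steps and the final image.
PYTORCH_ROCM_ARCH="gfx90a;gfx942"   # illustrative subset of the real list
docker build \
  --build-arg PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH}" \
  -t rocm-builder:example .         # placeholder tag and build context
```

Without the `ARG` declaration in the Dockerfile, the `--build-arg` value is silently unused, which is why dropping the shell-side plumbing goes hand in hand with setting the list directly in the Dockerfile.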
| @ -262,10 +262,13 @@ case "$tag" in | |||||||
|     TRITON_CPU=yes |     TRITON_CPU=yes | ||||||
|     ;; |     ;; | ||||||
|   pytorch-linux-jammy-linter) |   pytorch-linux-jammy-linter) | ||||||
|     PYTHON_VERSION=3.10 |     # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. | ||||||
|  |     # We will need to update mypy version eventually, but that's for another day. The task | ||||||
|  |     # would be to upgrade mypy to 1.0.0 with Python 3.11 | ||||||
|  |     PYTHON_VERSION=3.9 | ||||||
|     ;; |     ;; | ||||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter) |   pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) | ||||||
|     PYTHON_VERSION=3.10 |     PYTHON_VERSION=3.9 | ||||||
|     CUDA_VERSION=12.8.1 |     CUDA_VERSION=12.8.1 | ||||||
|     ;; |     ;; | ||||||
|   pytorch-linux-jammy-aarch64-py3.10-gcc11) |   pytorch-linux-jammy-aarch64-py3.10-gcc11) | ||||||
|  | |||||||
| @ -1 +1 @@ | |||||||
| v2.28.3-1 | v2.27.5-1 | ||||||
|  | |||||||
| @ -1 +1 @@ | |||||||
| v2.28.3-1 | v2.27.7-1 | ||||||
|  | |||||||
| @ -1 +1 @@ | |||||||
| bbb06c0334a6772b92d24bde54956e675c8c6604 | 5ae38bdb0dc066c5823e34dc9797afb9de42c866 | ||||||
|  | |||||||
| @ -12,8 +12,8 @@ function do_install() { | |||||||
|  |  | ||||||
|     rocm_version_nodot=${rocm_version//./} |     rocm_version_nodot=${rocm_version//./} | ||||||
|  |  | ||||||
|     # https://github.com/icl-utk-edu/magma/pull/65 |     # Version 2.7.2 + ROCm related updates | ||||||
|     MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec |     MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 | ||||||
|     magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" |     magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" | ||||||
|  |  | ||||||
|     rocm_dir="/opt/rocm" |     rocm_dir="/opt/rocm" | ||||||
|  | |||||||
| @ -40,16 +40,12 @@ case ${DOCKER_TAG_PREFIX} in | |||||||
|         ;; |         ;; | ||||||
|     rocm*) |     rocm*) | ||||||
|         # we want the patch version of 6.4 instead |         # we want the patch version of 6.4 instead | ||||||
|         if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then |         if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then | ||||||
|             GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" |             GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" | ||||||
|         fi |         fi | ||||||
|         BASE_TARGET=rocm |         BASE_TARGET=rocm | ||||||
|         GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete |         GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete | ||||||
|         PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" |         PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" | ||||||
|         # add gfx950 conditionally starting in ROCm 7.0 |  | ||||||
|         if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then |  | ||||||
|             PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950" |  | ||||||
|         fi |  | ||||||
|         DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" |         DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" | ||||||
|         ;; |         ;; | ||||||
|     *) |     *) | ||||||
|  | |||||||
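This hunk (and the similar one further down) differs in how the ROCm version is compared: a glob match on the version string versus a numeric comparison through a `ver` helper. The helper itself is not part of this diff; a common sketch of such a function, offered here as an assumption rather than the repository's actual definition, pads each dotted component so ordinary integer comparison works:

```bash
# Hypothetical ver() helper: "6.4.2" -> "  6004002000", so two versions can
# be compared with -eq / -gt after padding each component to a fixed width.
ver() {
    printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ')
}

# With this definition, 6.4 and 6.4.0 compare equal, while a plain glob such
# as  [[ "$v" == *"6.4"* ]]  would also match strings like "16.4" or "6.40".
[[ $(ver "6.4.0") -eq $(ver "6.4") ]] && echo "6.4.0 == 6.4"
```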
| @ -82,7 +82,7 @@ case ${image} in | |||||||
|         ;; |         ;; | ||||||
|     manylinux2_28-builder:rocm*) |     manylinux2_28-builder:rocm*) | ||||||
|         # we want the patch version of 6.4 instead |         # we want the patch version of 6.4 instead | ||||||
|         if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then |         if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then | ||||||
|             GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" |             GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" | ||||||
|         fi |         fi | ||||||
|         TARGET=rocm_final |         TARGET=rocm_final | ||||||
| @ -90,10 +90,6 @@ case ${image} in | |||||||
|         DEVTOOLSET_VERSION="11" |         DEVTOOLSET_VERSION="11" | ||||||
|         GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete |         GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete | ||||||
|         PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" |         PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" | ||||||
|         # add gfx950 conditionally starting in ROCm 7.0 |  | ||||||
|         if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then |  | ||||||
|             PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950" |  | ||||||
|         fi |  | ||||||
|         DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" |         DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" | ||||||
|         ;; |         ;; | ||||||
|     manylinux2_28-builder:xpu) |     manylinux2_28-builder:xpu) | ||||||
|  | |||||||
| @ -112,6 +112,8 @@ ninja==1.11.1.3 | |||||||
| #Pinned versions: 1.11.1.3 | #Pinned versions: 1.11.1.3 | ||||||
| #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py | #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py | ||||||
|  |  | ||||||
|  | numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x" | ||||||
|  | numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x" | ||||||
| numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x" | numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x" | ||||||
| numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" | numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" | ||||||
| #Description: Just-In-Time Compiler for Numerical Functions | #Description: Just-In-Time Compiler for Numerical Functions | ||||||
| @ -132,7 +134,7 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" | |||||||
| #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, | #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, | ||||||
| #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, | #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, | ||||||
| #test_binary_ufuncs.py | #test_binary_ufuncs.py | ||||||
| numpy==1.22.4; python_version == "3.10" | numpy==1.22.4; python_version == "3.9" or python_version == "3.10" | ||||||
| numpy==1.26.2; python_version == "3.11" or python_version == "3.12" | numpy==1.26.2; python_version == "3.11" or python_version == "3.12" | ||||||
| numpy==2.1.2; python_version >= "3.13" | numpy==2.1.2; python_version >= "3.13" | ||||||
|  |  | ||||||
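These requirement lines rely on PEP 508 environment markers (`python_version`, `platform_machine`), so one file can carry different numba/numpy pins per interpreter and architecture; pip evaluates the markers at install time and skips lines that do not match. A quick sketch for checking which pins would apply in a given environment (nothing below is taken from the repository):

```bash
# Show the two values the markers above key on.
python -c "import platform; print(platform.python_version(), platform.machine())"

# pip also accepts a marker directly on the command line; under Python 3.12
# this installs numpy 1.26.2, under 3.13+ the requirement is skipped entirely.
pip install 'numpy==1.26.2; python_version == "3.11" or python_version == "3.12"'
```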
| @ -324,6 +326,8 @@ pywavelets==1.7.0 ; python_version >= "3.12" | |||||||
| lxml==5.3.0 | lxml==5.3.0 | ||||||
| #Description: This is a requirement of unittest-xml-reporting | #Description: This is a requirement of unittest-xml-reporting | ||||||
|  |  | ||||||
|  | # Python-3.9 binaries | ||||||
|  |  | ||||||
| PyGithub==2.3.0 | PyGithub==2.3.0 | ||||||
|  |  | ||||||
| sympy==1.13.3 | sympy==1.13.3 | ||||||
|  | |||||||
| @ -1,15 +1,8 @@ | |||||||
| sphinx==5.3.0 | sphinx==5.3.0 | ||||||
| #Description: This is used to generate PyTorch docs | #Description: This is used to generate PyTorch docs | ||||||
| #Pinned versions: 5.3.0 | #Pinned versions: 5.3.0 | ||||||
|  | -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2 | ||||||
|  |  | ||||||
| standard-imghdr==3.13.0; python_version >= "3.13" |  | ||||||
| #Description: This is needed by Sphinx, so it needs to be added here. |  | ||||||
| # The reasons are as follows: |  | ||||||
| # 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr); |  | ||||||
| # 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13. |  | ||||||
| # Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency. |  | ||||||
|  |  | ||||||
| -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2 |  | ||||||
| # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering | # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering | ||||||
| # but it doesn't seem to work and hangs around idly. The initial thought that it is probably | # but it doesn't seem to work and hangs around idly. The initial thought that it is probably | ||||||
| # something related to Docker setup. We can investigate this later. | # something related to Docker setup. We can investigate this later. | ||||||
|  | |||||||
| @ -72,7 +72,7 @@ def sample_vllm_test_library(): | |||||||
|                     ] |                     ] | ||||||
|                 ), |                 ), | ||||||
|                 "pytest -v -s entrypoints/llm/test_generate.py", |                 "pytest -v -s entrypoints/llm/test_generate.py", | ||||||
|                 "pytest -v -s entrypoints/offline_mode", |                 "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode", | ||||||
|             ], |             ], | ||||||
|         }, |         }, | ||||||
|         "vllm_regression_test": { |         "vllm_regression_test": { | ||||||
|  | |||||||
| @ -1,11 +1,11 @@ | |||||||
| SHELL=/usr/bin/env bash | SHELL=/usr/bin/env bash | ||||||
|  |  | ||||||
| DOCKER_CMD ?= docker | DOCKER_CMD ?= docker | ||||||
| DESIRED_ROCM ?= 7.0 | DESIRED_ROCM ?= 6.4 | ||||||
| DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM)) | DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM)) | ||||||
| PACKAGE_NAME = magma-rocm | PACKAGE_NAME = magma-rocm | ||||||
| # inherit this from underlying docker image, do not pass this env var to docker | # inherit this from underlying docker image, do not pass this env var to docker | ||||||
| #PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201 | #PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201 | ||||||
|  |  | ||||||
| DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ | DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ | ||||||
| 	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \ | 	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \ | ||||||
| @ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ | |||||||
| 	magma-rocm/build_magma.sh | 	magma-rocm/build_magma.sh | ||||||
|  |  | ||||||
| .PHONY: all | .PHONY: all | ||||||
| all: magma-rocm70 |  | ||||||
| all: magma-rocm64 | all: magma-rocm64 | ||||||
| all: magma-rocm63 | all: magma-rocm63 | ||||||
|  |  | ||||||
| @ -25,11 +24,6 @@ clean: | |||||||
| 	$(RM) -r magma-* | 	$(RM) -r magma-* | ||||||
| 	$(RM) -r output | 	$(RM) -r output | ||||||
|  |  | ||||||
| .PHONY: magma-rocm70 |  | ||||||
| magma-rocm70: DESIRED_ROCM := 7.0 |  | ||||||
| magma-rocm70: |  | ||||||
| 	$(DOCKER_RUN) |  | ||||||
|  |  | ||||||
| .PHONY: magma-rocm64 | .PHONY: magma-rocm64 | ||||||
| magma-rocm64: DESIRED_ROCM := 6.4 | magma-rocm64: DESIRED_ROCM := 6.4 | ||||||
| magma-rocm64: | magma-rocm64: | ||||||
|  | |||||||
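The Makefile hunk above wires one phony target per ROCm version (`magma-rocm64`, `magma-rocm63`, and on one side `magma-rocm70`); each target overrides `DESIRED_ROCM` and then runs the shared `DOCKER_RUN` recipe, which mounts the repository's `.ci` directory into the container and executes `magma-rocm/build_magma.sh`. A brief usage sketch, assuming it is invoked from the directory containing this Makefile:

```bash
# Build the MAGMA tarball for a single ROCm version; the target-specific
# `DESIRED_ROCM := 6.4` assignment selects the toolchain inside the container.
make magma-rocm64

# Or build every version listed under the `all` target.
make all
```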
| @ -6,8 +6,8 @@ set -eou pipefail | |||||||
| # The script expects DESIRED_CUDA and PACKAGE_NAME to be set | # The script expects DESIRED_CUDA and PACKAGE_NAME to be set | ||||||
| ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" | ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" | ||||||
|  |  | ||||||
| # https://github.com/icl-utk-edu/magma/pull/65 | # Version 2.7.2 + ROCm related updates | ||||||
| MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec | MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 | ||||||
|  |  | ||||||
| # Folders for the build | # Folders for the build | ||||||
| PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata | PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata | ||||||
| @ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE | |||||||
|  |  | ||||||
| # Fetch magma sources and verify checksum | # Fetch magma sources and verify checksum | ||||||
| pushd ${PACKAGE_DIR} | pushd ${PACKAGE_DIR} | ||||||
| git clone https://github.com/jeffdaily/magma | git clone https://bitbucket.org/icl/magma.git | ||||||
| pushd magma | pushd magma | ||||||
| git checkout ${MAGMA_VERSION} | git checkout ${MAGMA_VERSION} | ||||||
| popd | popd | ||||||
|  | |||||||
| @ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \ | |||||||
|  |  | ||||||
| # Build the docs | # Build the docs | ||||||
| pushd docs/cpp | pushd docs/cpp | ||||||
| time make VERBOSE=1 html | time make VERBOSE=1 html -j | ||||||
|  |  | ||||||
| popd | popd | ||||||
| popd | popd | ||||||
|  | |||||||
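One side of this hunk adds `-j` with no job count to the cpp-docs `make` invocation. Bare `make -j` places no limit on concurrent jobs, which can exhaust memory on large doc builds (the `_docs.yml` hunk further down bumps the runner size for exactly that kind of OOM). A bounded alternative, shown only as a sketch:

```bash
# Cap parallelism at the number of available cores instead of letting
# `make -j` spawn an unbounded number of jobs.
time make VERBOSE=1 html -j"$(nproc)"
```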
| @ -55,7 +55,7 @@ test_python_shard() { | |||||||
|  |  | ||||||
|   setup_test_python |   setup_test_python | ||||||
|  |  | ||||||
|   time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS" |   time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS" | ||||||
|  |  | ||||||
|   assert_git_not_dirty |   assert_git_not_dirty | ||||||
| } | } | ||||||
|  | |||||||
| @ -322,29 +322,23 @@ test_python_shard() { | |||||||
|  |  | ||||||
|   # modify LD_LIBRARY_PATH to ensure it has the conda env. |   # modify LD_LIBRARY_PATH to ensure it has the conda env. | ||||||
|   # This set of tests has been shown to be buggy without it for the split-build |   # This set of tests has been shown to be buggy without it for the split-build | ||||||
|   time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running |   time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running | ||||||
|  |  | ||||||
|   assert_git_not_dirty |   assert_git_not_dirty | ||||||
| } | } | ||||||
|  |  | ||||||
| test_python() { | test_python() { | ||||||
|   # shellcheck disable=SC2086 |   # shellcheck disable=SC2086 | ||||||
|   time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION |   time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION | ||||||
|   assert_git_not_dirty |   assert_git_not_dirty | ||||||
| } | } | ||||||
|  |  | ||||||
| test_python_smoke() { | test_python_smoke() { | ||||||
|   # Smoke tests for H100/B200 |   # Smoke tests for H100 | ||||||
|   time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running |   time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running | ||||||
|   assert_git_not_dirty |   assert_git_not_dirty | ||||||
| } | } | ||||||
|  |  | ||||||
| test_python_smoke_b200() { |  | ||||||
|   # Targeted smoke tests for B200 - staged approach to avoid too many failures |  | ||||||
|   time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running |  | ||||||
|   assert_git_not_dirty |  | ||||||
| } |  | ||||||
|  |  | ||||||
| test_h100_distributed() { | test_h100_distributed() { | ||||||
|   # Distributed tests at H100 |   # Distributed tests at H100 | ||||||
|   time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py  $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running |   time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py  $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running | ||||||
| @ -390,7 +384,6 @@ test_dynamo_wrapped_shard() { | |||||||
|     --exclude-distributed-tests \ |     --exclude-distributed-tests \ | ||||||
|     --exclude-torch-export-tests \ |     --exclude-torch-export-tests \ | ||||||
|     --exclude-aot-dispatch-tests \ |     --exclude-aot-dispatch-tests \ | ||||||
|     --exclude-quantization-tests \ |  | ||||||
|     --shard "$1" "$NUM_TEST_SHARDS" \ |     --shard "$1" "$NUM_TEST_SHARDS" \ | ||||||
|     --verbose \ |     --verbose \ | ||||||
|     --upload-artifacts-while-running |     --upload-artifacts-while-running | ||||||
| @ -568,6 +561,43 @@ else | |||||||
|   DYNAMO_BENCHMARK_FLAGS+=(--device cuda) |   DYNAMO_BENCHMARK_FLAGS+=(--device cuda) | ||||||
| fi | fi | ||||||
|  |  | ||||||
|  | # Validate backend availability for dynamo_eager configs | ||||||
|  | if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then | ||||||
|  |   echo "Validating eager backend availability for TEST_CONFIG: ${TEST_CONFIG}" | ||||||
|  |   if ! python -c "import torch; backends = torch._dynamo.list_backends(); print('Available backends:', backends); assert 'eager' in backends, f'eager backend not available. Available: {backends}'"; then | ||||||
|  |     echo "ERROR: eager backend not available in this environment" | ||||||
|  |     echo "This might be due to missing dependencies or incorrect PyTorch installation" | ||||||
|  |     exit 1 | ||||||
|  |   fi | ||||||
|  |   echo "eager backend validation successful" | ||||||
|  |    | ||||||
|  |   # Additional validation: test that torch.compile works with eager backend | ||||||
|  |   echo "Testing torch.compile with eager backend..." | ||||||
|  |   if ! python -c " | ||||||
|  | import torch | ||||||
|  | import torch._dynamo as dynamo | ||||||
|  |  | ||||||
|  | def test_func(x): | ||||||
|  |     return x * 2 | ||||||
|  |  | ||||||
|  | # Test that eager backend works | ||||||
|  | try: | ||||||
|  |     compiled_func = torch.compile(test_func, backend='eager') | ||||||
|  |     result = compiled_func(torch.tensor([1.0, 2.0])) | ||||||
|  |     print('torch.compile with eager backend test successful') | ||||||
|  | except Exception as e: | ||||||
|  |     print(f'ERROR: torch.compile with eager backend failed: {e}') | ||||||
|  |     exit(1) | ||||||
|  | "; then | ||||||
|  |     echo "ERROR: torch.compile with eager backend failed" | ||||||
|  |     exit 1 | ||||||
|  |   fi | ||||||
|  | fi | ||||||
|  |  | ||||||
|  | # Debug logging for backend selection | ||||||
|  | echo "TEST_CONFIG: ${TEST_CONFIG}" | ||||||
|  | echo "DYNAMO_BENCHMARK_FLAGS: ${DYNAMO_BENCHMARK_FLAGS[*]}" | ||||||
|  |  | ||||||
| test_cachebench() { | test_cachebench() { | ||||||
|   TEST_REPORTS_DIR=$(pwd)/test/test-reports |   TEST_REPORTS_DIR=$(pwd)/test/test-reports | ||||||
|   mkdir -p "$TEST_REPORTS_DIR" |   mkdir -p "$TEST_REPORTS_DIR" | ||||||
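The block added on the right-hand side above acts as a pre-flight gate for `dynamo_eager` test configs: it checks that `torch._dynamo.list_backends()` reports an `eager` backend, verifies that a trivial `torch.compile(..., backend='eager')` call runs, and only then lets the benchmarks proceed. A condensed, standalone sketch of the same check; the `TEST_CONFIG` value and the way it is exported here are illustrative:

```bash
export TEST_CONFIG=dynamo_eager   # illustrative; CI sets this in the real scripts
python - <<'PY'
import torch

# The backend must be registered, and a trivial compile must round-trip.
backends = torch._dynamo.list_backends()
assert "eager" in backends, f"eager backend missing; available: {backends}"
fn = torch.compile(lambda x: x * 2, backend="eager")
print(fn(torch.tensor([1.0, 2.0])))
PY
```

Failing fast here turns a missing or broken backend into one clear error message instead of a wall of benchmark failures later in the run.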
| @ -629,6 +659,16 @@ test_perf_for_dashboard() { | |||||||
|   shift |   shift | ||||||
|  |  | ||||||
|   local backend=inductor |   local backend=inductor | ||||||
|  |   # Allow surfacing eager metrics in CI by switching backend based on TEST_CONFIG | ||||||
|  |   if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then | ||||||
|  |     backend=eager | ||||||
|  |   elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then | ||||||
|  |     backend=aot_eager | ||||||
|  |   fi | ||||||
|  |    | ||||||
|  |   # Debug logging for backend selection in test_perf_for_dashboard | ||||||
|  |   echo "test_perf_for_dashboard: TEST_CONFIG=${TEST_CONFIG}, selected backend=${backend}" | ||||||
|  |   echo "DASHBOARD_TAG=${DASHBOARD_TAG}" | ||||||
|   local modes=() |   local modes=() | ||||||
|   if [[ "$DASHBOARD_TAG" == *training-true* ]]; then |   if [[ "$DASHBOARD_TAG" == *training-true* ]]; then | ||||||
|     modes+=(training) |     modes+=(training) | ||||||
| @ -682,20 +722,37 @@ test_perf_for_dashboard() { | |||||||
|       fi |       fi | ||||||
|  |  | ||||||
|       if [[ "$DASHBOARD_TAG" == *default-true* ]]; then |       if [[ "$DASHBOARD_TAG" == *default-true* ]]; then | ||||||
|         $TASKSET python "benchmarks/dynamo/$suite.py" \ |         echo "Running benchmark: ${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}" | ||||||
|  |         echo "Command: $TASKSET python benchmarks/dynamo/$suite.py ${target_flag[*]} --$mode --$dtype --backend $backend --disable-cudagraphs $*" | ||||||
|  |         if ! $TASKSET python "benchmarks/dynamo/$suite.py" \ | ||||||
|             "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \ |             "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \ | ||||||
|             --output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv" |             --output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"; then | ||||||
|  |           echo "ERROR: Benchmark failed for ${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}" | ||||||
|  |           echo "This might indicate an issue with the eager backend or benchmark configuration" | ||||||
|  |           exit 1 | ||||||
|  |         fi | ||||||
|  |         echo "Benchmark completed successfully: ${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}" | ||||||
|       fi |       fi | ||||||
|       if [[ "$DASHBOARD_TAG" == *cudagraphs-true* ]]; then |       if [[ "$DASHBOARD_TAG" == *cudagraphs-true* ]]; then | ||||||
|         $TASKSET python "benchmarks/dynamo/$suite.py" \ |         echo "Running benchmark: ${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}" | ||||||
|  |         if ! $TASKSET python "benchmarks/dynamo/$suite.py" \ | ||||||
|             "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \ |             "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \ | ||||||
|             --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv" |             --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"; then | ||||||
|  |           echo "ERROR: Benchmark failed for ${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}" | ||||||
|  |           exit 1 | ||||||
|  |         fi | ||||||
|  |         echo "Benchmark completed successfully: ${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}" | ||||||
|       fi |       fi | ||||||
|       if [[ "$DASHBOARD_TAG" == *dynamic-true* ]]; then |       if [[ "$DASHBOARD_TAG" == *dynamic-true* ]]; then | ||||||
|         $TASKSET python "benchmarks/dynamo/$suite.py" \ |         echo "Running benchmark: ${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}" | ||||||
|  |         if ! $TASKSET python "benchmarks/dynamo/$suite.py" \ | ||||||
|             "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \ |             "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \ | ||||||
|             --dynamic-batch-only "$@" \ |             --dynamic-batch-only "$@" \ | ||||||
|             --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv" |             --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"; then | ||||||
|  |           echo "ERROR: Benchmark failed for ${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}" | ||||||
|  |           exit 1 | ||||||
|  |         fi | ||||||
|  |         echo "Benchmark completed successfully: ${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}" | ||||||
|       fi |       fi | ||||||
|       if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]]; then |       if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]]; then | ||||||
|         TORCHINDUCTOR_CPP_WRAPPER=1 $TASKSET python "benchmarks/dynamo/$suite.py" \ |         TORCHINDUCTOR_CPP_WRAPPER=1 $TASKSET python "benchmarks/dynamo/$suite.py" \ | ||||||
| @ -1163,12 +1220,6 @@ test_distributed() { | |||||||
|   fi |   fi | ||||||
| } | } | ||||||
|  |  | ||||||
| test_quantization() { |  | ||||||
|   echo "Testing quantization" |  | ||||||
|  |  | ||||||
|   python test/test_quantization.py |  | ||||||
| } |  | ||||||
|  |  | ||||||
| test_rpc() { | test_rpc() { | ||||||
|   echo "Testing RPC C++ tests" |   echo "Testing RPC C++ tests" | ||||||
|   # NB: the ending test_rpc must match the current function name for the current |   # NB: the ending test_rpc must match the current function name for the current | ||||||
| @ -1586,7 +1637,7 @@ test_executorch() { | |||||||
| test_linux_aarch64() { | test_linux_aarch64() { | ||||||
|   python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ |   python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ | ||||||
|         test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ |         test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ | ||||||
|         test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \ |         test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \ | ||||||
|         distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \ |         distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \ | ||||||
|         --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose |         --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose | ||||||
|  |  | ||||||
| @ -1630,25 +1681,6 @@ test_operator_benchmark() { | |||||||
|       --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv" |       --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv" | ||||||
| } | } | ||||||
|  |  | ||||||
| test_operator_microbenchmark() { |  | ||||||
|   TEST_REPORTS_DIR=$(pwd)/test/test-reports |  | ||||||
|   mkdir -p "$TEST_REPORTS_DIR" |  | ||||||
|   TEST_DIR=$(pwd) |  | ||||||
|  |  | ||||||
|   cd benchmarks/operator_benchmark/pt_extension |  | ||||||
|   python -m pip install . |  | ||||||
|  |  | ||||||
|   cd "${TEST_DIR}"/benchmarks/operator_benchmark |  | ||||||
|  |  | ||||||
|   for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do |  | ||||||
|     $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \ |  | ||||||
|       --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \ |  | ||||||
|       --benchmark-name "PyTorch operator microbenchmark" --use-compile |  | ||||||
|     $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \ |  | ||||||
|       --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \ |  | ||||||
|       --benchmark-name "PyTorch operator microbenchmark" |  | ||||||
|   done |  | ||||||
| } |  | ||||||
|  |  | ||||||
| if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then | if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then | ||||||
|   (cd test && python -c "import torch; print(torch.__config__.show())") |   (cd test && python -c "import torch; print(torch.__config__.show())") | ||||||
| @ -1681,8 +1713,6 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then | |||||||
|   test_executorch |   test_executorch | ||||||
| elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then | elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then | ||||||
|   test_python_legacy_jit |   test_python_legacy_jit | ||||||
| elif [[ "$TEST_CONFIG" == 'quantization' ]]; then |  | ||||||
|   test_quantization |  | ||||||
| elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then | elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then | ||||||
|   # TODO: run some C++ tests |   # TODO: run some C++ tests | ||||||
|   echo "no-op at the moment" |   echo "no-op at the moment" | ||||||
| @ -1705,8 +1735,6 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then | |||||||
|     test_operator_benchmark cpu ${TEST_MODE} |     test_operator_benchmark cpu ${TEST_MODE} | ||||||
|  |  | ||||||
|   fi |   fi | ||||||
| elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then |  | ||||||
|   test_operator_microbenchmark |  | ||||||
| elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then | elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then | ||||||
|   test_inductor_distributed |   test_inductor_distributed | ||||||
| elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then | elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then | ||||||
| @ -1809,14 +1837,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then | |||||||
|   test_xpu_bin |   test_xpu_bin | ||||||
| elif [[ "${TEST_CONFIG}" == smoke ]]; then | elif [[ "${TEST_CONFIG}" == smoke ]]; then | ||||||
|   test_python_smoke |   test_python_smoke | ||||||
| elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then |  | ||||||
|   test_python_smoke_b200 |  | ||||||
| elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then | elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then | ||||||
|   test_h100_distributed |   test_h100_distributed | ||||||
| elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then | elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then | ||||||
|   test_h100_symm_mem |   test_h100_symm_mem | ||||||
| elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then |  | ||||||
|   test_h100_symm_mem |  | ||||||
| elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then | elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then | ||||||
|   test_h100_cutlass_backend |   test_h100_cutlass_backend | ||||||
| else | else | ||||||
|  | |||||||
| @ -25,7 +25,7 @@ echo Copying over test times file | |||||||
| robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files" | robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files" | ||||||
|  |  | ||||||
| echo Run nn tests | echo Run nn tests | ||||||
| python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose | python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose | ||||||
| if ERRORLEVEL 1 goto fail | if ERRORLEVEL 1 goto fail | ||||||
|  |  | ||||||
| popd | popd | ||||||
|  | |||||||
| @ -63,7 +63,7 @@ if errorlevel 1 exit /b 1 | |||||||
| call %CONDA_HOME%\condabin\activate.bat testenv | call %CONDA_HOME%\condabin\activate.bat testenv | ||||||
| if errorlevel 1 exit /b 1 | if errorlevel 1 exit /b 1 | ||||||
|  |  | ||||||
| call conda install  -y -q -c conda-forge libuv=1.51 | call conda install  -y -q -c conda-forge libuv=1.39 | ||||||
| call conda install -y -q intel-openmp | call conda install -y -q intel-openmp | ||||||
|  |  | ||||||
| echo "install and test libtorch" | echo "install and test libtorch" | ||||||
|  | |||||||
							
								
								
									
.circleci/scripts/functorch_doc_push_script.sh (executable file, 47 lines changed)
| @ -0,0 +1,47 @@ | |||||||
|  | #!/bin/bash | ||||||
|  | # =================== The following code **should** be executed inside Docker container =================== | ||||||
|  |  | ||||||
|  | # Install dependencies | ||||||
|  | sudo apt-get -y update | ||||||
|  | sudo apt-get -y install expect-dev | ||||||
|  |  | ||||||
|  | # This is where the local pytorch install in the docker image is located | ||||||
|  | pt_checkout="/var/lib/jenkins/workspace" | ||||||
|  | source "$pt_checkout/.ci/pytorch/common_utils.sh" | ||||||
|  | echo "functorch_doc_push_script.sh: Invoked with $*" | ||||||
|  |  | ||||||
|  | set -ex | ||||||
|  |  | ||||||
|  | version=${DOCS_VERSION:-nightly} | ||||||
|  | echo "version: $version" | ||||||
|  |  | ||||||
|  | # Build functorch docs | ||||||
|  | pushd $pt_checkout/functorch/docs | ||||||
|  | pip -q install -r requirements.txt | ||||||
|  | make html | ||||||
|  | popd | ||||||
|  |  | ||||||
|  | git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages | ||||||
|  | pushd functorch_ghpages | ||||||
|  |  | ||||||
|  | if [ $version == "main" ]; then | ||||||
|  |   version=nightly | ||||||
|  | fi | ||||||
|  |  | ||||||
|  | git rm -rf "$version" || true | ||||||
|  | mv "$pt_checkout/functorch/docs/build/html" "$version" | ||||||
|  |  | ||||||
|  | git add "$version" || true | ||||||
|  | git status | ||||||
|  | git config user.email "soumith+bot@pytorch.org" | ||||||
|  | git config user.name "pytorchbot" | ||||||
|  | # If there aren't changes, don't make a commit; push is no-op | ||||||
|  | git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true | ||||||
|  | git status | ||||||
|  |  | ||||||
|  | if [[ "${WITH_PUSH:-}" == true ]]; then | ||||||
|  |   git push -u origin gh-pages | ||||||
|  | fi | ||||||
|  |  | ||||||
|  | popd | ||||||
|  | # =================== The above code **should** be executed inside Docker container =================== | ||||||
| @ -69,8 +69,6 @@ readability-string-compare, | |||||||
| ' | ' | ||||||
| HeaderFilterRegex: '^(aten/|c10/|torch/).*$' | HeaderFilterRegex: '^(aten/|c10/|torch/).*$' | ||||||
| WarningsAsErrors: '*' | WarningsAsErrors: '*' | ||||||
| LineFilter: |  | ||||||
|   - name: '/usr/include/.*' |  | ||||||
| CheckOptions: | CheckOptions: | ||||||
|   cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true |   cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true | ||||||
|   cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true |   cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true | ||||||
|  | |||||||
							
								
								
									
.github/ISSUE_TEMPLATE/ci-sev.md (vendored, 4 lines changed)
| @ -1,10 +1,6 @@ | |||||||
| --- | --- | ||||||
| name: "⚠️ CI SEV" | name: "⚠️ CI SEV" | ||||||
| about: Tracking incidents for PyTorch's CI infra. | about: Tracking incidents for PyTorch's CI infra. | ||||||
| title: '' |  | ||||||
| labels: '' |  | ||||||
| assignees: '' |  | ||||||
|  |  | ||||||
| --- | --- | ||||||
|  |  | ||||||
| > NOTE: Remember to label this issue with "`ci: sev`" | > NOTE: Remember to label this issue with "`ci: sev`" | ||||||
|  | |||||||
							
								
								
									
.github/ISSUE_TEMPLATE/disable-autorevert.md (vendored, 18 lines changed)
| @ -1,18 +0,0 @@ | |||||||
| --- |  | ||||||
| name: DISABLE AUTOREVERT |  | ||||||
| about: Disables autorevert when open |  | ||||||
| title: "❌\U0001F519 [DISABLE AUTOREVERT]" |  | ||||||
| labels: 'ci: disable-autorevert' |  | ||||||
| assignees: '' |  | ||||||
|  |  | ||||||
| --- |  | ||||||
|  |  | ||||||
| This issue, while open, disables the autorevert functionality. |  | ||||||
|  |  | ||||||
| More details can be found [here](https://github.com/pytorch/test-infra/blob/main/aws/lambda/pytorch-auto-revert/README.md) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ## Why are you disabling autorevert? |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ## Links to any issues/commits/errors that shows the source of problem |  | ||||||
							
								
								
									
.github/ISSUE_TEMPLATE/disable-ci-jobs.md (vendored, 6 lines changed)
| @ -1,10 +1,8 @@ | |||||||
| --- | --- | ||||||
| name: Disable CI jobs (PyTorch Dev Infra only) | name: Disable CI jobs (PyTorch Dev Infra only) | ||||||
| about: Use this template to disable CI jobs | about: Use this template to disable CI jobs | ||||||
| title: DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME] | title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]" | ||||||
| labels: 'module: ci' | labels: "module: ci" | ||||||
| assignees: '' |  | ||||||
|  |  | ||||||
| --- | --- | ||||||
|  |  | ||||||
| > For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once | > For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once | ||||||
|  | |||||||
							
								
								
									
.github/actionlint.yaml (vendored, 3 lines changed)
| @ -22,9 +22,6 @@ self-hosted-runner: | |||||||
|     - linux.arm64.m7g.4xlarge |     - linux.arm64.m7g.4xlarge | ||||||
|     - linux.arm64.m7g.4xlarge.ephemeral |     - linux.arm64.m7g.4xlarge.ephemeral | ||||||
|     - linux.arm64.r7g.12xlarge.memory |     - linux.arm64.r7g.12xlarge.memory | ||||||
|     - linux.aws.h100 |  | ||||||
|     - linux.aws.h100.4 |  | ||||||
|     - linux.aws.h100.8 |  | ||||||
|     - linux.4xlarge.nvidia.gpu |     - linux.4xlarge.nvidia.gpu | ||||||
|     - linux.8xlarge.nvidia.gpu |     - linux.8xlarge.nvidia.gpu | ||||||
|     - linux.16xlarge.nvidia.gpu |     - linux.16xlarge.nvidia.gpu | ||||||
|  | |||||||
							
								
								
									
.github/actions/setup-win/action.yml (vendored, 2 lines changed)
| @ -59,7 +59,7 @@ runs: | |||||||
|         set -x |         set -x | ||||||
|  |  | ||||||
|         # Create new py_tmp env with python-version |         # Create new py_tmp env with python-version | ||||||
|         ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv |         ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp | ||||||
|  |  | ||||||
|         PYTHON3=$(${CONDA_RUN} -n py_tmp which python3) |         PYTHON3=$(${CONDA_RUN} -n py_tmp which python3) | ||||||
|         EXIT_CODE=$? |         EXIT_CODE=$? | ||||||
|  | |||||||
							
								
								
									
.github/ci_commit_pins/vllm.txt (vendored, 2 lines changed)
| @ -1 +1 @@ | |||||||
| 0307428d65acf5cf1a73a70a7722e076bbb83f22 | 367a480bd3534edf27a8dac3c6f7ea8af9d1ed45 | ||||||
|  | |||||||
							
								
								
									
.github/ci_commit_pins/xla.txt (vendored, 2 lines changed)
| @ -1 +1 @@ | |||||||
| 0fc62aa26a30ed7ca419d285f285cb5ba02c4394 | c77852e117bdf056c8e9a087e51d6f65cf6ba53d | ||||||
|  | |||||||
							
								
								
									
.github/merge_rules.yaml (vendored, 15 lines changed)
| @ -525,21 +525,6 @@ | |||||||
|   - Lint |   - Lint | ||||||
|   - pull |   - pull | ||||||
|  |  | ||||||
| - name: typechecking |  | ||||||
|   patterns: |  | ||||||
|   - 'pyrefly.toml' |  | ||||||
|   - 'mypy.ini' |  | ||||||
|   - 'mypy-strict.ini' |  | ||||||
|   approved_by: |  | ||||||
|   - lolpack |  | ||||||
|   - maggiemoss |  | ||||||
|   - ndmitchell |  | ||||||
|   - kinto0 |  | ||||||
|   mandatory_checks_name: |  | ||||||
|   - EasyCLA |  | ||||||
|   - Lint |  | ||||||
|   - pull |  | ||||||
|  |  | ||||||
| - name: superuser | - name: superuser | ||||||
|   patterns: |   patterns: | ||||||
|   - '*' |   - '*' | ||||||
|  | |||||||
							
								
								
									
.github/pytorch-probot.yml (vendored, 39 lines changed)
| @ -1,44 +1,41 @@ | |||||||
| tracking_issue: 24422 | tracking_issue: 24422 | ||||||
| ciflow_tracking_issue: 64124 | ciflow_tracking_issue: 64124 | ||||||
| ciflow_push_tags: | ciflow_push_tags: | ||||||
| - ciflow/b200 |  | ||||||
| - ciflow/b200-symm-mem |  | ||||||
| - ciflow/binaries | - ciflow/binaries | ||||||
| - ciflow/binaries_libtorch | - ciflow/binaries_libtorch | ||||||
| - ciflow/binaries_wheel | - ciflow/binaries_wheel | ||||||
| - ciflow/h100 | - ciflow/triton_binaries | ||||||
| - ciflow/h100-cutlass-backend |  | ||||||
| - ciflow/h100-distributed |  | ||||||
| - ciflow/h100-symm-mem |  | ||||||
| - ciflow/inductor | - ciflow/inductor | ||||||
| - ciflow/inductor-cu126 |  | ||||||
| - ciflow/inductor-micro-benchmark |  | ||||||
| - ciflow/inductor-micro-benchmark-cpu-x86 |  | ||||||
| - ciflow/inductor-perf-compare |  | ||||||
| - ciflow/inductor-perf-test-nightly-rocm |  | ||||||
| - ciflow/inductor-perf-test-nightly-x86-zen |  | ||||||
| - ciflow/inductor-periodic | - ciflow/inductor-periodic | ||||||
| - ciflow/inductor-rocm | - ciflow/inductor-rocm | ||||||
|  | - ciflow/inductor-perf-test-nightly-rocm | ||||||
|  | - ciflow/inductor-perf-compare | ||||||
|  | - ciflow/inductor-micro-benchmark | ||||||
|  | - ciflow/inductor-micro-benchmark-cpu-x86 | ||||||
|  | - ciflow/inductor-perf-test-nightly-x86-zen | ||||||
|  | - ciflow/inductor-cu126 | ||||||
| - ciflow/linux-aarch64 | - ciflow/linux-aarch64 | ||||||
| - ciflow/mps | - ciflow/mps | ||||||
| - ciflow/nightly | - ciflow/nightly | ||||||
| - ciflow/op-benchmark |  | ||||||
| - ciflow/periodic | - ciflow/periodic | ||||||
| - ciflow/periodic-rocm-mi300 | - ciflow/periodic-rocm-mi300 | ||||||
| - ciflow/pull |  | ||||||
| - ciflow/quantization-periodic |  | ||||||
| - ciflow/riscv64 |  | ||||||
| - ciflow/rocm | - ciflow/rocm | ||||||
| - ciflow/rocm-mi300 | - ciflow/rocm-mi300 | ||||||
| - ciflow/s390 | - ciflow/s390 | ||||||
|  | - ciflow/riscv64 | ||||||
| - ciflow/slow | - ciflow/slow | ||||||
| - ciflow/torchbench |  | ||||||
| - ciflow/triton_binaries |  | ||||||
| - ciflow/trunk | - ciflow/trunk | ||||||
| - ciflow/unstable | - ciflow/unstable | ||||||
| - ciflow/vllm |  | ||||||
| - ciflow/win-arm64 |  | ||||||
| - ciflow/xpu | - ciflow/xpu | ||||||
|  | - ciflow/vllm | ||||||
|  | - ciflow/torchbench | ||||||
|  | - ciflow/op-benchmark | ||||||
|  | - ciflow/pull | ||||||
|  | - ciflow/h100 | ||||||
|  | - ciflow/h100-distributed | ||||||
|  | - ciflow/win-arm64 | ||||||
|  | - ciflow/h100-symm-mem | ||||||
|  | - ciflow/h100-cutlass-backend | ||||||
| retryable_workflows: | retryable_workflows: | ||||||
| - pull | - pull | ||||||
| - trunk | - trunk | ||||||
| @ -47,4 +44,4 @@ retryable_workflows: | |||||||
| - inductor-A100-perf-nightly | - inductor-A100-perf-nightly | ||||||
| labeler_config: labeler.yml | labeler_config: labeler.yml | ||||||
| label_to_label_config: label_to_label.yml | label_to_label_config: label_to_label.yml | ||||||
| mergebot: true | mergebot: True | ||||||
|  | |||||||
| @ -30,7 +30,7 @@ CUDA_ARCHES_CUDNN_VERSION = { | |||||||
| } | } | ||||||
|  |  | ||||||
| # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this | # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this | ||||||
| ROCM_ARCHES = ["6.4", "7.0"] | ROCM_ARCHES = ["6.3", "6.4"] | ||||||
|  |  | ||||||
| XPU_ARCHES = ["xpu"] | XPU_ARCHES = ["xpu"] | ||||||
|  |  | ||||||
| @ -53,7 +53,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { | |||||||
|         "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | " |         "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | " | ||||||
|         "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | " |         "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | " | ||||||
|         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " |         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " | ||||||
|         "nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | " |         "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " | ||||||
|         "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | " |         "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | " | ||||||
|         "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | " |         "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | " | ||||||
|         "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | " |         "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | " | ||||||
| @ -70,7 +70,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { | |||||||
|         "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | " |         "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | " | ||||||
|         "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | " |         "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | " | ||||||
|         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " |         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " | ||||||
|         "nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | " |         "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " | ||||||
|         "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | " |         "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | " | ||||||
|         "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | " |         "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | " | ||||||
|         "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | " |         "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | " | ||||||
| @ -87,7 +87,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { | |||||||
|         "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | " |         "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | " | ||||||
|         "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | " |         "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | " | ||||||
|         "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | " |         "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | " | ||||||
|         "nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | " |         "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | " | ||||||
|         "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | " |         "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | " | ||||||
|         "nvidia-nvtx==13.0.39; platform_system == 'Linux' | " |         "nvidia-nvtx==13.0.39; platform_system == 'Linux' | " | ||||||
|         "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | " |         "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | " | ||||||
|  | |||||||
							
								
								
									
.github/scripts/generate_ci_workflows.py (vendored, 2 lines changed)
| @ -155,7 +155,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [ | |||||||
|         package_type="manywheel", |         package_type="manywheel", | ||||||
|         build_configs=generate_binary_build_matrix.generate_wheels_matrix( |         build_configs=generate_binary_build_matrix.generate_wheels_matrix( | ||||||
|             OperatingSystem.LINUX, |             OperatingSystem.LINUX, | ||||||
|             arches=["13.0"], |             arches=["12.8"], | ||||||
|             python_versions=["3.12"], |             python_versions=["3.12"], | ||||||
|         ), |         ), | ||||||
|         branches="main", |         branches="main", | ||||||
|  | |||||||
| @ -71,15 +71,12 @@ jobs: | |||||||
|     with:!{{ upload.binary_env_as_input(config) }} |     with:!{{ upload.binary_env_as_input(config) }} | ||||||
|       {%- if "aarch64" in build_environment %} |       {%- if "aarch64" in build_environment %} | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       {%- elif "s390x" in build_environment %} |       {%- elif "s390x" in build_environment %} | ||||||
|       runs_on: linux.s390x |       runs_on: linux.s390x | ||||||
|       ALPINE_IMAGE: "docker.io/s390x/alpine" |       ALPINE_IMAGE: "docker.io/s390x/alpine" | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|       {%- elif config["gpu_arch_type"] == "rocm" %} |  | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |  | ||||||
|       timeout-minutes: 300 |  | ||||||
|       {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %} |       {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %} | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.24xlarge.ephemeral |       runs_on: linux.24xlarge.ephemeral | ||||||
|  | |||||||
							
								
								
									
.github/workflows/_docs.yml (vendored, 2 lines changed)
| @ -67,7 +67,7 @@ jobs: | |||||||
|             # an OOM issue when running the job, so this upgrades the runner from 4xlarge |             # an OOM issue when running the job, so this upgrades the runner from 4xlarge | ||||||
|             # to the next available tier of 12xlarge. So much memory just to generate cpp |             # to the next available tier of 12xlarge. So much memory just to generate cpp | ||||||
|             # doc |             # doc | ||||||
|             runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory |             runner: ${{ inputs.runner_prefix }}linux.12xlarge | ||||||
|             # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now) |             # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now) | ||||||
|             # Let's try to figure out how this can be improved |             # Let's try to figure out how this can be improved | ||||||
|             timeout-minutes: 360 |             timeout-minutes: 360 | ||||||
|  | |||||||
							
								
								
									
.github/workflows/_get-changed-files.yml (vendored, 28 changes)
							| @ -2,12 +2,6 @@ name: Get Changed Files | |||||||
|  |  | ||||||
| on: | on: | ||||||
|   workflow_call: |   workflow_call: | ||||||
|     inputs: |  | ||||||
|       all_files: |  | ||||||
|         description: "Whether to return all files instead of just changed files" |  | ||||||
|         required: false |  | ||||||
|         type: boolean |  | ||||||
|         default: false |  | ||||||
|     outputs: |     outputs: | ||||||
|       changed-files: |       changed-files: | ||||||
|         description: "List of changed files (space-separated) or '*' if not in a PR" |         description: "List of changed files (space-separated) or '*' if not in a PR" | ||||||
| @ -32,23 +26,17 @@ jobs: | |||||||
|             # Get the PR number from the github context |             # Get the PR number from the github context | ||||||
|             PR_NUMBER="${{ github.event.number }}" |             PR_NUMBER="${{ github.event.number }}" | ||||||
|  |  | ||||||
|             # Check if all_files is requested |             # Use gh CLI to get changed files in the PR with explicit repo | ||||||
|             if [ "${{ inputs.all_files }}" = "true" ]; then |             CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') | ||||||
|               echo "all_files input is true, returning all files" |  | ||||||
|               echo "changed-files=*" >> "$GITHUB_OUTPUT" |  | ||||||
|             else |  | ||||||
|               # Use gh CLI to get changed files in the PR with explicit repo |  | ||||||
|               CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') |  | ||||||
|  |  | ||||||
|               if [ -z "$CHANGED_FILES" ]; then |             if [ -z "$CHANGED_FILES" ]; then | ||||||
|                 echo "No changed files found, setting to '*'" |               echo "No changed files found, setting to '*'" | ||||||
|                 CHANGED_FILES="*" |               CHANGED_FILES="*" | ||||||
|               fi |  | ||||||
|  |  | ||||||
|               echo "Changed files: $CHANGED_FILES" |  | ||||||
|               echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT" |  | ||||||
|             fi |             fi | ||||||
|  |  | ||||||
|  |             echo "Changed files: $CHANGED_FILES" | ||||||
|  |             echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT" | ||||||
|  |  | ||||||
|           else |           else | ||||||
|             echo "Not in PR context, setting changed files to '*'" |             echo "Not in PR context, setting changed files to '*'" | ||||||
|             echo "changed-files=*" >> "$GITHUB_OUTPUT" |             echo "changed-files=*" >> "$GITHUB_OUTPUT" | ||||||
|  | |||||||
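For context, the simpler of the two variants in this hunk (the one without the all_files short-circuit) reduces to the shell pattern below. This is a minimal sketch, not the workflow source itself; GH_TOKEN, REPO ("owner/name"), and PR_NUMBER are assumed to be provided by the surrounding job.

    #!/usr/bin/env bash
    # Sketch of the changed-files step: list the PR's files via the GitHub API,
    # drop removed ones, and publish a space-separated list as a step output.
    # Assumes GH_TOKEN, REPO, and PR_NUMBER are exported by the surrounding job.
    set -euo pipefail

    CHANGED_FILES=$(gh api "repos/${REPO}/pulls/${PR_NUMBER}/files" --paginate \
      --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')

    # Fall back to '*' (meaning "run everything") when nothing comes back.
    if [ -z "$CHANGED_FILES" ]; then
      CHANGED_FILES="*"
    fi

    echo "Changed files: $CHANGED_FILES"
    echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"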
							
								
								
									
.github/workflows/_linux-test.yml (vendored, 2 changes)
							| @ -273,8 +273,6 @@ jobs: | |||||||
|           TEST_CONFIG: ${{ matrix.config }} |           TEST_CONFIG: ${{ matrix.config }} | ||||||
|           SHARD_NUMBER: ${{ matrix.shard }} |           SHARD_NUMBER: ${{ matrix.shard }} | ||||||
|           NUM_TEST_SHARDS: ${{ matrix.num_shards }} |           NUM_TEST_SHARDS: ${{ matrix.num_shards }} | ||||||
|           EXTRA_FLAGS: ${{ matrix.extra_flags || '' }} |  | ||||||
|           OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }} |  | ||||||
|           REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} |           REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} | ||||||
|           CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} |           CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} | ||||||
|           VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} |           VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} | ||||||
|  | |||||||
							
								
								
									
.github/workflows/b200-symm-mem.yml (vendored, 60 changes)
							| @ -1,60 +0,0 @@ | |||||||
| name: Limited CI for symmetric memory tests on B200 |  | ||||||
|  |  | ||||||
| on: |  | ||||||
|   pull_request: |  | ||||||
|     paths: |  | ||||||
|       - .github/workflows/b200-symm-mem.yml |  | ||||||
|   workflow_dispatch: |  | ||||||
|   push: |  | ||||||
|     tags: |  | ||||||
|       - ciflow/b200-symm-mem/* |  | ||||||
|   schedule: |  | ||||||
|     - cron: 22 8 * * *  # about 1:22am PDT |  | ||||||
|  |  | ||||||
| concurrency: |  | ||||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} |  | ||||||
|   cancel-in-progress: true |  | ||||||
|  |  | ||||||
| permissions: |  | ||||||
|   id-token: write |  | ||||||
|   contents: read |  | ||||||
|  |  | ||||||
| jobs: |  | ||||||
|  |  | ||||||
|   get-label-type: |  | ||||||
|     if: github.repository_owner == 'pytorch' |  | ||||||
|     name: get-label-type |  | ||||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main |  | ||||||
|     with: |  | ||||||
|       triggering_actor: ${{ github.triggering_actor }} |  | ||||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} |  | ||||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} |  | ||||||
|       curr_ref_type: ${{ github.ref_type }} |  | ||||||
|  |  | ||||||
|   linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm: |  | ||||||
|     name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm |  | ||||||
|     uses: ./.github/workflows/_linux-build.yml |  | ||||||
|     needs: get-label-type |  | ||||||
|     with: |  | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |  | ||||||
|       runner: linux.12xlarge.memory |  | ||||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm |  | ||||||
|       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 |  | ||||||
|       cuda-arch-list: '10.0' |  | ||||||
|       test-matrix: | |  | ||||||
|         { include: [ |  | ||||||
|           { config: "b200-symm-mem", shard: 1, num_shards: 1, runner: "linux.dgx.b200.8" }, |  | ||||||
|         ]} |  | ||||||
|     secrets: inherit |  | ||||||
|  |  | ||||||
|   linux-jammy-cuda12_8-py3_10-gcc11-sm100-test: |  | ||||||
|     name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm |  | ||||||
|     uses: ./.github/workflows/_linux-test.yml |  | ||||||
|     needs: |  | ||||||
|       - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm |  | ||||||
|     with: |  | ||||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm |  | ||||||
|       docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.docker-image }} |  | ||||||
|       test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.test-matrix }} |  | ||||||
|       aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only |  | ||||||
|     secrets: inherit |  | ||||||
							
								
								
									
.github/workflows/build-almalinux-images.yml (vendored, 2 changes)
							| @ -36,7 +36,7 @@ jobs: | |||||||
|     runs-on: linux.9xlarge.ephemeral |     runs-on: linux.9xlarge.ephemeral | ||||||
|     strategy: |     strategy: | ||||||
|       matrix: |       matrix: | ||||||
|         tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "rocm7.0", "cpu"] |         tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"] | ||||||
|     steps: |     steps: | ||||||
|       - name: Build docker image |       - name: Build docker image | ||||||
|         uses: pytorch/pytorch/.github/actions/binary-docker-build@main |         uses: pytorch/pytorch/.github/actions/binary-docker-build@main | ||||||
|  | |||||||
							
								
								
									
.github/workflows/build-libtorch-images.yml (vendored, 2 changes)
							| @ -52,8 +52,8 @@ jobs: | |||||||
|           { tag: "cuda12.9" }, |           { tag: "cuda12.9" }, | ||||||
|           { tag: "cuda12.8" }, |           { tag: "cuda12.8" }, | ||||||
|           { tag: "cuda12.6" }, |           { tag: "cuda12.6" }, | ||||||
|  |           { tag: "rocm6.3"  }, | ||||||
|           { tag: "rocm6.4"  }, |           { tag: "rocm6.4"  }, | ||||||
|           { tag: "rocm7.0"  }, |  | ||||||
|           { tag: "cpu"      }, |           { tag: "cpu"      }, | ||||||
|         ] |         ] | ||||||
|     steps: |     steps: | ||||||
|  | |||||||
							
								
								
									
.github/workflows/build-magma-rocm-linux.yml (vendored, 2 changes)
							| @ -34,7 +34,7 @@ jobs: | |||||||
|       id-token: write |       id-token: write | ||||||
|     strategy: |     strategy: | ||||||
|       matrix: |       matrix: | ||||||
|         rocm_version: ["70", "64"] |         rocm_version: ["64", "63"] | ||||||
|     steps: |     steps: | ||||||
|       - name: Checkout PyTorch |       - name: Checkout PyTorch | ||||||
|         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 |         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | ||||||
|  | |||||||
							
								
								
									
.github/workflows/build-manywheel-images.yml (vendored, 2 changes)
							| @ -52,8 +52,8 @@ jobs: | |||||||
|           { name: "manylinuxaarch64-builder",       tag: "cuda13.0",          runner: "linux.arm64.2xlarge.ephemeral" }, |           { name: "manylinuxaarch64-builder",       tag: "cuda13.0",          runner: "linux.arm64.2xlarge.ephemeral" }, | ||||||
|           { name: "manylinuxaarch64-builder",       tag: "cuda12.8",          runner: "linux.arm64.2xlarge.ephemeral" }, |           { name: "manylinuxaarch64-builder",       tag: "cuda12.8",          runner: "linux.arm64.2xlarge.ephemeral" }, | ||||||
|           { name: "manylinuxaarch64-builder",       tag: "cuda12.6",          runner: "linux.arm64.2xlarge.ephemeral" }, |           { name: "manylinuxaarch64-builder",       tag: "cuda12.6",          runner: "linux.arm64.2xlarge.ephemeral" }, | ||||||
|  |           { name: "manylinux2_28-builder",          tag: "rocm6.3",           runner: "linux.9xlarge.ephemeral" }, | ||||||
|           { name: "manylinux2_28-builder",          tag: "rocm6.4",           runner: "linux.9xlarge.ephemeral" }, |           { name: "manylinux2_28-builder",          tag: "rocm6.4",           runner: "linux.9xlarge.ephemeral" }, | ||||||
|           { name: "manylinux2_28-builder",          tag: "rocm7.0",           runner: "linux.9xlarge.ephemeral" }, |  | ||||||
|           { name: "manylinux2_28-builder",          tag: "cpu",               runner: "linux.9xlarge.ephemeral" }, |           { name: "manylinux2_28-builder",          tag: "cpu",               runner: "linux.9xlarge.ephemeral" }, | ||||||
|           { name: "manylinux2_28_aarch64-builder",  tag: "cpu-aarch64",       runner: "linux.arm64.2xlarge.ephemeral" }, |           { name: "manylinux2_28_aarch64-builder",  tag: "cpu-aarch64",       runner: "linux.arm64.2xlarge.ephemeral" }, | ||||||
|           { name: "manylinuxcxx11-abi-builder",     tag: "cpu-cxx11-abi",     runner: "linux.9xlarge.ephemeral" }, |           { name: "manylinuxcxx11-abi-builder",     tag: "cpu-cxx11-abi",     runner: "linux.9xlarge.ephemeral" }, | ||||||
|  | |||||||
							
								
								
									
.github/workflows/build-triton-wheel.yml (vendored, 9 changes)
							| @ -50,12 +50,12 @@ jobs: | |||||||
|     strategy: |     strategy: | ||||||
|       fail-fast: false |       fail-fast: false | ||||||
|       matrix: |       matrix: | ||||||
|         py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] |         py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] | ||||||
|         device: ["cuda", "rocm", "xpu", "aarch64"] |         device: ["cuda", "rocm", "xpu", "aarch64"] | ||||||
|         docker-image: ["pytorch/manylinux2_28-builder:cpu"] |         docker-image: ["pytorch/manylinux2_28-builder:cpu"] | ||||||
|         include: |         include: | ||||||
|           - device: "rocm" |           - device: "rocm" | ||||||
|             rocm_version: "7.0" |             rocm_version: "6.4" | ||||||
|             runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" |             runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" | ||||||
|           - device: "cuda" |           - device: "cuda" | ||||||
|             rocm_version: "" |             rocm_version: "" | ||||||
| @ -108,6 +108,9 @@ jobs: | |||||||
|  |  | ||||||
|           # Determine python executable for given version |           # Determine python executable for given version | ||||||
|           case $PY_VERS in |           case $PY_VERS in | ||||||
|  |           3.9) | ||||||
|  |             PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python | ||||||
|  |             ;; | ||||||
|           3.10) |           3.10) | ||||||
|             PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python |             PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python | ||||||
|             ;; |             ;; | ||||||
| @ -191,7 +194,7 @@ jobs: | |||||||
|     strategy: |     strategy: | ||||||
|       fail-fast: false |       fail-fast: false | ||||||
|       matrix: |       matrix: | ||||||
|         py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] |         py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] | ||||||
|         device: ["xpu"] |         device: ["xpu"] | ||||||
|     timeout-minutes: 40 |     timeout-minutes: 40 | ||||||
|     env: |     env: | ||||||
|  | |||||||
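For context on the interpreter selection touched above: the manylinux builder image lays out each CPython under /opt/python/<tag>/bin/python, and the step maps the requested version onto that path. A minimal sketch of the mapping follows; only the 3.10 path appears verbatim in the hunk, and the other tags are assumed to follow the same pattern.

    #!/usr/bin/env bash
    # Sketch of mapping a requested Python version to the interpreter shipped
    # in the manylinux builder image. Tags beyond 3.10 are assumed to follow
    # the /opt/python/cp3XX-cp3XX layout shown in the hunk.
    set -euo pipefail

    PY_VERS="${1:?usage: $0 <python-version>}"

    case "$PY_VERS" in
      3.10) PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python ;;
      3.11) PYTHON_EXECUTABLE=/opt/python/cp311-cp311/bin/python ;;
      3.12) PYTHON_EXECUTABLE=/opt/python/cp312-cp312/bin/python ;;
      3.13) PYTHON_EXECUTABLE=/opt/python/cp313-cp313/bin/python ;;
      *)    echo "unsupported Python version: $PY_VERS" >&2; exit 1 ;;
    esac

    "$PYTHON_EXECUTABLE" --version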
							
								
								
									
.github/workflows/create_release.yml (vendored, 59 changes)
							| @ -35,7 +35,6 @@ jobs: | |||||||
|       contents: write |       contents: write | ||||||
|     outputs: |     outputs: | ||||||
|       pt_release_name: ${{ steps.release_name.outputs.pt_release_name }} |       pt_release_name: ${{ steps.release_name.outputs.pt_release_name }} | ||||||
|       pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }} |  | ||||||
|     steps: |     steps: | ||||||
|       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 |       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | ||||||
|         with: |         with: | ||||||
| @ -54,12 +53,8 @@ jobs: | |||||||
|           tag_or_branch="${tag_or_branch#refs/heads/}" |           tag_or_branch="${tag_or_branch#refs/heads/}" | ||||||
|           # replace directory separators with _ in branch name |           # replace directory separators with _ in branch name | ||||||
|           tag_or_branch="${tag_or_branch//\//_}" |           tag_or_branch="${tag_or_branch//\//_}" | ||||||
|           torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')" |           echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV" | ||||||
|           { |           echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV" | ||||||
|             echo "PT_RELEASE_NAME=pytorch-$tag_or_branch"; |  | ||||||
|             echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz"; |  | ||||||
|             echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz"; |  | ||||||
|           } >> "$GITHUB_ENV" |  | ||||||
|       - name: Checkout optional submodules |       - name: Checkout optional submodules | ||||||
|         run: python3 tools/optional_submodules.py |         run: python3 tools/optional_submodules.py | ||||||
|       - name: Copy docs requirements for inclusion |       - name: Copy docs requirements for inclusion | ||||||
| @ -69,47 +64,30 @@ jobs: | |||||||
|           cp .ci/docker/requirements-docs.txt docs/requirements.txt |           cp .ci/docker/requirements-docs.txt docs/requirements.txt | ||||||
|       - name: Create source distribution |       - name: Create source distribution | ||||||
|         run: | |         run: | | ||||||
|           # Create new folder with specified name so extracting the archive yields that |             # Create new folder with specified name so extracting the archive yields that | ||||||
|           rm -rf "/tmp/$PT_RELEASE_NAME" |             rm -rf "/tmp/$PT_RELEASE_NAME" | ||||||
|           cp -r "$PWD" "/tmp/$PT_RELEASE_NAME" |             cp -r "$PWD" "/tmp/$PT_RELEASE_NAME" | ||||||
|           mv "/tmp/$PT_RELEASE_NAME" . |             mv "/tmp/$PT_RELEASE_NAME" . | ||||||
|           # Cleanup |             # Cleanup | ||||||
|           rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci} |             rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci} | ||||||
|           find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true |             find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true | ||||||
|           # Create archive |             # Create archive | ||||||
|           tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME" |             tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME" | ||||||
|           echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")" |             echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")" | ||||||
|       - name: Create PEP 517 compatible source distribution |  | ||||||
|         run: | |  | ||||||
|           pip install build==1.2.2.post1 || exit 1 |  | ||||||
|           python -m build --sdist || exit 1 |  | ||||||
|           cd dist || exit 1 |  | ||||||
|       - name: Upload source distribution for release |       - name: Upload source distribution for release | ||||||
|         if: ${{ github.event_name == 'release' }} |         if: ${{ github.event_name == 'release' }} | ||||||
|         uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2 |         uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2 | ||||||
|         with: |         with: | ||||||
|           files: | |           files: ${{env.PT_RELEASE_FILE}} | ||||||
|             ${{ env.PT_RELEASE_FILE }} |       - name: Upload source distribution to GHA artifacts for release tags | ||||||
|             ${{ env.PT_PEP517_RELEASE_FILE }} |  | ||||||
|       - name: Upload source distribution to GHA artifacts  # for release tags |  | ||||||
|         if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} |         if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} | ||||||
|         uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 |         uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 | ||||||
|         with: |         with: | ||||||
|           name: ${{ env.PT_RELEASE_FILE }} |           name: ${{ env.PT_RELEASE_FILE }} | ||||||
|           path: ${{ env.PT_RELEASE_FILE }} |           path: ${{ env.PT_RELEASE_FILE }} | ||||||
|       - name: Upload PEP 517 source distribution to GHA artifacts  # for release tags |  | ||||||
|         if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} |  | ||||||
|         uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 |  | ||||||
|         with: |  | ||||||
|           name: ${{ env.PT_PEP517_RELEASE_FILE }} |  | ||||||
|           path: dist/${{ env.PT_PEP517_RELEASE_FILE }} |  | ||||||
|       - name: Set output |       - name: Set output | ||||||
|         id: release_name |         id: release_name | ||||||
|         run: | |         run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}" | ||||||
|           { |  | ||||||
|             echo "pt_release_name=${{ env.PT_RELEASE_FILE }}"; |  | ||||||
|             echo "pt_pep517_release_name=${{ env.PT_PEP517_RELEASE_FILE }}"; |  | ||||||
|           } >> "${GITHUB_OUTPUT}" |  | ||||||
|  |  | ||||||
|   upload_source_code_to_s3: |   upload_source_code_to_s3: | ||||||
|     if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} |     if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} | ||||||
| @ -125,9 +103,6 @@ jobs: | |||||||
|       - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 |       - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 | ||||||
|         with: |         with: | ||||||
|           name: ${{ needs.release.outputs.pt_release_name }} |           name: ${{ needs.release.outputs.pt_release_name }} | ||||||
|       - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 |  | ||||||
|         with: |  | ||||||
|           name: ${{ needs.release.outputs.pt_pep517_release_name }} |  | ||||||
|       - name: Configure AWS credentials(PyTorch account) |       - name: Configure AWS credentials(PyTorch account) | ||||||
|         uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 |         uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 | ||||||
|         with: |         with: | ||||||
| @ -138,9 +113,7 @@ jobs: | |||||||
|           s3-bucket: pytorch |           s3-bucket: pytorch | ||||||
|           s3-prefix: source_code/test |           s3-prefix: source_code/test | ||||||
|           if-no-files-found: warn |           if-no-files-found: warn | ||||||
|           path: | |           path: ${{ needs.release.outputs.pt_release_name }} | ||||||
|             ${{ needs.release.outputs.pt_release_name }} |  | ||||||
|             ${{ needs.release.outputs.pt_pep517_release_name }} |  | ||||||
|  |  | ||||||
| concurrency: | concurrency: | ||||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} |   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} | ||||||
|  | |||||||
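One detail worth noting in the hunk above: the fuller variant (the one that also produces a PEP 517 sdist) writes several values with a single grouped redirection to $GITHUB_ENV rather than repeating ">>" on every line. A minimal sketch of that pattern, with tag_or_branch and torch_version as placeholder values rather than the real computed ones:

    #!/usr/bin/env bash
    # Sketch of the grouped-redirect pattern used when a release step exports
    # several values at once. tag_or_branch and torch_version are placeholders
    # standing in for values the real step computes earlier.
    set -euo pipefail

    tag_or_branch="release_2.x"   # placeholder
    torch_version="2.x.0"         # placeholder

    # One redirection for the whole group instead of a ">>" per line.
    {
      echo "PT_RELEASE_NAME=pytorch-${tag_or_branch}"
      echo "PT_RELEASE_FILE=pytorch-${tag_or_branch}.tar.gz"
      echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz"
    } >> "$GITHUB_ENV"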
							
								
								
									
.github/workflows/docker-builds.yml (vendored, 2 changes)
							| @ -70,7 +70,7 @@ jobs: | |||||||
|           pytorch-linux-jammy-py3-clang18-asan, |           pytorch-linux-jammy-py3-clang18-asan, | ||||||
|           pytorch-linux-jammy-py3-clang12-onnx, |           pytorch-linux-jammy-py3-clang12-onnx, | ||||||
|           pytorch-linux-jammy-linter, |           pytorch-linux-jammy-linter, | ||||||
|           pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter, |           pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, | ||||||
|           pytorch-linux-jammy-py3-clang12-executorch, |           pytorch-linux-jammy-py3-clang12-executorch, | ||||||
|           pytorch-linux-jammy-py3.12-triton-cpu, |           pytorch-linux-jammy-py3.12-triton-cpu, | ||||||
|           pytorch-linux-noble-riscv64-py3.12-gcc14 |           pytorch-linux-noble-riscv64-py3.12-gcc14 | ||||||
|  | |||||||
							
								
								
									
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (generated, vendored, 98 changes)
							| @ -62,7 +62,7 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 |       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 | ||||||
|       DESIRED_PYTHON: "3.10" |       DESIRED_PYTHON: "3.10" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_10-cpu-aarch64 |       build_name: manywheel-py3_10-cpu-aarch64 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
| @ -128,11 +128,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 | ||||||
|       DESIRED_PYTHON: "3.10" |       DESIRED_PYTHON: "3.10" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_10-cuda-aarch64-12_6 |       build_name: manywheel-py3_10-cuda-aarch64-12_6 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -174,11 +174,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 | ||||||
|       DESIRED_PYTHON: "3.10" |       DESIRED_PYTHON: "3.10" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_10-cuda-aarch64-12_8 |       build_name: manywheel-py3_10-cuda-aarch64-12_8 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -220,11 +220,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 |       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 | ||||||
|       DESIRED_PYTHON: "3.10" |       DESIRED_PYTHON: "3.10" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_10-cuda-aarch64-13_0 |       build_name: manywheel-py3_10-cuda-aarch64-13_0 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -265,7 +265,7 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 |       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 | ||||||
|       DESIRED_PYTHON: "3.11" |       DESIRED_PYTHON: "3.11" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_11-cpu-aarch64 |       build_name: manywheel-py3_11-cpu-aarch64 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
| @ -331,11 +331,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 | ||||||
|       DESIRED_PYTHON: "3.11" |       DESIRED_PYTHON: "3.11" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_11-cuda-aarch64-12_6 |       build_name: manywheel-py3_11-cuda-aarch64-12_6 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -377,11 +377,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 | ||||||
|       DESIRED_PYTHON: "3.11" |       DESIRED_PYTHON: "3.11" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_11-cuda-aarch64-12_8 |       build_name: manywheel-py3_11-cuda-aarch64-12_8 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -423,11 +423,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 |       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 | ||||||
|       DESIRED_PYTHON: "3.11" |       DESIRED_PYTHON: "3.11" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_11-cuda-aarch64-13_0 |       build_name: manywheel-py3_11-cuda-aarch64-13_0 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -468,7 +468,7 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 |       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 | ||||||
|       DESIRED_PYTHON: "3.12" |       DESIRED_PYTHON: "3.12" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_12-cpu-aarch64 |       build_name: manywheel-py3_12-cpu-aarch64 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
| @ -534,11 +534,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 | ||||||
|       DESIRED_PYTHON: "3.12" |       DESIRED_PYTHON: "3.12" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_12-cuda-aarch64-12_6 |       build_name: manywheel-py3_12-cuda-aarch64-12_6 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -580,11 +580,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 | ||||||
|       DESIRED_PYTHON: "3.12" |       DESIRED_PYTHON: "3.12" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_12-cuda-aarch64-12_8 |       build_name: manywheel-py3_12-cuda-aarch64-12_8 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -626,11 +626,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 |       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 | ||||||
|       DESIRED_PYTHON: "3.12" |       DESIRED_PYTHON: "3.12" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_12-cuda-aarch64-13_0 |       build_name: manywheel-py3_12-cuda-aarch64-13_0 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -671,7 +671,7 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 |       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 | ||||||
|       DESIRED_PYTHON: "3.13" |       DESIRED_PYTHON: "3.13" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_13-cpu-aarch64 |       build_name: manywheel-py3_13-cpu-aarch64 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
| @ -737,11 +737,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 | ||||||
|       DESIRED_PYTHON: "3.13" |       DESIRED_PYTHON: "3.13" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_13-cuda-aarch64-12_6 |       build_name: manywheel-py3_13-cuda-aarch64-12_6 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -783,11 +783,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 | ||||||
|       DESIRED_PYTHON: "3.13" |       DESIRED_PYTHON: "3.13" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_13-cuda-aarch64-12_8 |       build_name: manywheel-py3_13-cuda-aarch64-12_8 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -829,11 +829,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 |       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 | ||||||
|       DESIRED_PYTHON: "3.13" |       DESIRED_PYTHON: "3.13" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_13-cuda-aarch64-13_0 |       build_name: manywheel-py3_13-cuda-aarch64-13_0 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -874,7 +874,7 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 |       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 | ||||||
|       DESIRED_PYTHON: "3.13t" |       DESIRED_PYTHON: "3.13t" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_13t-cpu-aarch64 |       build_name: manywheel-py3_13t-cpu-aarch64 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
| @ -940,11 +940,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 | ||||||
|       DESIRED_PYTHON: "3.13t" |       DESIRED_PYTHON: "3.13t" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_13t-cuda-aarch64-12_6 |       build_name: manywheel-py3_13t-cuda-aarch64-12_6 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -986,11 +986,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 | ||||||
|       DESIRED_PYTHON: "3.13t" |       DESIRED_PYTHON: "3.13t" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_13t-cuda-aarch64-12_8 |       build_name: manywheel-py3_13t-cuda-aarch64-12_8 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -1032,11 +1032,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 |       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 | ||||||
|       DESIRED_PYTHON: "3.13t" |       DESIRED_PYTHON: "3.13t" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_13t-cuda-aarch64-13_0 |       build_name: manywheel-py3_13t-cuda-aarch64-13_0 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -1077,7 +1077,7 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 |       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 | ||||||
|       DESIRED_PYTHON: "3.14" |       DESIRED_PYTHON: "3.14" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_14-cpu-aarch64 |       build_name: manywheel-py3_14-cpu-aarch64 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
| @ -1143,11 +1143,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 | ||||||
|       DESIRED_PYTHON: "3.14" |       DESIRED_PYTHON: "3.14" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_14-cuda-aarch64-12_6 |       build_name: manywheel-py3_14-cuda-aarch64-12_6 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -1189,11 +1189,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 | ||||||
|       DESIRED_PYTHON: "3.14" |       DESIRED_PYTHON: "3.14" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_14-cuda-aarch64-12_8 |       build_name: manywheel-py3_14-cuda-aarch64-12_8 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -1235,11 +1235,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 |       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 | ||||||
|       DESIRED_PYTHON: "3.14" |       DESIRED_PYTHON: "3.14" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_14-cuda-aarch64-13_0 |       build_name: manywheel-py3_14-cuda-aarch64-13_0 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -1280,7 +1280,7 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 |       DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 | ||||||
|       DESIRED_PYTHON: "3.14t" |       DESIRED_PYTHON: "3.14t" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_14t-cpu-aarch64 |       build_name: manywheel-py3_14t-cpu-aarch64 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
| @ -1346,11 +1346,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.6 | ||||||
|       DESIRED_PYTHON: "3.14t" |       DESIRED_PYTHON: "3.14t" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_14t-cuda-aarch64-12_6 |       build_name: manywheel-py3_14t-cuda-aarch64-12_6 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -1392,11 +1392,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 | ||||||
|       DESIRED_PYTHON: "3.14t" |       DESIRED_PYTHON: "3.14t" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_14t-cuda-aarch64-12_8 |       build_name: manywheel-py3_14t-cuda-aarch64-12_8 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
| @ -1438,11 +1438,11 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 |       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 | ||||||
|       DESIRED_PYTHON: "3.14t" |       DESIRED_PYTHON: "3.14t" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.arm64.r7g.12xlarge.memory |       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||||
|       ALPINE_IMAGE: "arm64v8/alpine" |       ALPINE_IMAGE: "arm64v8/alpine" | ||||||
|       build_name: manywheel-py3_14t-cuda-aarch64-13_0 |       build_name: manywheel-py3_14t-cuda-aarch64-13_0 | ||||||
|       build_environment: linux-aarch64-binary-manywheel |       build_environment: linux-aarch64-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' | ||||||
|       timeout-minutes: 420 |       timeout-minutes: 420 | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  | |||||||
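The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values above are pipe-separated PEP 508 requirement strings whose environment markers gate each dependency to Linux. As a rough sketch only (not part of the generated workflow), and assuming the third-party `packaging` library is available, such a value could be split and its markers evaluated like this:

    # Sketch: split a PYTORCH_EXTRA_INSTALL_REQUIREMENTS-style value into PEP 508
    # requirements and evaluate each environment marker on the current platform.
    # Assumes the third-party `packaging` library is installed.
    from packaging.requirements import Requirement

    extra = (
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux'"
    )

    for raw in extra.split(" | "):
        req = Requirement(raw.strip())
        applies = req.marker.evaluate() if req.marker is not None else True
        print(f"{req.name}{req.specifier}: {'install' if applies else 'skip'}")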
							
								
								
									
230  .github/workflows/generated-linux-binary-libtorch-nightly.yml  (generated, vendored)
| @ -316,6 +316,120 @@ jobs: | |||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|     uses: ./.github/workflows/_binary-upload.yml |     uses: ./.github/workflows/_binary-upload.yml | ||||||
|  |  | ||||||
|  |   libtorch-rocm6_3-shared-with-deps-release-build: | ||||||
|  |     if: ${{ github.repository_owner == 'pytorch' }} | ||||||
|  |     uses: ./.github/workflows/_binary-build-linux.yml | ||||||
|  |     needs: get-label-type | ||||||
|  |     with: | ||||||
|  |       PYTORCH_ROOT: /pytorch | ||||||
|  |       PACKAGE_TYPE: libtorch | ||||||
|  |       # TODO: This is a legacy variable that we eventually want to get rid of in | ||||||
|  |       #       favor of GPU_ARCH_VERSION | ||||||
|  |       DESIRED_CUDA: rocm6.3 | ||||||
|  |       GPU_ARCH_VERSION: "6.3" | ||||||
|  |       GPU_ARCH_TYPE: rocm | ||||||
|  |       DOCKER_IMAGE: libtorch-cxx11-builder | ||||||
|  |       DOCKER_IMAGE_TAG_PREFIX: rocm6.3 | ||||||
|  |       LIBTORCH_CONFIG: release | ||||||
|  |       LIBTORCH_VARIANT: shared-with-deps | ||||||
|  |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|  |       build_name: libtorch-rocm6_3-shared-with-deps-release | ||||||
|  |       build_environment: linux-binary-libtorch | ||||||
|  |     secrets: | ||||||
|  |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |   libtorch-rocm6_3-shared-with-deps-release-test:  # Testing | ||||||
|  |     if: ${{ github.repository_owner == 'pytorch' }} | ||||||
|  |     needs: | ||||||
|  |       - libtorch-rocm6_3-shared-with-deps-release-build | ||||||
|  |       - get-label-type | ||||||
|  |     runs-on: linux.rocm.gpu.mi250 | ||||||
|  |     timeout-minutes: 240 | ||||||
|  |     env: | ||||||
|  |       PYTORCH_ROOT: /pytorch | ||||||
|  |       PACKAGE_TYPE: libtorch | ||||||
|  |       # TODO: This is a legacy variable that we eventually want to get rid of in | ||||||
|  |       #       favor of GPU_ARCH_VERSION | ||||||
|  |       DESIRED_CUDA: rocm6.3 | ||||||
|  |       GPU_ARCH_VERSION: "6.3" | ||||||
|  |       GPU_ARCH_TYPE: rocm | ||||||
|  |       SKIP_ALL_TESTS: 1 | ||||||
|  |       DOCKER_IMAGE: libtorch-cxx11-builder | ||||||
|  |       DOCKER_IMAGE_TAG_PREFIX: rocm6.3 | ||||||
|  |       LIBTORCH_CONFIG: release | ||||||
|  |       LIBTORCH_VARIANT: shared-with-deps | ||||||
|  |     steps: | ||||||
|  |       - name: Setup ROCm | ||||||
|  |         uses: ./.github/actions/setup-rocm | ||||||
|  |       - uses: actions/download-artifact@v4.1.7 | ||||||
|  |         name: Download Build Artifacts | ||||||
|  |         with: | ||||||
|  |           name: libtorch-rocm6_3-shared-with-deps-release | ||||||
|  |           path: "${{ runner.temp }}/artifacts/" | ||||||
|  |       - name: Checkout PyTorch | ||||||
|  |         uses: actions/checkout@v4 | ||||||
|  |         with: | ||||||
|  |           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} | ||||||
|  |           submodules: recursive | ||||||
|  |           path: pytorch | ||||||
|  |           show-progress: false | ||||||
|  |       - name: Clean PyTorch checkout | ||||||
|  |         run: | | ||||||
|  |           # Remove any artifacts from the previous checkouts | ||||||
|  |           git clean -fxd | ||||||
|  |         working-directory: pytorch | ||||||
|  |       - name: ROCm set GPU_FLAG | ||||||
|  |         run: | | ||||||
|  |           echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" | ||||||
|  |       - name: configure aws credentials | ||||||
|  |         id: aws_creds | ||||||
|  |         if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} | ||||||
|  |         uses: aws-actions/configure-aws-credentials@v4 | ||||||
|  |         with: | ||||||
|  |           role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only | ||||||
|  |           aws-region: us-east-1 | ||||||
|  |           role-duration-seconds: 18000 | ||||||
|  |       - name: Calculate docker image | ||||||
|  |         id: calculate-docker-image | ||||||
|  |         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main | ||||||
|  |         with: | ||||||
|  |           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} | ||||||
|  |           docker-image-name: libtorch-cxx11-builder | ||||||
|  |           custom-tag-prefix: rocm6.3 | ||||||
|  |           docker-build-dir: .ci/docker | ||||||
|  |           working-directory: pytorch | ||||||
|  |       - name: Pull Docker image | ||||||
|  |         uses: pytorch/test-infra/.github/actions/pull-docker-image@main | ||||||
|  |         with: | ||||||
|  |           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} | ||||||
|  |       - name: Test Pytorch binary | ||||||
|  |         uses: ./pytorch/.github/actions/test-pytorch-binary | ||||||
|  |         env: | ||||||
|  |           DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} | ||||||
|  |       - name: Teardown ROCm | ||||||
|  |         uses: ./.github/actions/teardown-rocm | ||||||
|  |   libtorch-rocm6_3-shared-with-deps-release-upload:  # Uploading | ||||||
|  |     if: ${{ github.repository_owner == 'pytorch' }} | ||||||
|  |     permissions: | ||||||
|  |       id-token: write | ||||||
|  |       contents: read | ||||||
|  |     needs: libtorch-rocm6_3-shared-with-deps-release-test | ||||||
|  |     with: | ||||||
|  |       PYTORCH_ROOT: /pytorch | ||||||
|  |       PACKAGE_TYPE: libtorch | ||||||
|  |       # TODO: This is a legacy variable that we eventually want to get rid of in | ||||||
|  |       #       favor of GPU_ARCH_VERSION | ||||||
|  |       DESIRED_CUDA: rocm6.3 | ||||||
|  |       GPU_ARCH_VERSION: "6.3" | ||||||
|  |       GPU_ARCH_TYPE: rocm | ||||||
|  |       DOCKER_IMAGE: libtorch-cxx11-builder | ||||||
|  |       DOCKER_IMAGE_TAG_PREFIX: rocm6.3 | ||||||
|  |       LIBTORCH_CONFIG: release | ||||||
|  |       LIBTORCH_VARIANT: shared-with-deps | ||||||
|  |       build_name: libtorch-rocm6_3-shared-with-deps-release | ||||||
|  |     secrets: | ||||||
|  |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|  |     uses: ./.github/workflows/_binary-upload.yml | ||||||
|  |  | ||||||
|   libtorch-rocm6_4-shared-with-deps-release-build: |   libtorch-rocm6_4-shared-with-deps-release-build: | ||||||
|     if: ${{ github.repository_owner == 'pytorch' }} |     if: ${{ github.repository_owner == 'pytorch' }} | ||||||
|     uses: ./.github/workflows/_binary-build-linux.yml |     uses: ./.github/workflows/_binary-build-linux.yml | ||||||
| @ -333,7 +447,6 @@ jobs: | |||||||
|       LIBTORCH_CONFIG: release |       LIBTORCH_CONFIG: release | ||||||
|       LIBTORCH_VARIANT: shared-with-deps |       LIBTORCH_VARIANT: shared-with-deps | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       timeout-minutes: 300 |  | ||||||
|       build_name: libtorch-rocm6_4-shared-with-deps-release |       build_name: libtorch-rocm6_4-shared-with-deps-release | ||||||
|       build_environment: linux-binary-libtorch |       build_environment: linux-binary-libtorch | ||||||
|     secrets: |     secrets: | ||||||
| @ -430,118 +543,3 @@ jobs: | |||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|     uses: ./.github/workflows/_binary-upload.yml |     uses: ./.github/workflows/_binary-upload.yml | ||||||
|  |  | ||||||
|   libtorch-rocm7_0-shared-with-deps-release-build: |  | ||||||
|     if: ${{ github.repository_owner == 'pytorch' }} |  | ||||||
|     uses: ./.github/workflows/_binary-build-linux.yml |  | ||||||
|     needs: get-label-type |  | ||||||
|     with: |  | ||||||
|       PYTORCH_ROOT: /pytorch |  | ||||||
|       PACKAGE_TYPE: libtorch |  | ||||||
|       # TODO: This is a legacy variable that we eventually want to get rid of in |  | ||||||
|       #       favor of GPU_ARCH_VERSION |  | ||||||
|       DESIRED_CUDA: rocm7.0 |  | ||||||
|       GPU_ARCH_VERSION: "7.0" |  | ||||||
|       GPU_ARCH_TYPE: rocm |  | ||||||
|       DOCKER_IMAGE: libtorch-cxx11-builder |  | ||||||
|       DOCKER_IMAGE_TAG_PREFIX: rocm7.0 |  | ||||||
|       LIBTORCH_CONFIG: release |  | ||||||
|       LIBTORCH_VARIANT: shared-with-deps |  | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |  | ||||||
|       timeout-minutes: 300 |  | ||||||
|       build_name: libtorch-rocm7_0-shared-with-deps-release |  | ||||||
|       build_environment: linux-binary-libtorch |  | ||||||
|     secrets: |  | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |  | ||||||
|   libtorch-rocm7_0-shared-with-deps-release-test:  # Testing |  | ||||||
|     if: ${{ github.repository_owner == 'pytorch' }} |  | ||||||
|     needs: |  | ||||||
|       - libtorch-rocm7_0-shared-with-deps-release-build |  | ||||||
|       - get-label-type |  | ||||||
|     runs-on: linux.rocm.gpu.mi250 |  | ||||||
|     timeout-minutes: 240 |  | ||||||
|     env: |  | ||||||
|       PYTORCH_ROOT: /pytorch |  | ||||||
|       PACKAGE_TYPE: libtorch |  | ||||||
|       # TODO: This is a legacy variable that we eventually want to get rid of in |  | ||||||
|       #       favor of GPU_ARCH_VERSION |  | ||||||
|       DESIRED_CUDA: rocm7.0 |  | ||||||
|       GPU_ARCH_VERSION: "7.0" |  | ||||||
|       GPU_ARCH_TYPE: rocm |  | ||||||
|       SKIP_ALL_TESTS: 1 |  | ||||||
|       DOCKER_IMAGE: libtorch-cxx11-builder |  | ||||||
|       DOCKER_IMAGE_TAG_PREFIX: rocm7.0 |  | ||||||
|       LIBTORCH_CONFIG: release |  | ||||||
|       LIBTORCH_VARIANT: shared-with-deps |  | ||||||
|     steps: |  | ||||||
|       - name: Setup ROCm |  | ||||||
|         uses: ./.github/actions/setup-rocm |  | ||||||
|       - uses: actions/download-artifact@v4.1.7 |  | ||||||
|         name: Download Build Artifacts |  | ||||||
|         with: |  | ||||||
|           name: libtorch-rocm7_0-shared-with-deps-release |  | ||||||
|           path: "${{ runner.temp }}/artifacts/" |  | ||||||
|       - name: Checkout PyTorch |  | ||||||
|         uses: actions/checkout@v4 |  | ||||||
|         with: |  | ||||||
|           ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} |  | ||||||
|           submodules: recursive |  | ||||||
|           path: pytorch |  | ||||||
|           show-progress: false |  | ||||||
|       - name: Clean PyTorch checkout |  | ||||||
|         run: | |  | ||||||
|           # Remove any artifacts from the previous checkouts |  | ||||||
|           git clean -fxd |  | ||||||
|         working-directory: pytorch |  | ||||||
|       - name: ROCm set GPU_FLAG |  | ||||||
|         run: | |  | ||||||
|           echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" |  | ||||||
|       - name: configure aws credentials |  | ||||||
|         id: aws_creds |  | ||||||
|         if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} |  | ||||||
|         uses: aws-actions/configure-aws-credentials@v4 |  | ||||||
|         with: |  | ||||||
|           role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only |  | ||||||
|           aws-region: us-east-1 |  | ||||||
|           role-duration-seconds: 18000 |  | ||||||
|       - name: Calculate docker image |  | ||||||
|         id: calculate-docker-image |  | ||||||
|         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main |  | ||||||
|         with: |  | ||||||
|           docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} |  | ||||||
|           docker-image-name: libtorch-cxx11-builder |  | ||||||
|           custom-tag-prefix: rocm7.0 |  | ||||||
|           docker-build-dir: .ci/docker |  | ||||||
|           working-directory: pytorch |  | ||||||
|       - name: Pull Docker image |  | ||||||
|         uses: pytorch/test-infra/.github/actions/pull-docker-image@main |  | ||||||
|         with: |  | ||||||
|           docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} |  | ||||||
|       - name: Test Pytorch binary |  | ||||||
|         uses: ./pytorch/.github/actions/test-pytorch-binary |  | ||||||
|         env: |  | ||||||
|           DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} |  | ||||||
|       - name: Teardown ROCm |  | ||||||
|         uses: ./.github/actions/teardown-rocm |  | ||||||
|   libtorch-rocm7_0-shared-with-deps-release-upload:  # Uploading |  | ||||||
|     if: ${{ github.repository_owner == 'pytorch' }} |  | ||||||
|     permissions: |  | ||||||
|       id-token: write |  | ||||||
|       contents: read |  | ||||||
|     needs: libtorch-rocm7_0-shared-with-deps-release-test |  | ||||||
|     with: |  | ||||||
|       PYTORCH_ROOT: /pytorch |  | ||||||
|       PACKAGE_TYPE: libtorch |  | ||||||
|       # TODO: This is a legacy variable that we eventually want to get rid of in |  | ||||||
|       #       favor of GPU_ARCH_VERSION |  | ||||||
|       DESIRED_CUDA: rocm7.0 |  | ||||||
|       GPU_ARCH_VERSION: "7.0" |  | ||||||
|       GPU_ARCH_TYPE: rocm |  | ||||||
|       DOCKER_IMAGE: libtorch-cxx11-builder |  | ||||||
|       DOCKER_IMAGE_TAG_PREFIX: rocm7.0 |  | ||||||
|       LIBTORCH_CONFIG: release |  | ||||||
|       LIBTORCH_VARIANT: shared-with-deps |  | ||||||
|       build_name: libtorch-rocm7_0-shared-with-deps-release |  | ||||||
|     secrets: |  | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |  | ||||||
|     uses: ./.github/workflows/_binary-upload.yml |  | ||||||
|  | |||||||
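The libtorch ROCm jobs above follow a fixed naming pattern: the build_name slug is the package type, the accelerator tag with dots replaced by underscores, the variant, and the config. A hypothetical helper, shown only to make that pattern explicit (the function and its use are illustrative, not part of the generated workflow):

    # Illustrative only: reproduce the build_name slug used by the libtorch ROCm jobs,
    # e.g. ("libtorch", "rocm6.3", "shared-with-deps", "release")
    #   -> "libtorch-rocm6_3-shared-with-deps-release".
    def build_name(package_type: str, desired_cuda: str, variant: str, config: str) -> str:
        accel = desired_cuda.replace(".", "_")  # rocm6.3 -> rocm6_3
        return f"{package_type}-{accel}-{variant}-{config}"

    assert build_name("libtorch", "rocm6.3", "shared-with-deps", "release") == \
        "libtorch-rocm6_3-shared-with-deps-release"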
							
								
								
									
24  .github/workflows/generated-linux-binary-manywheel-main.yml  (generated, vendored)
| @ -42,7 +42,7 @@ jobs: | |||||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} |       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} |       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||||
|       curr_ref_type: ${{ github.ref_type }} |       curr_ref_type: ${{ github.ref_type }} | ||||||
|   manywheel-py3_12-cuda13_0-build: |   manywheel-py3_12-cuda12_8-build: | ||||||
|     if: ${{ github.repository_owner == 'pytorch' }} |     if: ${{ github.repository_owner == 'pytorch' }} | ||||||
|     uses: ./.github/workflows/_binary-build-linux.yml |     uses: ./.github/workflows/_binary-build-linux.yml | ||||||
|     needs: get-label-type |     needs: get-label-type | ||||||
| @ -51,22 +51,22 @@ jobs: | |||||||
|       PACKAGE_TYPE: manywheel |       PACKAGE_TYPE: manywheel | ||||||
|       # TODO: This is a legacy variable that we eventually want to get rid of in |       # TODO: This is a legacy variable that we eventually want to get rid of in | ||||||
|       #       favor of GPU_ARCH_VERSION |       #       favor of GPU_ARCH_VERSION | ||||||
|       DESIRED_CUDA: cu130 |       DESIRED_CUDA: cu128 | ||||||
|       GPU_ARCH_VERSION: "13.0" |       GPU_ARCH_VERSION: "12.8" | ||||||
|       GPU_ARCH_TYPE: cuda |       GPU_ARCH_TYPE: cuda | ||||||
|       DOCKER_IMAGE: manylinux2_28-builder |       DOCKER_IMAGE: manylinux2_28-builder | ||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 | ||||||
|       DESIRED_PYTHON: "3.12" |       DESIRED_PYTHON: "3.12" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       build_name: manywheel-py3_12-cuda13_0 |       build_name: manywheel-py3_12-cuda12_8 | ||||||
|       build_environment: linux-binary-manywheel |       build_environment: linux-binary-manywheel | ||||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' |       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' | ||||||
|     secrets: |     secrets: | ||||||
|       github-token: ${{ secrets.GITHUB_TOKEN }} |       github-token: ${{ secrets.GITHUB_TOKEN }} | ||||||
|   manywheel-py3_12-cuda13_0-test:  # Testing |   manywheel-py3_12-cuda12_8-test:  # Testing | ||||||
|     if: ${{ github.repository_owner == 'pytorch' }} |     if: ${{ github.repository_owner == 'pytorch' }} | ||||||
|     needs: |     needs: | ||||||
|       - manywheel-py3_12-cuda13_0-build |       - manywheel-py3_12-cuda12_8-build | ||||||
|       - get-label-type |       - get-label-type | ||||||
|     uses: ./.github/workflows/_binary-test-linux.yml |     uses: ./.github/workflows/_binary-test-linux.yml | ||||||
|     with: |     with: | ||||||
| @ -74,13 +74,13 @@ jobs: | |||||||
|       PACKAGE_TYPE: manywheel |       PACKAGE_TYPE: manywheel | ||||||
|       # TODO: This is a legacy variable that we eventually want to get rid of in |       # TODO: This is a legacy variable that we eventually want to get rid of in | ||||||
|       #       favor of GPU_ARCH_VERSION |       #       favor of GPU_ARCH_VERSION | ||||||
|       DESIRED_CUDA: cu130 |       DESIRED_CUDA: cu128 | ||||||
|       GPU_ARCH_VERSION: "13.0" |       GPU_ARCH_VERSION: "12.8" | ||||||
|       GPU_ARCH_TYPE: cuda |       GPU_ARCH_TYPE: cuda | ||||||
|       DOCKER_IMAGE: manylinux2_28-builder |       DOCKER_IMAGE: manylinux2_28-builder | ||||||
|       DOCKER_IMAGE_TAG_PREFIX: cuda13.0 |       DOCKER_IMAGE_TAG_PREFIX: cuda12.8 | ||||||
|       DESIRED_PYTHON: "3.12" |       DESIRED_PYTHON: "3.12" | ||||||
|       build_name: manywheel-py3_12-cuda13_0 |       build_name: manywheel-py3_12-cuda12_8 | ||||||
|       build_environment: linux-binary-manywheel |       build_environment: linux-binary-manywheel | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner |       runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner | ||||||
|  | |||||||
.github/workflows/generated-linux-binary-manywheel-nightly.yml (generated, vendored): 1610 changed lines (file diff suppressed because it is too large)
.github/workflows/generated-linux-binary-manywheel-rocm-main.yml (generated, vendored): 1 changed line
| @ -60,7 +60,6 @@ jobs: | |||||||
|       DOCKER_IMAGE_TAG_PREFIX: rocm6.4 |       DOCKER_IMAGE_TAG_PREFIX: rocm6.4 | ||||||
|       DESIRED_PYTHON: "3.10" |       DESIRED_PYTHON: "3.10" | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       timeout-minutes: 300 |  | ||||||
|       build_name: manywheel-py3_10-rocm6_4 |       build_name: manywheel-py3_10-rocm6_4 | ||||||
|       build_environment: linux-binary-manywheel-rocm |       build_environment: linux-binary-manywheel-rocm | ||||||
|     secrets: |     secrets: | ||||||
|  | |||||||
.github/workflows/inductor-perf-test-nightly.yml (vendored): 19 changed lines
| @ -57,7 +57,7 @@ on: | |||||||
|         description: The list of configs used the benchmark |         description: The list of configs used the benchmark | ||||||
|         required: false |         required: false | ||||||
|         type: string |         type: string | ||||||
|         default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench |         default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,dynamo_eager_huggingface_perf,dynamo_eager_timm_perf,dynamo_eager_torchbench_perf,cachebench | ||||||
|  |  | ||||||
| concurrency: | concurrency: | ||||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} |   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} | ||||||
| @ -97,18 +97,35 @@ jobs: | |||||||
|           { config: "inductor_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" }, |           { config: "inductor_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" }, |           { config: "inductor_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" }, |           { config: "inductor_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_huggingface_perf", shard: 1, num_shards: 5, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_huggingface_perf", shard: 2, num_shards: 5, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" }, |           { config: "inductor_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" }, |           { config: "inductor_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" }, |           { config: "inductor_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" }, |           { config: "inductor_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" }, |           { config: "inductor_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" }, |           { config: "inductor_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" }, |           { config: "inductor_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" }, |           { config: "inductor_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" }, |           { config: "inductor_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" }, |           { config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" }, |           { config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|           { config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" }, |           { config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|  |           { config: "dynamo_eager_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" }, | ||||||
|           { config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" }, |           { config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" }, | ||||||
|           { config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" }, |           { config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" }, | ||||||
|         ]} |         ]} | ||||||
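For context, the `shard` / `num_shards` fields in the matrix above split each benchmark suite across identical runners. A minimal sketch of the deterministic round-robin partitioning this implies; the helper name is ours, and the real selection logic lives in the CI test scripts rather than in the workflow:

```python
# Illustrative only: how a 1-indexed shard might pick its slice of a benchmark
# suite when the matrix requests `num_shards` parallel jobs.
def select_for_shard(benchmarks, shard, num_shards):
    ordered = sorted(benchmarks)  # stable order so every shard agrees on the split
    return [b for i, b in enumerate(ordered) if i % num_shards == shard - 1]

models = ["bert", "gpt2", "resnet50", "t5", "vit", "yolo"]
print(select_for_shard(models, shard=1, num_shards=3))  # ['bert', 't5']
```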
|  | |||||||
.github/workflows/lint.yml (vendored): 8 changed lines
| @ -31,8 +31,6 @@ jobs: | |||||||
|     if: github.repository_owner == 'pytorch' |     if: github.repository_owner == 'pytorch' | ||||||
|     name: Get changed files |     name: Get changed files | ||||||
|     uses: ./.github/workflows/_get-changed-files.yml |     uses: ./.github/workflows/_get-changed-files.yml | ||||||
|     with: |  | ||||||
|       all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }} |  | ||||||
|  |  | ||||||
|   lintrunner-clang: |   lintrunner-clang: | ||||||
|     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main |     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main | ||||||
| @ -55,7 +53,7 @@ jobs: | |||||||
|     with: |     with: | ||||||
|       timeout: 120 |       timeout: 120 | ||||||
|       runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" |       runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" | ||||||
|       docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter |       docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter | ||||||
|       # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout |       # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout | ||||||
|       # to run git rev-parse HEAD~:.ci/docker when a new image is needed |       # to run git rev-parse HEAD~:.ci/docker when a new image is needed | ||||||
|       fetch-depth: 0 |       fetch-depth: 0 | ||||||
| @ -266,10 +264,10 @@ jobs: | |||||||
|         with: |         with: | ||||||
|           submodules: false |           submodules: false | ||||||
|           fetch-depth: 1 |           fetch-depth: 1 | ||||||
|       - name: Setup Python 3.10 |       - name: Setup Python 3.9 | ||||||
|         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 |         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 | ||||||
|         with: |         with: | ||||||
|           python-version: '3.10' |           python-version: '3.9' | ||||||
|           architecture: x64 |           architecture: x64 | ||||||
|           cache: pip |           cache: pip | ||||||
|       - name: Install dependencies |       - name: Install dependencies | ||||||
|  | |||||||
.github/workflows/operator_microbenchmark.yml (vendored): 46 changed lines
| @ -1,46 +0,0 @@ |  | ||||||
| name: operator_microbenchmark |  | ||||||
|  |  | ||||||
| on: |  | ||||||
|   push: |  | ||||||
|     tags: |  | ||||||
|       - ciflow/op-benchmark/* |  | ||||||
|   workflow_dispatch: |  | ||||||
|   schedule: |  | ||||||
|     # Run at 06:00 UTC everyday |  | ||||||
|     - cron: 0 6 * * * |  | ||||||
|  |  | ||||||
| concurrency: |  | ||||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} |  | ||||||
|   cancel-in-progress: true |  | ||||||
|  |  | ||||||
| permissions: |  | ||||||
|   id-token: write |  | ||||||
|   contents: read |  | ||||||
|  |  | ||||||
| jobs: |  | ||||||
|   opmicrobenchmark-build: |  | ||||||
|     if: github.repository_owner == 'pytorch' |  | ||||||
|     name: opmicrobenchmark-build |  | ||||||
|     uses: ./.github/workflows/_linux-build.yml |  | ||||||
|     with: |  | ||||||
|       runner: linux.12xlarge.memory |  | ||||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 |  | ||||||
|       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 |  | ||||||
|       cuda-arch-list: '8.0 9.0' |  | ||||||
|       test-matrix: | |  | ||||||
|         { include: [ |  | ||||||
|           { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" }, |  | ||||||
|           { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, |  | ||||||
|         ]} |  | ||||||
|     secrets: inherit |  | ||||||
|  |  | ||||||
|   opmicrobenchmark-test: |  | ||||||
|     name: opmicrobenchmark-test |  | ||||||
|     uses: ./.github/workflows/_linux-test.yml |  | ||||||
|     needs: opmicrobenchmark-build |  | ||||||
|     with: |  | ||||||
|       timeout-minutes: 500 |  | ||||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 |  | ||||||
|       docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }} |  | ||||||
|       test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }} |  | ||||||
|     secrets: inherit |  | ||||||
.github/workflows/pull.yml (vendored): 2 changed lines
| @ -127,8 +127,6 @@ jobs: | |||||||
|     uses: ./.github/workflows/_linux-build.yml |     uses: ./.github/workflows/_linux-build.yml | ||||||
|     needs: get-label-type |     needs: get-label-type | ||||||
|     with: |     with: | ||||||
|       # More memory is needed to build with asan |  | ||||||
|       runner: linux.2xlarge.memory |  | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       build-environment: linux-jammy-py3.10-clang18-asan |       build-environment: linux-jammy-py3.10-clang18-asan | ||||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan |       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan | ||||||
|  | |||||||
							
								
								
									
										54
									
								
								.github/workflows/quantization-periodic.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										54
									
								
								.github/workflows/quantization-periodic.yml
									
									
									
									
										vendored
									
									
								
							| @ -1,54 +0,0 @@ | |||||||
| name: quantization-periodic |  | ||||||
|  |  | ||||||
| on: |  | ||||||
|   push: |  | ||||||
|     tags: |  | ||||||
|       - ciflow/quantization-periodic/* |  | ||||||
|   workflow_dispatch: |  | ||||||
|   schedule: |  | ||||||
|     # run weekly |  | ||||||
|     - cron: "45 0 * * 0" |  | ||||||
|  |  | ||||||
| concurrency: |  | ||||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} |  | ||||||
|   cancel-in-progress: true |  | ||||||
|  |  | ||||||
| permissions: |  | ||||||
|   id-token: write |  | ||||||
|   contents: read |  | ||||||
|  |  | ||||||
| jobs: |  | ||||||
|   get-default-label-prefix: |  | ||||||
|     name: get-default-label-prefix |  | ||||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main |  | ||||||
|     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} |  | ||||||
|     with: |  | ||||||
|       triggering_actor: ${{ github.triggering_actor }} |  | ||||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} |  | ||||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} |  | ||||||
|       curr_ref_type: ${{ github.ref_type }} |  | ||||||
|       opt_out_experiments: lf |  | ||||||
|  |  | ||||||
|   periodic-quantization-build: |  | ||||||
|     name: periodic-quantization-build |  | ||||||
|     uses: ./.github/workflows/_linux-build.yml |  | ||||||
|     needs: get-default-label-prefix |  | ||||||
|     with: |  | ||||||
|       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" |  | ||||||
|       build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11 |  | ||||||
|       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 |  | ||||||
|       cuda-arch-list: '8.9' |  | ||||||
|       test-matrix: | |  | ||||||
|         { include: [ |  | ||||||
|           { config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, |  | ||||||
|         ]} |  | ||||||
|     secrets: inherit |  | ||||||
|   periodic-test-quantization: |  | ||||||
|     name: periodic-test-quantization |  | ||||||
|     uses: ./.github/workflows/_linux-test.yml |  | ||||||
|     needs: periodic-quantization-build |  | ||||||
|     with: |  | ||||||
|       build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11 |  | ||||||
|       docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }} |  | ||||||
|       test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }} |  | ||||||
|     secrets: inherit |  | ||||||
.github/workflows/slow.yml (vendored): 2 changed lines
| @ -140,8 +140,6 @@ jobs: | |||||||
|     uses: ./.github/workflows/_linux-build.yml |     uses: ./.github/workflows/_linux-build.yml | ||||||
|     needs: get-label-type |     needs: get-label-type | ||||||
|     with: |     with: | ||||||
|       # More memory is needed to build with asan |  | ||||||
|       runner: linux.2xlarge.memory |  | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|       build-environment: linux-jammy-py3.10-clang18-asan |       build-environment: linux-jammy-py3.10-clang18-asan | ||||||
|       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan |       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan | ||||||
|  | |||||||
							
								
								
									
										76
									
								
								.github/workflows/test-b200.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										76
									
								
								.github/workflows/test-b200.yml
									
									
									
									
										vendored
									
									
								
							| @ -1,76 +0,0 @@ | |||||||
| # B200 Smoke Tests CI Workflow |  | ||||||
| # |  | ||||||
| # This workflow runs smoke tests on B200 hardware |  | ||||||
| # |  | ||||||
| # Flow: |  | ||||||
| # 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200 |  | ||||||
| # 2. Runs smoke tests on linux.dgx.b200 runner |  | ||||||
| # 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function |  | ||||||
| # |  | ||||||
| # Triggered by: |  | ||||||
| # - Pull requests modifying this workflow file |  | ||||||
| # - Manual dispatch |  | ||||||
| # - Schedule (every 6 hours) |  | ||||||
| # - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag) |  | ||||||
|  |  | ||||||
| name: B200 Smoke Tests |  | ||||||
|  |  | ||||||
| on: |  | ||||||
|   pull_request: |  | ||||||
|     paths: |  | ||||||
|       - .github/workflows/test-b200.yml |  | ||||||
|   workflow_dispatch: |  | ||||||
|   schedule: |  | ||||||
|     - cron: 0 4,10,16,22 * * *  # every 6 hours |  | ||||||
|   push: |  | ||||||
|     tags: |  | ||||||
|       - ciflow/b200/* |  | ||||||
|  |  | ||||||
| concurrency: |  | ||||||
|   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} |  | ||||||
|   cancel-in-progress: true |  | ||||||
|  |  | ||||||
| permissions: |  | ||||||
|   id-token: write |  | ||||||
|   contents: read |  | ||||||
|  |  | ||||||
| jobs: |  | ||||||
|  |  | ||||||
|   get-label-type: |  | ||||||
|     if: github.repository_owner == 'pytorch' |  | ||||||
|     name: get-label-type |  | ||||||
|     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main |  | ||||||
|     with: |  | ||||||
|       triggering_actor: ${{ github.triggering_actor }} |  | ||||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} |  | ||||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} |  | ||||||
|       curr_ref_type: ${{ github.ref_type }} |  | ||||||
|  |  | ||||||
|   linux-jammy-cuda12_8-py3_10-gcc11-sm100-build: |  | ||||||
|     name: linux-jammy-cuda12.8-py3.10-gcc11-sm100 |  | ||||||
|     uses: ./.github/workflows/_linux-build.yml |  | ||||||
|     needs: get-label-type |  | ||||||
|     with: |  | ||||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" |  | ||||||
|       runner: linux.12xlarge.memory |  | ||||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 |  | ||||||
|       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 |  | ||||||
|       cuda-arch-list: '10.0' |  | ||||||
|       test-matrix: | |  | ||||||
|         { include: [ |  | ||||||
|           { config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, |  | ||||||
|         ]} |  | ||||||
|       # config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh |  | ||||||
|     secrets: inherit |  | ||||||
|  |  | ||||||
|   linux-jammy-cuda12_8-py3_10-gcc11-sm100-test: |  | ||||||
|     name: linux-jammy-cuda12.8-py3.10-gcc11-sm100 |  | ||||||
|     uses: ./.github/workflows/_linux-test.yml |  | ||||||
|     needs: |  | ||||||
|       - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build |  | ||||||
|     with: |  | ||||||
|       build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 |  | ||||||
|       docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }} |  | ||||||
|       test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }} |  | ||||||
|       aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only |  | ||||||
|     secrets: inherit |  | ||||||
.github/workflows/unstable.yml (vendored): 24 changed lines
| @ -53,3 +53,27 @@ jobs: | |||||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} |       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} |       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||||
|       curr_ref_type: ${{ github.ref_type }} |       curr_ref_type: ${{ github.ref_type }} | ||||||
|  |  | ||||||
|  |   linux-jammy-py3_9-clang9-xla-build: | ||||||
|  |     name: linux-jammy-py3_9-clang9-xla | ||||||
|  |     uses: ./.github/workflows/_linux-build.yml | ||||||
|  |     needs: get-label-type | ||||||
|  |     with: | ||||||
|  |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|  |       build-environment: linux-jammy-py3.9-clang9-xla | ||||||
|  |       docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite | ||||||
|  |       test-matrix: | | ||||||
|  |         { include: [ | ||||||
|  |           { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, | ||||||
|  |         ]} | ||||||
|  |     secrets: inherit | ||||||
|  |  | ||||||
|  |   linux-jammy-py3_9-clang9-xla-test: | ||||||
|  |     name: linux-jammy-py3_9-clang9-xla | ||||||
|  |     uses: ./.github/workflows/_linux-test.yml | ||||||
|  |     needs: linux-jammy-py3_9-clang9-xla-build | ||||||
|  |     with: | ||||||
|  |       build-environment: linux-jammy-py3.9-clang9-xla | ||||||
|  |       docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} | ||||||
|  |       test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} | ||||||
|  |     secrets: inherit | ||||||
|  | |||||||
.gitignore (vendored): 1 changed line
| @ -82,7 +82,6 @@ torch/return_types.pyi | |||||||
| torch/nn/functional.pyi | torch/nn/functional.pyi | ||||||
| torch/utils/data/datapipes/datapipe.pyi | torch/utils/data/datapipes/datapipe.pyi | ||||||
| torch/csrc/autograd/generated/* | torch/csrc/autograd/generated/* | ||||||
| torch/csrc/functionalization/generated/* |  | ||||||
| torch/csrc/lazy/generated/*.[!m]* | torch/csrc/lazy/generated/*.[!m]* | ||||||
| torch_compile_debug/ | torch_compile_debug/ | ||||||
| # Listed manually because some files in this directory are not generated | # Listed manually because some files in this directory are not generated | ||||||
|  | |||||||
| @ -49,7 +49,7 @@ init_command = [ | |||||||
|     'mccabe==0.7.0', |     'mccabe==0.7.0', | ||||||
|     'pycodestyle==2.14.0', |     'pycodestyle==2.14.0', | ||||||
|     'pyflakes==3.4.0', |     'pyflakes==3.4.0', | ||||||
|     'torchfix==0.4.0 ; python_version >= "3.10" and python_version < "3.13"', |     'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"', | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  |  | ||||||
| @ -153,7 +153,7 @@ init_command = [ | |||||||
|     'python3', |     'python3', | ||||||
|     'tools/linter/adapters/pip_init.py', |     'tools/linter/adapters/pip_init.py', | ||||||
|     '--dry-run={{DRYRUN}}', |     '--dry-run={{DRYRUN}}', | ||||||
|     'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"', |     'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"', | ||||||
|     'numpy==2.1.0 ; python_version >= "3.12"', |     'numpy==2.1.0 ; python_version >= "3.12"', | ||||||
|     'expecttest==0.3.0', |     'expecttest==0.3.0', | ||||||
|     'mypy==1.16.0', |     'mypy==1.16.0', | ||||||
| @ -196,7 +196,6 @@ exclude_patterns = [ | |||||||
|     'tools/test/gen_operators_yaml_test.py', |     'tools/test/gen_operators_yaml_test.py', | ||||||
|     'tools/test/gen_oplist_test.py', |     'tools/test/gen_oplist_test.py', | ||||||
|     'tools/test/test_selective_build.py', |     'tools/test/test_selective_build.py', | ||||||
|     'tools/experimental/dynamic_shapes/torchfuzz/**', |  | ||||||
| ] | ] | ||||||
| command = [ | command = [ | ||||||
|     'python3', |     'python3', | ||||||
| @ -1453,7 +1452,7 @@ init_command = [ | |||||||
|     '--dry-run={{DRYRUN}}', |     '--dry-run={{DRYRUN}}', | ||||||
|     'usort==1.0.8.post1', |     'usort==1.0.8.post1', | ||||||
|     'isort==6.0.1', |     'isort==6.0.1', | ||||||
|     'ruff==0.13.1',  # sync with RUFF |     'ruff==0.12.9',  # sync with RUFF | ||||||
| ] | ] | ||||||
| is_formatter = true | is_formatter = true | ||||||
|  |  | ||||||
| @ -1587,7 +1586,7 @@ init_command = [ | |||||||
|     'python3', |     'python3', | ||||||
|     'tools/linter/adapters/pip_init.py', |     'tools/linter/adapters/pip_init.py', | ||||||
|     '--dry-run={{DRYRUN}}', |     '--dry-run={{DRYRUN}}', | ||||||
|     'ruff==0.13.1',  # sync with PYFMT |     'ruff==0.12.9',  # sync with PYFMT | ||||||
| ] | ] | ||||||
| is_formatter = true | is_formatter = true | ||||||
|  |  | ||||||
BUILD.bazel: 34 changed lines
| @ -91,8 +91,6 @@ generated_cpu_cpp = [ | |||||||
|     "aten/src/ATen/NativeMetaFunctions.h", |     "aten/src/ATen/NativeMetaFunctions.h", | ||||||
|     "aten/src/ATen/RegistrationDeclarations.h", |     "aten/src/ATen/RegistrationDeclarations.h", | ||||||
|     "aten/src/ATen/VmapGeneratedPlumbing.h", |     "aten/src/ATen/VmapGeneratedPlumbing.h", | ||||||
|     "aten/src/ATen/ViewMetaClasses.h", |  | ||||||
|     "aten/src/ATen/ViewMetaClasses.cpp", |  | ||||||
|     "aten/src/ATen/core/aten_interned_strings.h", |     "aten/src/ATen/core/aten_interned_strings.h", | ||||||
|     "aten/src/ATen/core/enum_tag.h", |     "aten/src/ATen/core/enum_tag.h", | ||||||
|     "aten/src/ATen/core/TensorBody.h", |     "aten/src/ATen/core/TensorBody.h", | ||||||
| @ -835,6 +833,36 @@ pybind_extension( | |||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  | cc_library( | ||||||
|  |     name = "functorch", | ||||||
|  |     hdrs = glob([ | ||||||
|  |         "functorch/csrc/dim/*.h", | ||||||
|  |     ]), | ||||||
|  |     srcs = glob([ | ||||||
|  |         "functorch/csrc/dim/*.cpp", | ||||||
|  |     ]), | ||||||
|  |     deps = [ | ||||||
|  |         ":aten_nvrtc", | ||||||
|  |         ":torch_python", | ||||||
|  |         "@pybind11", | ||||||
|  |     ], | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | pybind_extension( | ||||||
|  |     name = "functorch/_C", | ||||||
|  |     copts=[ | ||||||
|  |         "-DTORCH_EXTENSION_NAME=_C" | ||||||
|  |     ], | ||||||
|  |     srcs = [ | ||||||
|  |         "functorch/csrc/init_dim_only.cpp", | ||||||
|  |     ], | ||||||
|  |     deps = [ | ||||||
|  |         ":functorch", | ||||||
|  |         ":torch_python", | ||||||
|  |         ":aten_nvrtc", | ||||||
|  |     ], | ||||||
|  | ) | ||||||
|  |  | ||||||
| cc_binary( | cc_binary( | ||||||
|     name = "torch/bin/torch_shm_manager", |     name = "torch/bin/torch_shm_manager", | ||||||
|     srcs = [ |     srcs = [ | ||||||
| @ -875,6 +903,7 @@ py_library( | |||||||
|     ], |     ], | ||||||
|     data = [ |     data = [ | ||||||
|         ":torch/_C.so", |         ":torch/_C.so", | ||||||
|  |         ":functorch/_C.so", | ||||||
|         ":torch/bin/torch_shm_manager", |         ":torch/bin/torch_shm_manager", | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| @ -1077,7 +1106,6 @@ test_suite( | |||||||
|         "aten/src/ATen/templates/LazyNonNativeIr.h", |         "aten/src/ATen/templates/LazyNonNativeIr.h", | ||||||
|         "aten/src/ATen/templates/RegisterDispatchKey.cpp", |         "aten/src/ATen/templates/RegisterDispatchKey.cpp", | ||||||
|         "aten/src/ATen/templates/RegisterDispatchDefinitions.ini", |         "aten/src/ATen/templates/RegisterDispatchDefinitions.ini", | ||||||
|         "aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp", |  | ||||||
|         "aten/src/ATen/native/native_functions.yaml", |         "aten/src/ATen/native/native_functions.yaml", | ||||||
|         "aten/src/ATen/native/tags.yaml", |         "aten/src/ATen/native/tags.yaml", | ||||||
|         "aten/src/ATen/native/ts_native_functions.yaml", |         "aten/src/ATen/native/ts_native_functions.yaml", | ||||||
|  | |||||||
| @ -1,4 +1,5 @@ | |||||||
| cmake_minimum_required(VERSION 3.27 FATAL_ERROR) | cmake_minimum_required(VERSION 3.27 FATAL_ERROR) | ||||||
|  | # cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW) | ||||||
|  |  | ||||||
| # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this | # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this | ||||||
| # sometimes makes XCode C compiler gets detected as "Clang", even when the C++ | # sometimes makes XCode C compiler gets detected as "Clang", even when the C++ | ||||||
| @ -442,7 +443,7 @@ if(WIN32) | |||||||
|       message( |       message( | ||||||
|         WARNING |         WARNING | ||||||
|           "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. " |           "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. " | ||||||
|           "Please run command 'conda install -c conda-forge libuv=1.51' to install libuv." |           "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv." | ||||||
|       ) |       ) | ||||||
|     else() |     else() | ||||||
|       set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../) |       set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../) | ||||||
| @ -1390,6 +1391,10 @@ endif() | |||||||
| include(cmake/Summary.cmake) | include(cmake/Summary.cmake) | ||||||
| caffe2_print_configuration_summary() | caffe2_print_configuration_summary() | ||||||
|  |  | ||||||
|  | if(BUILD_FUNCTORCH) | ||||||
|  |   add_subdirectory(functorch) | ||||||
|  | endif() | ||||||
|  |  | ||||||
| # Parse custom debug info | # Parse custom debug info | ||||||
| if(DEFINED USE_CUSTOM_DEBINFO) | if(DEFINED USE_CUSTOM_DEBINFO) | ||||||
|   string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") |   string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") | ||||||
| @ -1481,4 +1486,4 @@ else() | |||||||
|     To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 |     To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 | ||||||
|     ]]) |     ]]) | ||||||
|   endif() |   endif() | ||||||
| endif() | endif() | ||||||
MANIFEST.in: 105 changed lines
| @ -1,61 +1,20 @@ | |||||||
| # Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html | # Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html | ||||||
|  |  | ||||||
| # Include individual top-level files | # Include source files in SDist | ||||||
| include CITATION.cff | include CMakeLists.txt | ||||||
| include CODEOWNERS | include *.bzl *.bazel .bazel* BUILD *.BUILD BUILD.* WORKSPACE | ||||||
| include Dockerfile | include BUCK BUCK.* | ||||||
| include LICENSE | include requirements*.txt | ||||||
| include MANIFEST.in | include version.txt | ||||||
| include Makefile | include [Mm]akefile *.[Mm]akefile [Mm]akefile.* | ||||||
| include NOTICE | include [Dd]ockerfile *.[Dd]ockerfile [Dd]ockerfile.* .dockerignore | ||||||
| include .bc-linter.yml |  | ||||||
| include .clang-format .clang-tidy |  | ||||||
| include .cmakelintrc |  | ||||||
| include .coveragerc |  | ||||||
| include .dockerignore |  | ||||||
| include .editorconfig |  | ||||||
| include .flake8 |  | ||||||
| include .gdbinit |  | ||||||
| include .lintrunner.toml |  | ||||||
| include .lldbinit |  | ||||||
| include codex_setup.sh |  | ||||||
| include docker.Makefile |  | ||||||
| include pyrefly.toml |  | ||||||
| include ubsan.supp |  | ||||||
|  |  | ||||||
| # Include bazel and BUCK related files |  | ||||||
| include BUILD.bazel BUCK.oss |  | ||||||
| include WORKSPACE |  | ||||||
| include *.bzl |  | ||||||
| include .bazelignore .bazelrc .bazelversion |  | ||||||
|  |  | ||||||
| # Include general configuration files |  | ||||||
| include *.ini |  | ||||||
| # Include important top-level information |  | ||||||
| include *.md |  | ||||||
| # Include technical text files at the moment, comprises |  | ||||||
| # version.txt, CMakeLists.txt, requirements.txt |  | ||||||
| include *.txt |  | ||||||
|  |  | ||||||
| # Include ctags configuration |  | ||||||
| include .ctags.d/*.ctags |  | ||||||
|  |  | ||||||
| # Include subfolders completely |  | ||||||
| graft .devcontainer |  | ||||||
| graft .vscode |  | ||||||
| graft android | graft android | ||||||
| graft aten | graft aten | ||||||
| graft benchmarks |  | ||||||
| graft binaries | graft binaries | ||||||
| graft c10 | graft c10 | ||||||
| graft caffe2 | graft caffe2 | ||||||
| graft cmake | graft cmake | ||||||
| graft docs |  | ||||||
| graft functorch | graft functorch | ||||||
| graft ios |  | ||||||
| graft mypy_plugins |  | ||||||
| graft scripts |  | ||||||
| graft test |  | ||||||
| graft third_party | graft third_party | ||||||
| graft tools | graft tools | ||||||
| graft torch | graft torch | ||||||
| @ -63,37 +22,29 @@ graft torchgen | |||||||
| # FIXME: torch-xla build during codegen will fail if include this file in wheel | # FIXME: torch-xla build during codegen will fail if include this file in wheel | ||||||
| exclude torchgen/BUILD.bazel | exclude torchgen/BUILD.bazel | ||||||
|  |  | ||||||
| # The following exclusions omit parts from third-party dependencies that | # Misc files and directories in SDist | ||||||
| # contain invalid symlinks[1] and that are not needed for pytorch, such as | include *.md | ||||||
| # bindings for unused languages | include CITATION.cff | ||||||
| prune third_party/flatbuffers/java | include LICENSE NOTICE | ||||||
| prune third_party/flatbuffers/kotlin | include mypy*.ini | ||||||
| prune third_party/ittapi/rust | graft benchmarks | ||||||
| prune third_party/nccl/pkg/debian | graft docs | ||||||
| prune third_party/opentelemetry-cpp/third_party/prometheus-cpp/cmake/project-import-* | graft mypy_plugins | ||||||
|  | graft scripts | ||||||
| # The following document is also an invalid symlink[1] and superfluous |  | ||||||
| exclude third_party/flatbuffers/docs/source/CONTRIBUTING.md |  | ||||||
|  |  | ||||||
| # Omit autogenerated code |  | ||||||
| prune torchgen/packaged |  | ||||||
|  |  | ||||||
| # Omit caches, compiled, and scm related content |  | ||||||
| prune */__pycache__ |  | ||||||
| prune **/.github |  | ||||||
| prune **/.gitlab |  | ||||||
| global-exclude *.o *.obj *.so *.dylib *.a *.pxd *.dll *.lib |  | ||||||
| global-exclude *.py[cod] *.swp *~ |  | ||||||
| global-exclude .git .git-blame-ignore-revs .gitattributes .gitignore .gitmodules |  | ||||||
| global-exclude .gitlab-ci.yml |  | ||||||
|  |  | ||||||
| # Misc files needed for custom setuptools command | # Misc files needed for custom setuptools command | ||||||
| include .gitignore | include .gitignore | ||||||
| include .gitmodules | include .gitmodules | ||||||
|  |  | ||||||
| # [1] Invalid symlinks for the purposes of Python source distributions are, | # Include test suites in SDist | ||||||
| # according to the source distribution format[2] links pointing outside the | graft test | ||||||
| # destination directory or links with a `..` component, which is those of | include pytest.ini | ||||||
| # concern here. | include .coveragerc | ||||||
|  |  | ||||||
| # [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features | # Prune generated/compiled files | ||||||
|  | prune torchgen/packaged | ||||||
|  | prune */__pycache__ | ||||||
|  | global-exclude *.o *.obj *.so *.a *.dylib *.pxd *.dll *.lib *.py[cod] | ||||||
|  |  | ||||||
|  | prune */.git | ||||||
|  | global-exclude .git *~ *.swp | ||||||
|  | |||||||
| @ -161,7 +161,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) | |||||||
|  |  | ||||||
| #### Prerequisites | #### Prerequisites | ||||||
| If you are installing from source, you will need: | If you are installing from source, you will need: | ||||||
| - Python 3.10 or later | - Python 3.9 or later | ||||||
| - A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux) | - A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux) | ||||||
| - Visual Studio or Visual Studio Build Tool (Windows only) | - Visual Studio or Visual Studio Build Tool (Windows only) | ||||||
|  |  | ||||||
| @ -275,7 +275,7 @@ conda install pkg-config libuv | |||||||
| pip install mkl-static mkl-include | pip install mkl-static mkl-include | ||||||
| # Add these packages if torch.distributed is needed. | # Add these packages if torch.distributed is needed. | ||||||
| # Distributed package support on Windows is a prototype feature and is subject to changes. | # Distributed package support on Windows is a prototype feature and is subject to changes. | ||||||
| conda install -c conda-forge libuv=1.51 | conda install -c conda-forge libuv | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| #### Install PyTorch | #### Install PyTorch | ||||||
|  | |||||||
| @ -317,20 +317,10 @@ IF(USE_FBGEMM_GENAI) | |||||||
|         -greedy-reverse-local-assignment=1 |         -greedy-reverse-local-assignment=1 | ||||||
|         -fhip-new-launch-api) |         -fhip-new-launch-api) | ||||||
|  |  | ||||||
|       # Only compile for gfx942 for now. |  | ||||||
|       # This is rather hacky, I could not figure out a clean solution :( |  | ||||||
|       set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS}) |  | ||||||
|       string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}") |  | ||||||
|       if("gfx942" IN_LIST PYTORCH_ROCM_ARCH) |  | ||||||
|         list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;) |  | ||||||
|       endif() |  | ||||||
|       set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS}) |  | ||||||
|  |  | ||||||
|       hip_add_library( |       hip_add_library( | ||||||
|         fbgemm_genai STATIC |         fbgemm_genai STATIC | ||||||
|         ${fbgemm_genai_native_rocm_hip} |         ${fbgemm_genai_native_rocm_hip} | ||||||
|         HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) |         HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) | ||||||
|       set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL}) |  | ||||||
|       set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) |       set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) | ||||||
|       target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) |       target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) | ||||||
|  |  | ||||||
|  | |||||||
| @ -401,13 +401,30 @@ T* toDLPackImpl(const Tensor& src) { | |||||||
|   // The following code detects whether the src follows |   // The following code detects whether the src follows | ||||||
|   // a continuous pattern. If the src follows such pattern (common-case) |   // a continuous pattern. If the src follows such pattern (common-case) | ||||||
|   // then we do not need to normalize the strides. |   // then we do not need to normalize the strides. | ||||||
|   bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1; |   bool need_normalize_strides = false; | ||||||
|  |   int64_t expected_stride = 1; | ||||||
|  |   for (int i = src.dim() - 1; i >= 0; i--) { | ||||||
|  |     // detect if we do not meet continuous pattern | ||||||
|  |     // and the size is 1, so there is opportunity to normalize | ||||||
|  |     if (src.stride(i) != expected_stride && src.size(i) == 1) { | ||||||
|  |       need_normalize_strides = true; | ||||||
|  |       break; | ||||||
|  |     } | ||||||
|  |     expected_stride *= src.size(i); | ||||||
|  |   } | ||||||
|  |  | ||||||
|   // less common case, try normalizing the strides |   // less common case, try normalizing the strides | ||||||
|   if (need_normalize_strides) { |   if (need_normalize_strides) { | ||||||
|     // create a new tensor with possibly normalized strides |     // create a new tensor with possibly normalized strides | ||||||
|     // gh-83069 |     // gh-83069 | ||||||
|     auto shape = src.sizes(); |     auto shape = src.sizes(); | ||||||
|     view = src.as_strided(shape, {1}, src.storage_offset()); |     auto strides = src.strides().vec(); | ||||||
|  |     for (int i = 0; i < src.dim(); i++) { | ||||||
|  |       if (shape[i] < 2) { | ||||||
|  |         strides[i] = 1; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     view = src.as_strided(shape, strides, src.storage_offset()); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>); |   ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>); | ||||||
|  | |||||||
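The stride-normalization hunk above generalizes the old single-element special case: a size-1 dimension addresses the same element no matter what stride it carries, so its stride can be rewritten to 1 before exporting to DLPack (gh-83069). A rough Python sketch of the same check, assuming a torch tensor; the helper name is ours, not part of the ATen API:

```python
import torch

def normalize_strides_for_dlpack(src: torch.Tensor) -> torch.Tensor:
    # Walk dims from innermost to outermost and flag tensors where a size-1
    # dimension carries a stride that breaks the contiguous pattern.
    need_normalize, expected_stride = False, 1
    for i in range(src.dim() - 1, -1, -1):
        if src.stride(i) != expected_stride and src.size(i) == 1:
            need_normalize = True
            break
        expected_stride *= src.size(i)
    if not need_normalize:
        return src
    # Rewrite strides of size-1 dims to 1 and keep every other stride as-is.
    strides = [1 if size < 2 else stride for size, stride in zip(src.shape, src.stride())]
    return src.as_strided(src.shape, strides, src.storage_offset())

v = torch.arange(12.).reshape(3, 4)[1:2, ::2]    # shape (1, 2), strides (4, 2)
print(normalize_strides_for_dlpack(v).stride())  # (1, 2): same values, normalized strides
```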
| @ -468,7 +468,7 @@ inline Tensor _sum_to( | |||||||
|       // if we assume no reduction due to unbacked we ensure that at runtime. |       // if we assume no reduction due to unbacked we ensure that at runtime. | ||||||
|       TORCH_MAYBE_SYM_CHECK( |       TORCH_MAYBE_SYM_CHECK( | ||||||
|           sym_eq(shape[i - leading_dims], sizes[i]), |           sym_eq(shape[i - leading_dims], sizes[i]), | ||||||
|           "non-reduction path was assumed due to unbacked symbols expected those two sizes to be the same:", |           "non-reduction path was assumed due to unabcked symbols expected those two sizes to be the same:", | ||||||
|           shape[i - leading_dims], |           shape[i - leading_dims], | ||||||
|           ", ", |           ", ", | ||||||
|           sizes[i]) |           sizes[i]) | ||||||
|  | |||||||
| @ -9,6 +9,11 @@ | |||||||
|  |  | ||||||
| namespace at::functionalization { | namespace at::functionalization { | ||||||
|  |  | ||||||
|  | ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { | ||||||
|  |   if (out_idx == this->out_index) return *this; | ||||||
|  |   return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx); | ||||||
|  | } | ||||||
|  |  | ||||||
| // Note [Functionalization: Alias Removal Part 2] | // Note [Functionalization: Alias Removal Part 2] | ||||||
| // See Note [Functionalization: Alias Removal] for more details. | // See Note [Functionalization: Alias Removal] for more details. | ||||||
| // This function applies a single update from one of the views to the StorageImpl. | // This function applies a single update from one of the views to the StorageImpl. | ||||||
| @ -37,12 +42,12 @@ namespace at::functionalization { | |||||||
| static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) { | static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) { | ||||||
|   at::Tensor t = update.new_val; |   at::Tensor t = update.new_val; | ||||||
|   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); |   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); | ||||||
|   if (update.view_metas.empty()) { return t; } |   if (update.view_metas.empty()) return t; | ||||||
|  |  | ||||||
|   std::vector<at::Tensor> tmp_values({base}); |   std::vector<at::Tensor> tmp_values({base}); | ||||||
|   tmp_values.reserve(update.view_metas.size()); |   tmp_values.reserve(update.view_metas.size()); | ||||||
|   for (size_t i = 0; i < update.view_metas.size() - 1; ++i) { |   for (size_t i = 0; i < update.view_metas.size() - 1; ++i) { | ||||||
|     at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back()); |     at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index); | ||||||
|     // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided |     // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided | ||||||
|     // All of these ops require additional information to recover the sizes of the original tensor. |     // All of these ops require additional information to recover the sizes of the original tensor. | ||||||
|     // If need to, we could probably apply this optimization and only bother computing tmp_values |     // If need to, we could probably apply this optimization and only bother computing tmp_values | ||||||
| @ -50,8 +55,9 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co | |||||||
|     tmp_values.push_back(std::move(next_view)); |     tmp_values.push_back(std::move(next_view)); | ||||||
|   } |   } | ||||||
|   for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) { |   for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) { | ||||||
|  |     int64_t out_idx = update.view_metas[i].out_index; | ||||||
|     // Each view inverse is implemented in ViewInverses.cpp. |     // Each view inverse is implemented in ViewInverses.cpp. | ||||||
|     t = update.view_metas[i]->reverse(tmp_values[i], t); |     t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx); | ||||||
|   } |   } | ||||||
|   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); |   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); | ||||||
|   return t; |   return t; | ||||||
| @ -105,13 +111,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) | |||||||
|   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_)); |   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_)); | ||||||
| } | } | ||||||
|  |  | ||||||
| void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<std::shared_ptr<ViewMeta>>& metas) { | void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) { | ||||||
|   TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage"); |   TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage"); | ||||||
|  |  | ||||||
|   if (metas.size() > 1) { |   if (metas.size() > 1) { | ||||||
|     for (size_t i = 1; i < metas.size(); ++i) { |     for (size_t i = 1; i < metas.size(); ++i) { | ||||||
|       // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI |       // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI | ||||||
|       TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided, |       TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided, | ||||||
| "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i, | "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i, | ||||||
| " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today," | " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today," | ||||||
| "so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you " | "so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you " | ||||||
|  | |||||||
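For reference, `apply_update` above replays the stored view chain forward to rebuild every intermediate view, then walks it backwards through the view inverses so that a mutation made on the last view lands back in the base storage. A minimal standalone sketch of that round trip, using plain (forward, reverse) callables in place of `ViewMeta`; the names are illustrative, not the ATen API:

```python
import torch

def apply_update(base, view_metas, new_val):
    # view_metas: a chain of (forward, reverse) pairs; new_val: the mutated
    # value of the last view in the chain.
    # Replay the chain forward to recover each intermediate view of `base`...
    tmp_values = [base]
    for forward, _ in view_metas[:-1]:
        tmp_values.append(forward(tmp_values[-1]))
    # ...then walk backwards, pushing the update through each view's inverse.
    t = new_val
    for (_, reverse), prev in zip(reversed(view_metas), reversed(tmp_values)):
        t = reverse(prev, t)
    return t  # the base with the mutation applied

base = torch.zeros(2, 3)
chain = [
    (lambda b: b.view(6), lambda b, mv: mv.view(b.shape)),               # x = base.view(6)
    (lambda b: b[2:4],    lambda b, mv: torch.cat([b[:2], mv, b[4:]])),  # y = x[2:4]
]
print(apply_update(base, chain, torch.tensor([1., 2.])))
# tensor([[0., 0., 1.],
#         [2., 0., 0.]])
```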
| @ -8,89 +8,44 @@ namespace at::functionalization { | |||||||
|  |  | ||||||
| // See Note [Functionalization Pass In Core] | // See Note [Functionalization Pass In Core] | ||||||
|  |  | ||||||
| enum class InverseReturnMode { |  | ||||||
|   /// Specifies that functional inverses should always return a view. |  | ||||||
|   AlwaysView, |  | ||||||
|   /// Specifies that functional inverses should always return a non-view / copy. |  | ||||||
|   NeverView, |  | ||||||
|   /// Specifies that functional inverses should return a view unless a (copying) |  | ||||||
|   /// scatter |  | ||||||
|   /// inverse exists, in which case that will be used instead. |  | ||||||
|   /// This avoids as_strided() calls that can be difficult for subclasses to |  | ||||||
|   /// handle. |  | ||||||
|   ViewOrScatterInverse, |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| #define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \ |  | ||||||
|   static const char* name() {                 \ |  | ||||||
|     return #TYPE;                             \ |  | ||||||
|   } |  | ||||||
|  |  | ||||||
| #define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \ |  | ||||||
|   using SerializableTuple = std::tuple<__VA_ARGS__> |  | ||||||
|  |  | ||||||
| // ViewMeta is a class used by the functionalization pass to navigate between | // ViewMeta is a class used by the functionalization pass to navigate between | ||||||
| // a base tensor and a view tensor. | // a base tensor and a view tensor. | ||||||
| // For example, if I call `b = a.view1(...)` | // For example, if I call `b = a.view1(...)` | ||||||
| // the functionalization pass will generate and store a ViewMeta specialization | // the functionalization pass will generate and store a ViewMeta on b that looks | ||||||
| // for `view1` operation on b that looks like: | // like: | ||||||
| // | // | ||||||
| // struct TORCH_API view1_ViewMeta : public ViewMeta { | // ViewMeta( | ||||||
| //   FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta); | //   [<captures>](const Tensor& base, int64_t mutated_view_idx) { | ||||||
| //   FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( | //     return base.view1(...); | ||||||
| //       bool /* reapply_views */, | //   }, | ||||||
| //       const std::vector<int64_t>&); | //   [<captures>](const at::Tensor& base, const at::Tensor& mutated_view, | ||||||
| // | //   int64_t mutated_view_idx) -> at::Tensor { | ||||||
| //   view1_ViewMeta(const SerializableTuple& tpl) | //     return at::functionalization::impl::view1_inverse(base, mutated_view, | ||||||
| //       : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} | //     ...); | ||||||
| // |  | ||||||
| //   view1_ViewMeta(bool reapply_views, const std::vector<int64_t>& size) |  | ||||||
| //       : ViewMeta(/*has_symbolic_inputs=*/false), |  | ||||||
| //         reapply_views(reapply_views), |  | ||||||
| //         size(size) {} |  | ||||||
| // |  | ||||||
| //   Tensor forward(const Tensor& base) override { |  | ||||||
| //       return base.view1(...); |  | ||||||
| //   } | //   } | ||||||
| // | // | ||||||
| //   Tensor reverse(const Tensor& base, const Tensor& mutated_view) override { | // The forward_fn lambda describes how to replay view1 on a tensor. | ||||||
| //       return at::functionalization::impl::view1_inverse(base, mutated_view, |  | ||||||
| //       ...); |  | ||||||
| //   } |  | ||||||
| // | // | ||||||
| //   SerializableTuple to_serializable_tuple() { | // The reverse_fn lambda describes, given a tensor that is already a view, | ||||||
| //     return std::make_tuple(reapply_views, size); |  | ||||||
| //   } |  | ||||||
| // |  | ||||||
| //   bool reapply_views; |  | ||||||
| //   std::vector<int64_t> size; |  | ||||||
| // }; |  | ||||||
| // |  | ||||||
| // The forward function describes how to replay view1 on a tensor. |  | ||||||
| // |  | ||||||
| // The reverse function describes, given a tensor that is already a view, |  | ||||||
| // how to get the corresponding base tensor. See Note [Functionalization Pass: | // how to get the corresponding base tensor. See Note [Functionalization Pass: | ||||||
| // View Inverses] for details. | // View Inverses] for details. | ||||||
| // |  | ||||||
| // `SerializedTuple` is a typedef that defines an `std::tuple<...>` type |  | ||||||
| // representing the `ViewMeta` instance state. Methods that take in/return such |  | ||||||
| // a type are used for supporting pickle serialization. |  | ||||||
| struct ViewMeta { | struct ViewMeta { | ||||||
|   ViewMeta( |   ViewMeta( | ||||||
|  |       std::function<Tensor(const Tensor&, int64_t)> forward, | ||||||
|  |       std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse, | ||||||
|       bool has_symbolic_inputs, |       bool has_symbolic_inputs, | ||||||
|       bool is_multi_output = false, |       bool is_multi_output = false, | ||||||
|       bool is_as_strided = false, |       bool is_as_strided = false, | ||||||
|       int64_t out_idx = 0) |       int64_t out_idx = 0) | ||||||
|       : out_index(out_idx), |       : forward_fn(std::move(forward)), | ||||||
|  |         reverse_fn(std::move(reverse)), | ||||||
|  |         out_index(out_idx), | ||||||
|         is_multi_output(is_multi_output), |         is_multi_output(is_multi_output), | ||||||
|         is_as_strided(is_as_strided), |         is_as_strided(is_as_strided), | ||||||
|         has_symbolic_inputs(has_symbolic_inputs) {} |         has_symbolic_inputs(has_symbolic_inputs) {} | ||||||
|  |  | ||||||
|   virtual ~ViewMeta() = default; |   std::function<Tensor(const Tensor&, int64_t)> forward_fn; | ||||||
|  |   std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn; | ||||||
|   virtual Tensor forward(const Tensor& base) = 0; |  | ||||||
|   virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0; |  | ||||||
|  |  | ||||||
|   // See Note [out_idx in ViewMeta] |   // See Note [out_idx in ViewMeta] | ||||||
|   int64_t out_index; |   int64_t out_index; | ||||||
|  |  | ||||||
| @ -102,17 +57,10 @@ struct ViewMeta { | |||||||
|   // Tells us if this view operation has any symbolic inputs |   // Tells us if this view operation has any symbolic inputs | ||||||
|   bool has_symbolic_inputs; |   bool has_symbolic_inputs; | ||||||
|  |  | ||||||
|   // Returns a new ViewMeta with the same forward/reverse |   // Returns a copy of the current ViewMeta, if out_idx matches the current | ||||||
|  |   // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse | ||||||
|   // functions, but a new out index. |   // functions, but a new out index. | ||||||
|   // |   ViewMeta to_out_idx(int64_t out_idx); | ||||||
|   // This method should be implemented by those `ViewMeta` that have more than |  | ||||||
|   // one output. |  | ||||||
|   virtual std::shared_ptr<ViewMeta> to_out_index(int64_t out_index) { |  | ||||||
|     TORCH_CHECK_NOT_IMPLEMENTED( |  | ||||||
|         false, |  | ||||||
|         "ViewMeta::to_out_index not implemented. ", |  | ||||||
|         "Likely because there's only one output."); |  | ||||||
|   } |  | ||||||
| }; | }; | ||||||
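For concreteness, a minimal sketch of building one of these lambda-based ViewMeta objects by hand, assuming the constructor shown above; the choice of select as the view op and at::select_scatter as its inverse is purely illustrative and not part of this patch:

#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

// Hypothetical helper, for illustration only.
at::functionalization::ViewMeta make_select_view_meta(int64_t dim, int64_t index) {
  return at::functionalization::ViewMeta(
      // forward_fn: replay the view on a (possibly regenerated) base.
      [dim, index](const at::Tensor& base, int64_t /*mutated_view_idx*/) -> at::Tensor {
        return base.select(dim, index);
      },
      // reverse_fn: scatter a mutated view back into its base.
      [dim, index](const at::Tensor& base, const at::Tensor& mutated_view,
                   int64_t /*mutated_view_idx*/) -> at::Tensor {
        return at::select_scatter(base, mutated_view, dim, index);
      },
      /*has_symbolic_inputs=*/false);
}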
|  |  | ||||||
| // FunctionalStorageImpl is a subclass of StorageImpl used by the | // FunctionalStorageImpl is a subclass of StorageImpl used by the | ||||||
| @ -145,14 +93,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { | |||||||
|     // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) |     // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) | ||||||
|     const at::Tensor new_val; |     const at::Tensor new_val; | ||||||
|     // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) |     // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) | ||||||
|     const std::vector<std::shared_ptr<ViewMeta>> view_metas; |     const std::vector<ViewMeta> view_metas; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   explicit FunctionalStorageImpl(const Tensor& value); |   explicit FunctionalStorageImpl(const Tensor& value); | ||||||
|  |  | ||||||
|   void add_update( |   void add_update( | ||||||
|       const Tensor& updated_val, |       const Tensor& updated_val, | ||||||
|       const std::vector<std::shared_ptr<ViewMeta>>& view_metas); |       const std::vector<ViewMeta>& view_metas); | ||||||
|   bool apply_updates(); |   bool apply_updates(); | ||||||
|   const Tensor& base() { |   const Tensor& base() { | ||||||
|     return base_; |     return base_; | ||||||
|  | |||||||
| @ -129,19 +129,17 @@ void FunctionalTensorWrapper::freeze_storage() const { | |||||||
| // - view_value: The output tensor that we need to wrap. | // - view_value: The output tensor that we need to wrap. | ||||||
| // - base: The "base" of the view that `view_value` was generated from. | // - base: The "base" of the view that `view_value` was generated from. | ||||||
| // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. | // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. | ||||||
| FunctionalTensorWrapper::FunctionalTensorWrapper( | FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta) | ||||||
|     const Tensor& view_value, |   : c10::TensorImpl( | ||||||
|     const FunctionalTensorWrapper* base, |       c10::DispatchKeySet(DispatchKey::Functionalize), | ||||||
|     const std::shared_ptr<functionalization::ViewMeta>& meta) |       view_value.dtype(), | ||||||
|     : c10::TensorImpl( |       base->storage().data_ptr().device() | ||||||
|           c10::DispatchKeySet(DispatchKey::Functionalize), |     ), | ||||||
|           view_value.dtype(), |     value_(view_value), | ||||||
|           base->storage().data_ptr().device()), |     is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output), | ||||||
|       value_(view_value), |     was_storage_changed_(base->was_storage_changed_), | ||||||
|       is_multi_output_view_( |     is_symbolic_(base->is_symbolic_) | ||||||
|           base->is_multi_output_view_ || meta->is_multi_output), | { | ||||||
|       was_storage_changed_(base->was_storage_changed_), |  | ||||||
|       is_symbolic_(base->is_symbolic_) { |  | ||||||
|   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_)); |   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_)); | ||||||
|   TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); |   TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); | ||||||
|   set_constructor_metadata(); |   set_constructor_metadata(); | ||||||
| @ -150,10 +148,11 @@ FunctionalTensorWrapper::FunctionalTensorWrapper( | |||||||
|       view_metas_ = base->view_metas_;  // copy |       view_metas_ = base->view_metas_;  // copy | ||||||
|   } |   } | ||||||
|   view_metas_.push_back(meta); |   view_metas_.push_back(meta); | ||||||
|   maybe_mark_symbolic(meta.get()); |   maybe_mark_symbolic(meta); | ||||||
|   storage_ = base->storage_; // alias this tensor's storage with the base tensor's |   storage_ = base->storage_; // alias this tensor's storage with the base tensor's | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const { | functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const { | ||||||
|   return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl()); |   return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl()); | ||||||
| } | } | ||||||
| @ -177,18 +176,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const { | |||||||
| } | } | ||||||
|  |  | ||||||
| // See Note [Functionalization Pass - Inplace View Ops] | // See Note [Functionalization Pass - Inplace View Ops] | ||||||
| void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr<at::functionalization::ViewMeta>& meta) { | void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) { | ||||||
|   view_metas_.push_back(meta); |   view_metas_.push_back(meta); | ||||||
|   // Manually track the fact that this tensor received a metadata mutation! |   // Manually track the fact that this tensor received a metadata mutation! | ||||||
|   has_metadata_mutation_ = true; |   has_metadata_mutation_ = true; | ||||||
|   // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. |   // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. | ||||||
|   maybe_mark_symbolic(meta.get()); |   maybe_mark_symbolic(meta); | ||||||
|   // Note [Functionalization Pass - Inplace View Ops] |   // Note [Functionalization Pass - Inplace View Ops] | ||||||
|   // So, these ops are special - they're mutation AND view ops. They get special codegen. |   // So, these ops are special - they're mutation AND view ops. They get special codegen. | ||||||
|   // An example is transpose_, e.g. `a.transpose_()` |   // An example is transpose_, e.g. `a.transpose_()` | ||||||
|   // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas. |   // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas. | ||||||
|   at::AutoDispatchSkipFunctionalize guard; |   at::AutoDispatchSkipFunctionalize guard; | ||||||
|   value_ = meta->forward(value_); |   value_ = meta.forward_fn(value_, meta.out_index); | ||||||
|   TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); |   TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); | ||||||
| } | } | ||||||
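A rough usage sketch of mutate_view_meta, assuming the lambda-based ViewMeta constructor from this patch and using transpose as an example of a self-inverse inplace view; the real generated kernels are more involved than this:

#include <ATen/ATen.h>
#include <ATen/FunctionalTensorWrapper.h>

// Illustrative only; not the codegen'd transpose_ kernel.
static void functionalize_transpose_(const at::Tensor& self, int64_t dim0, int64_t dim1) {
  at::functionalization::ViewMeta meta(
      [dim0, dim1](const at::Tensor& base, int64_t /*idx*/) -> at::Tensor {
        return base.transpose(dim0, dim1);            // replay the view
      },
      [dim0, dim1](const at::Tensor& /*base*/, const at::Tensor& mutated_view,
                   int64_t /*idx*/) -> at::Tensor {
        return mutated_view.transpose(dim0, dim1);    // transpose is its own inverse
      },
      /*has_symbolic_inputs=*/false);
  // Appends the meta to self's ViewMeta stack and refreshes its wrapped value.
  at::functionalization::impl::mutate_view_meta(self, meta);
}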
|  |  | ||||||
| @ -369,8 +368,15 @@ void FunctionalTensorWrapper::sync_() { | |||||||
|   regenerate_from_base(); |   regenerate_from_base(); | ||||||
| } | } | ||||||
|  |  | ||||||
| const std::vector<std::shared_ptr<functionalization::ViewMeta>>& FunctionalTensorWrapper::view_metas() const { | Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) { | ||||||
|   return view_metas_; |   auto t = base; | ||||||
|  |  | ||||||
|  |   // Reapply views to get the viewed tensor from the base in alias_ | ||||||
|  |   for (auto& view_meta: view_metas_) { | ||||||
|  |     t = view_meta.forward_fn(t, view_meta.out_index); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   return t; | ||||||
| } | } | ||||||
|  |  | ||||||
| void FunctionalTensorWrapper::regenerate_from_base() { | void FunctionalTensorWrapper::regenerate_from_base() { | ||||||
| @ -379,7 +385,7 @@ void FunctionalTensorWrapper::regenerate_from_base() { | |||||||
|   auto t = storage_impl->base(); |   auto t = storage_impl->base(); | ||||||
|  |  | ||||||
|   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); |   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); | ||||||
|   t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_); |   t = apply_view_metas(t); | ||||||
|   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); |   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); | ||||||
|  |  | ||||||
|   replace_(t, /*from_lazy_regenerate=*/true); |   replace_(t, /*from_lazy_regenerate=*/true); | ||||||
| @ -721,11 +727,11 @@ bool isFunctionalTensor(const std::optional<Tensor>& t) { | |||||||
| } | } | ||||||
|  |  | ||||||
| bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) { | bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) { | ||||||
|   if (t_list.empty()) { return false; } |   if (t_list.empty()) return false; | ||||||
|   auto functional_count = 0; |   auto functional_count = 0; | ||||||
|   for (const auto i : c10::irange(t_list.size())) { |   for (const auto i : c10::irange(t_list.size())) { | ||||||
|     auto const & e= t_list[i]; |     auto const & e= t_list[i]; | ||||||
|     if (!e.has_value() || !e->defined()) { continue; } |     if (!e.has_value() || !e->defined()) continue; | ||||||
|     if (isFunctionalTensor(e)) { |     if (isFunctionalTensor(e)) { | ||||||
|       ++functional_count; |       ++functional_count; | ||||||
|     } |     } | ||||||
| @ -735,10 +741,10 @@ bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) { | |||||||
|  |  | ||||||
| template <typename T> | template <typename T> | ||||||
| static bool isFunctionalTensorIListRef(c10::IListRef<T> list) { | static bool isFunctionalTensorIListRef(c10::IListRef<T> list) { | ||||||
|   if (list.size() == 0) { return false; } |   if (list.size() == 0) return false; | ||||||
|   auto functional_count = 0; |   auto functional_count = 0; | ||||||
|   for (const auto& tensor : list) { |   for (const auto& tensor : list) { | ||||||
|     if (!tensor.defined()) { continue; } |     if (!tensor.defined()) continue; | ||||||
|     if (isFunctionalTensor(tensor)) { |     if (isFunctionalTensor(tensor)) { | ||||||
|       ++functional_count; |       ++functional_count; | ||||||
|     } |     } | ||||||
| @ -756,28 +762,20 @@ void freeze_functional_tensor(const Tensor& tensor) { | |||||||
|   functional_base_impl->freeze_storage(); |   functional_base_impl->freeze_storage(); | ||||||
| } | } | ||||||
|  |  | ||||||
| Tensor create_functional_tensor_with_view_meta( | Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) { | ||||||
|     const at::Tensor& view_to_wrap, |  | ||||||
|     const at::Tensor& base, |  | ||||||
|     const std::shared_ptr<functionalization::ViewMeta>& meta, |  | ||||||
|     int64_t out_idx) { |  | ||||||
|   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); |   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); | ||||||
|   TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); |   TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); | ||||||
|   auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); |   auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); | ||||||
|   auto meta_ = meta; |  | ||||||
|   if (out_idx != 0) { |   if (out_idx != 0) { | ||||||
|     // Note [out_idx in ViewMeta] |     // Note [out_idx in ViewMeta] | ||||||
|     // When a view op outputs multiple tensors, each output needs its own separate ViewMeta. |     // When a view op outputs multiple tensors, each output needs its own separate ViewMeta. | ||||||
|     // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function. |     // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function. | ||||||
|     meta_ = meta->to_out_index(out_idx); |     meta = meta.to_out_idx(out_idx); | ||||||
|   } |   } | ||||||
|   return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta_); |   return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta); | ||||||
| } | } | ||||||
|  |  | ||||||
| std::vector<Tensor> create_functional_tensor_with_view_meta( | std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) { | ||||||
|     ITensorListRef view_to_wrap, |  | ||||||
|     const at::Tensor& base, |  | ||||||
|     const std::shared_ptr<functionalization::ViewMeta>& meta) { |  | ||||||
|   std::vector<Tensor> outputs(view_to_wrap.size()); |   std::vector<Tensor> outputs(view_to_wrap.size()); | ||||||
|   int64_t i = 0; |   int64_t i = 0; | ||||||
|   for (const auto& tensor : view_to_wrap) { |   for (const auto& tensor : view_to_wrap) { | ||||||
| @ -787,22 +785,12 @@ std::vector<Tensor> create_functional_tensor_with_view_meta( | |||||||
|   return outputs; |   return outputs; | ||||||
| } | } | ||||||
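To make Note [out_idx in ViewMeta] concrete, a sketch of what a multi-output view could look like under this API; split_with_sizes and slice_scatter are stand-ins chosen for illustration, not the generated code:

#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

// Illustrative helper for a split-like op with known chunk sizes along `dim`.
at::functionalization::ViewMeta make_split_view_meta(std::vector<int64_t> sizes, int64_t dim) {
  return at::functionalization::ViewMeta(
      [sizes, dim](const at::Tensor& base, int64_t mutated_view_idx) -> at::Tensor {
        // Replay only the output this ViewMeta is attached to.
        return at::split_with_sizes(base, sizes, dim)[mutated_view_idx];
      },
      [sizes, dim](const at::Tensor& base, const at::Tensor& mutated_view,
                   int64_t mutated_view_idx) -> at::Tensor {
        // Scatter the mutated chunk back into its slot in the base.
        int64_t start = 0;
        for (int64_t i = 0; i < mutated_view_idx; ++i) start += sizes[i];
        return at::slice_scatter(base, mutated_view, dim, start,
                                 start + sizes[mutated_view_idx], 1);
      },
      /*has_symbolic_inputs=*/false,
      /*is_multi_output=*/true);
}

create_functional_tensor_with_view_meta above then re-targets the shared meta to each output via to_out_idx(i), so the reverse function knows which chunk was mutated.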
|  |  | ||||||
| void mutate_view_meta(const at::Tensor& self, const std::shared_ptr<functionalization::ViewMeta>& meta) { | void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) { | ||||||
|   TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); |   TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); | ||||||
|   auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); |   auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); | ||||||
|   self_impl->mutate_view_meta(meta); |   self_impl->mutate_view_meta(meta); | ||||||
| } | } | ||||||
|  |  | ||||||
| Tensor apply_view_meta_sequence( |  | ||||||
|     const Tensor& base, |  | ||||||
|     const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence) { |  | ||||||
|   Tensor r = base; |  | ||||||
|   for (auto& vm : sequence) { |  | ||||||
|     r = vm->forward(r); |  | ||||||
|   } |  | ||||||
|   return r; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Note [Propagating strides in the functionalization pass] | // Note [Propagating strides in the functionalization pass] | ||||||
| // In order to properly compute stride information, the functionalization pass | // In order to properly compute stride information, the functionalization pass | ||||||
| // calls each {view} reference implementation with meta tensors. | // calls each {view} reference implementation with meta tensors. | ||||||
| @ -896,7 +884,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s | |||||||
|     const auto& ivalue = returns[idx]; |     const auto& ivalue = returns[idx]; | ||||||
|     if (ivalue.isTensor()) { |     if (ivalue.isTensor()) { | ||||||
|       const auto& t = ivalue.toTensor(); |       const auto& t = ivalue.toTensor(); | ||||||
|       if (!t.defined()) { continue; } |       if (!t.defined()) continue; | ||||||
|       at::functionalization::impl::sync(t); |       at::functionalization::impl::sync(t); | ||||||
|       auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); |       auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); | ||||||
|       (*stack)[returns_begin + idx] = t_new; |       (*stack)[returns_begin + idx] = t_new; | ||||||
|  | |||||||
| @ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { | |||||||
|   explicit FunctionalTensorWrapper( |   explicit FunctionalTensorWrapper( | ||||||
|       const Tensor& view_value, |       const Tensor& view_value, | ||||||
|       const FunctionalTensorWrapper* base, |       const FunctionalTensorWrapper* base, | ||||||
|       const std::shared_ptr<functionalization::ViewMeta>& meta); |       const functionalization::ViewMeta& meta); | ||||||
|  |  | ||||||
|   // Get the underlying, actual tensor, that doesn't know anything about |   // Get the underlying, actual tensor, that doesn't know anything about | ||||||
|   // functionalization. |   // functionalization. | ||||||
| @ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { | |||||||
|         ->are_all_mutations_under_no_grad_or_inference_mode(); |         ->are_all_mutations_under_no_grad_or_inference_mode(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void maybe_mark_symbolic(functionalization::ViewMeta* meta) { |   void maybe_mark_symbolic(const functionalization::ViewMeta& meta) { | ||||||
|     is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs; |     is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   bool is_symbolic() const { |   bool is_symbolic() const { | ||||||
|     return is_symbolic_; |     return is_symbolic_; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // Retrieves the ViewMeta sequence of this tensor. |   // Applies the forward_fn of every ViewMeta collected in the current instance | ||||||
|   const std::vector<std::shared_ptr<functionalization::ViewMeta>>& view_metas() |   // to some other base. | ||||||
|       const; |   Tensor apply_view_metas(const Tensor& base); | ||||||
|  |  | ||||||
|   // Sync's the underlying tensor with its alias, if it's out of date. This |   // Sync's the underlying tensor with its alias, if it's out of date. This | ||||||
|   // involves two steps: 1) Apply any pending updates/mutations to the alias 2) |   // involves two steps: 1) Apply any pending updates/mutations to the alias 2) | ||||||
| @ -146,8 +146,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { | |||||||
|   // from the base tensor. This method is used by inplace-view ops like |   // from the base tensor. This method is used by inplace-view ops like | ||||||
|   // transpose_. It appends a ViewMeta to the existing stack, and refreshes the |   // transpose_. It appends a ViewMeta to the existing stack, and refreshes the | ||||||
|   // tensor by replaying the views off of the alias. |   // tensor by replaying the views off of the alias. | ||||||
|   void mutate_view_meta( |   void mutate_view_meta(const at::functionalization::ViewMeta& meta); | ||||||
|       const std::shared_ptr<at::functionalization::ViewMeta>& meta); |  | ||||||
|  |  | ||||||
|   // Custom implementation of self.set_(src) |   // Custom implementation of self.set_(src) | ||||||
|   void set__impl(const FunctionalTensorWrapper* other); |   void set__impl(const FunctionalTensorWrapper* other); | ||||||
| @ -286,7 +285,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { | |||||||
|   bool is_symbolic_ = false; |   bool is_symbolic_ = false; | ||||||
|  |  | ||||||
|   size_t generation_ = 0; |   size_t generation_ = 0; | ||||||
|   std::vector<std::shared_ptr<at::functionalization::ViewMeta>> view_metas_; |   std::vector<at::functionalization::ViewMeta> view_metas_; | ||||||
|  |  | ||||||
|  protected: |  protected: | ||||||
|   static void copy_tensor_metadata( |   static void copy_tensor_metadata( | ||||||
| @ -378,20 +377,16 @@ TORCH_API void propagate_xla_data_direct( | |||||||
| Tensor create_functional_tensor_with_view_meta( | Tensor create_functional_tensor_with_view_meta( | ||||||
|     const Tensor& view_to_wrap, |     const Tensor& view_to_wrap, | ||||||
|     const Tensor& base, |     const Tensor& base, | ||||||
|     const std::shared_ptr<functionalization::ViewMeta>& meta, |     functionalization::ViewMeta meta, | ||||||
|     int64_t out_idx = 0); |     int64_t out_idx = 0); | ||||||
| std::vector<Tensor> create_functional_tensor_with_view_meta( | std::vector<Tensor> create_functional_tensor_with_view_meta( | ||||||
|     ITensorListRef view_to_wrap, |     ITensorListRef view_to_wrap, | ||||||
|     const Tensor& base, |     const Tensor& base, | ||||||
|     const std::shared_ptr<functionalization::ViewMeta>& meta); |     const functionalization::ViewMeta& meta); | ||||||
|  |  | ||||||
| void mutate_view_meta( | void mutate_view_meta( | ||||||
|     const Tensor& self, |     const Tensor& self, | ||||||
|     const std::shared_ptr<functionalization::ViewMeta>& meta); |     const functionalization::ViewMeta& meta); | ||||||
|  |  | ||||||
| TORCH_API Tensor apply_view_meta_sequence( |  | ||||||
|     const Tensor& base, |  | ||||||
|     const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence); |  | ||||||
|  |  | ||||||
| void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); | void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); | ||||||
| void set_sizes_strides_offset( | void set_sizes_strides_offset( | ||||||
|  | |||||||
| @ -1,5 +1,3 @@ | |||||||
| #include <ATen/FunctionalizeFallbackKernel.h> |  | ||||||
|  |  | ||||||
| #include <ATen/core/dispatch/Dispatcher.h> | #include <ATen/core/dispatch/Dispatcher.h> | ||||||
| #include <ATen/core/LegacyTypeDispatch.h> | #include <ATen/core/LegacyTypeDispatch.h> | ||||||
| #include <ATen/EmptyTensor.h> | #include <ATen/EmptyTensor.h> | ||||||
| @ -9,6 +7,7 @@ | |||||||
| #include <torch/library.h> | #include <torch/library.h> | ||||||
| #include <c10/util/irange.h> | #include <c10/util/irange.h> | ||||||
| #include <c10/util/strides.h> | #include <c10/util/strides.h> | ||||||
|  | #include <ATen/EmptyTensor.h> | ||||||
|  |  | ||||||
| #ifndef AT_PER_OPERATOR_HEADERS | #ifndef AT_PER_OPERATOR_HEADERS | ||||||
| #include <ATen/ATen.h> | #include <ATen/ATen.h> | ||||||
| @ -29,31 +28,6 @@ | |||||||
| #include <utility> | #include <utility> | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| namespace at::functionalization { |  | ||||||
|  |  | ||||||
| Tensor resize__ViewMeta::forward(const Tensor& base) { |  | ||||||
|   if (reapply_views) { |  | ||||||
|     return base.as_strided(size, c10::contiguous_strides(size)); |  | ||||||
|   } else { |  | ||||||
|     return at::as_strided_copy(base, size, c10::contiguous_strides(size)); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) { |  | ||||||
|   return base.as_strided_scatter( |  | ||||||
|       mutated_view, size, c10::contiguous_strides(size)); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) { |  | ||||||
|   return at::_unsafe_view_symint(base, size); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) { |  | ||||||
|   return at::_unsafe_view_symint(mutated_view, base.sym_sizes()); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| } // namespace at::functionalization |  | ||||||
|  |  | ||||||
| namespace { | namespace { | ||||||
|   void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) { |   void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) { | ||||||
|     const auto& schema = op.schema(); |     const auto& schema = op.schema(); | ||||||
| @ -132,9 +106,7 @@ namespace { | |||||||
|       const auto& ivalue = returns[idx]; |       const auto& ivalue = returns[idx]; | ||||||
|       if (ivalue.isTensor() && should_wrap_outputs) { |       if (ivalue.isTensor() && should_wrap_outputs) { | ||||||
|         const auto& t = ivalue.toTensor(); |         const auto& t = ivalue.toTensor(); | ||||||
|         if (!t.defined()) { |         if (!t.defined()) continue; | ||||||
|           continue; |  | ||||||
|         } |  | ||||||
|         auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t)); |         auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t)); | ||||||
|         (*stack)[returns_begin + idx] = t_new; |         (*stack)[returns_begin + idx] = t_new; | ||||||
|       } else if (ivalue.isTensorList() && should_wrap_outputs) { |       } else if (ivalue.isTensorList() && should_wrap_outputs) { | ||||||
| @ -197,8 +169,19 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch | |||||||
|   // The output of resizing is equivalent to taking a slice of a larger tensor. |   // The output of resizing is equivalent to taking a slice of a larger tensor. | ||||||
|   // We have to emulate this "slicing" with an as_strided call. |   // We have to emulate this "slicing" with an as_strided call. | ||||||
|   auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); |   auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); | ||||||
|   auto view_meta = std::make_shared<at::functionalization::resize__ViewMeta>( |   at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( | ||||||
|       reapply_views, size.vec()); |     [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { | ||||||
|  |       if (reapply_views) { | ||||||
|  |         return base.as_strided(size, c10::contiguous_strides(size)); | ||||||
|  |       } else { | ||||||
|  |         return at::as_strided_copy(base, size, c10::contiguous_strides(size)); | ||||||
|  |       } | ||||||
|  |     }, | ||||||
|  |     [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { | ||||||
|  |       return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size)); | ||||||
|  |     }, | ||||||
|  |     /*has_symbolic_inputs=*/false | ||||||
|  |   ); | ||||||
|   at::functionalization::impl::mutate_view_meta(self, view_meta); |   at::functionalization::impl::mutate_view_meta(self, view_meta); | ||||||
|   return self; |   return self; | ||||||
| } | } | ||||||
| @ -317,11 +300,17 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt | |||||||
|     tmp_output = at::_unsafe_view_symint(self_, size); |     tmp_output = at::_unsafe_view_symint(self_, size); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   bool has_symbolic_inputs = std::any_of( |   bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); | ||||||
|       size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); |  | ||||||
|   auto view_meta = |   at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( | ||||||
|       std::make_shared<at::functionalization::_unsafe_view_ViewMeta>( |     [size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { | ||||||
|           has_symbolic_inputs, size.vec()); |       return at::_unsafe_view_symint(base, size); | ||||||
|  |     }, | ||||||
|  |     [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { | ||||||
|  |       return at::_unsafe_view_symint(mutated_view, base.sym_sizes()); | ||||||
|  |     }, | ||||||
|  |     /*has_symbolic_inputs=*/has_symbolic_inputs | ||||||
|  |   ); | ||||||
|  |  | ||||||
|   auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta)); |   auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta)); | ||||||
|   // See  Note [Propagating strides in the functionalization pass] |   // See  Note [Propagating strides in the functionalization pass] | ||||||
|  | |||||||
| @ -1,58 +0,0 @@ | |||||||
| #pragma once |  | ||||||
|  |  | ||||||
| #include <ATen/FunctionalStorageImpl.h> |  | ||||||
|  |  | ||||||
| namespace at::functionalization { |  | ||||||
|  |  | ||||||
| // `ViewMeta` implementation for `resize_` operation. |  | ||||||
| struct TORCH_API resize__ViewMeta : public ViewMeta { |  | ||||||
|   FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta) |  | ||||||
|   FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( |  | ||||||
|       bool /* reapply_views */, |  | ||||||
|       const std::vector<int64_t>&); |  | ||||||
|  |  | ||||||
|   resize__ViewMeta(const SerializableTuple& tpl) |  | ||||||
|       : resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} |  | ||||||
|  |  | ||||||
|   resize__ViewMeta(bool reapply_views, const std::vector<int64_t>& size) |  | ||||||
|       : ViewMeta(/*has_symbolic_inputs=*/false), |  | ||||||
|         reapply_views(reapply_views), |  | ||||||
|         size(size) {} |  | ||||||
|  |  | ||||||
|   Tensor forward(const Tensor& base) override; |  | ||||||
|   Tensor reverse(const Tensor& base, const Tensor& mutated_view) override; |  | ||||||
|  |  | ||||||
|   SerializableTuple to_serializable_tuple() { |  | ||||||
|     return std::make_tuple(reapply_views, size); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   bool reapply_views; |  | ||||||
|   std::vector<int64_t> size; |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| // `ViewMeta` implementation for `_unsafe_view` operation. |  | ||||||
| struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta { |  | ||||||
|   FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta) |  | ||||||
|   FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( |  | ||||||
|       bool /* has_symbolic_inputs */, |  | ||||||
|       const std::vector<c10::SymInt>&); |  | ||||||
|  |  | ||||||
|   _unsafe_view_ViewMeta(const SerializableTuple& tpl) |  | ||||||
|       : _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} |  | ||||||
|  |  | ||||||
|   _unsafe_view_ViewMeta( |  | ||||||
|       bool has_symbolic_inputs, |  | ||||||
|       const std::vector<c10::SymInt>& size) |  | ||||||
|       : ViewMeta(has_symbolic_inputs), size(size) {} |  | ||||||
|  |  | ||||||
|   Tensor forward(const Tensor& base) override; |  | ||||||
|   Tensor reverse(const Tensor& base, const Tensor& mutated_view) override; |  | ||||||
|  |  | ||||||
|   SerializableTuple to_serializable_tuple() { |  | ||||||
|     return std::make_tuple(has_symbolic_inputs, size); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   std::vector<c10::SymInt> size; |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| } // namespace at::functionalization |  | ||||||
| @ -45,39 +45,7 @@ inline void infer_size_impl( | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (infer_dim) { |   auto set_infer_dim = [&]() { | ||||||
|     // numel is the product of the known sizes, so it has to be divisible by newsize, |  | ||||||
|     // and newsize should be positive unless newsize == numel (we throw a |  | ||||||
|     // different error message in that case). |  | ||||||
|     if constexpr (std::is_same_v<NumelType, c10::SymInt>) { |  | ||||||
|       auto v = newsize.maybe_as_int(); |  | ||||||
|       if (v and *v == 0) { |  | ||||||
|         // Avoid div by 0 when sym_eq(numel % newsize, 0) is constructed! |  | ||||||
|         // which may happen when newsize is not a symbol! If it's a symbol, |  | ||||||
|         // division won't happen anyway during compile. |  | ||||||
|         TORCH_MAYBE_SYM_CHECK( |  | ||||||
|             numel == newsize, |  | ||||||
|             "shape '", |  | ||||||
|             shape, |  | ||||||
|             "' is invalid for input of size ", |  | ||||||
|             numel); |  | ||||||
|       } else { |  | ||||||
|         auto cond = sym_gt(newsize, 0) |  | ||||||
|                         .sym_and(sym_eq(numel % newsize, 0)) |  | ||||||
|                         .sym_or(sym_eq(numel, newsize)); |  | ||||||
|         TORCH_MAYBE_SYM_CHECK( |  | ||||||
|             cond, "shape '", shape, "' is invalid for input of size ", numel); |  | ||||||
|       } |  | ||||||
|  |  | ||||||
|     } else { |  | ||||||
|       TORCH_CHECK( |  | ||||||
|           (newsize > 0 && (numel % newsize == 0)) || numel == newsize, |  | ||||||
|           "shape '", |  | ||||||
|           shape, |  | ||||||
|           "' is invalid for input of size ", |  | ||||||
|           numel); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // We have a degree of freedom here to select the dimension size; follow |     // We have a degree of freedom here to select the dimension size; follow | ||||||
|     // NumPy semantics and just bail.  However, a nice error message is needed |     // NumPy semantics and just bail.  However, a nice error message is needed | ||||||
|     // because users often use `view` as a way to flatten & unflatten |     // because users often use `view` as a way to flatten & unflatten | ||||||
| @ -86,15 +54,19 @@ inline void infer_size_impl( | |||||||
|     // works yet |     // works yet | ||||||
|     //   empty_tensor.view(-1, 0) |     //   empty_tensor.view(-1, 0) | ||||||
|     // doesn't. |     // doesn't. | ||||||
|     TORCH_MAYBE_SYM_CHECK( |     TORCH_CHECK( | ||||||
|         newsize != 0, |         newsize != 0, | ||||||
|         "cannot reshape tensor of 0 elements into shape ", |         "cannot reshape tensor of 0 elements into shape ", | ||||||
|         shape, |         shape, | ||||||
|         " because the unspecified dimension size -1 can be any " |         " because the unspecified dimension size -1 can be any " | ||||||
|         "value and is ambiguous"); |         "value and is ambiguous"); | ||||||
|  |  | ||||||
|     res[*infer_dim] = numel / newsize; |     res[*infer_dim] = numel / newsize; | ||||||
|     return; |     return; | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   if (infer_dim && newsize > 0 && numel % newsize == 0) { | ||||||
|  |     set_infer_dim(); | ||||||
|  |     return; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   TORCH_MAYBE_SYM_CHECK( |   TORCH_MAYBE_SYM_CHECK( | ||||||
| @ -103,6 +75,9 @@ inline void infer_size_impl( | |||||||
|       shape, |       shape, | ||||||
|       "' is invalid for input of size ", |       "' is invalid for input of size ", | ||||||
|       numel); |       numel); | ||||||
|  |   if (infer_dim) { | ||||||
|  |     set_infer_dim(); | ||||||
|  |   } | ||||||
| } | } | ||||||
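A small end-to-end illustration of what this inference does, assuming a standard ATen build; the commented-out line shows the ambiguous case rejected above:

#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor t = at::arange(24);
  // Known sizes multiply to 2 * 3 = 6, so the -1 dimension is inferred as 24 / 6 = 4.
  at::Tensor v = t.view({2, -1, 3});
  std::cout << v.sizes() << "\n";      // [2, 4, 3]
  // Ambiguous: with 0 elements the -1 dimension could be anything, so this throws.
  // at::empty({0}).view({-1, 0});
  return 0;
}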
|  |  | ||||||
| inline std::vector<int64_t> infer_size(IntArrayRef shape, int64_t numel) { | inline std::vector<int64_t> infer_size(IntArrayRef shape, int64_t numel) { | ||||||
|  | |||||||
| @ -1,22 +1,32 @@ | |||||||
| #include <ATen/core/PythonOpRegistrationTrampoline.h> | #include <ATen/core/PythonOpRegistrationTrampoline.h> | ||||||
| #include <c10/core/impl/PyInterpreterHooks.h> |  | ||||||
|  |  | ||||||
| // TODO: delete this |  | ||||||
| namespace at::impl { | namespace at::impl { | ||||||
|  |  | ||||||
| c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::interpreter_ = nullptr; | // The strategy is that all python interpreters attempt to register themselves | ||||||
|  | // as the main interpreter, but only one wins.  Only that interpreter is | ||||||
|  | // allowed to interact with the C++ dispatcher.  Furthermore, when we execute | ||||||
|  | // logic on that interpreter, we do so hermetically, never setting pyobj field | ||||||
|  | // on Tensor. | ||||||
|  |  | ||||||
|  | std::atomic<c10::impl::PyInterpreter*> | ||||||
|  |     PythonOpRegistrationTrampoline::interpreter_{nullptr}; | ||||||
|  |  | ||||||
| c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() { | c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() { | ||||||
|   return c10::impl::getGlobalPyInterpreter(); |   return PythonOpRegistrationTrampoline::interpreter_.load(); | ||||||
| } | } | ||||||
|  |  | ||||||
| bool PythonOpRegistrationTrampoline::registerInterpreter( | bool PythonOpRegistrationTrampoline::registerInterpreter( | ||||||
|     c10::impl::PyInterpreter* interp) { |     c10::impl::PyInterpreter* interp) { | ||||||
|   if (interpreter_ != nullptr) { |   c10::impl::PyInterpreter* expected = nullptr; | ||||||
|  |   interpreter_.compare_exchange_strong(expected, interp); | ||||||
|  |   if (expected != nullptr) { | ||||||
|  |     // This is the second (or later) Python interpreter, which means we need | ||||||
|  |     // non-trivial hermetic PyObject TLS | ||||||
|  |     c10::impl::HermeticPyObjectTLS::init_state(); | ||||||
|     return false; |     return false; | ||||||
|  |   } else { | ||||||
|  |     return true; | ||||||
|   } |   } | ||||||
|   interpreter_ = interp; |  | ||||||
|   return true; |  | ||||||
| } | } | ||||||
|  |  | ||||||
| } // namespace at::impl | } // namespace at::impl | ||||||
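The registration race above is the standard first-writer-wins compare-exchange pattern; a generic sketch with illustrative names:

#include <atomic>

// Many candidates race to install themselves; only the first compare-exchange
// succeeds, and everyone else observes the already-registered winner.
struct OnceRegistry {
  static std::atomic<void*> slot;
  static bool register_once(void* candidate) {
    void* expected = nullptr;
    // Stores `candidate` only if the slot is still nullptr; on failure, `expected`
    // is overwritten with the existing pointer and false is returned.
    return slot.compare_exchange_strong(expected, candidate);
  }
};
std::atomic<void*> OnceRegistry::slot{nullptr};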
|  | |||||||
| @ -2,21 +2,19 @@ | |||||||
|  |  | ||||||
| #include <ATen/core/dispatch/Dispatcher.h> | #include <ATen/core/dispatch/Dispatcher.h> | ||||||
|  |  | ||||||
| // TODO: We can get rid of this | // TODO: this can probably live in c10 | ||||||
|  |  | ||||||
|  |  | ||||||
| namespace at::impl { | namespace at::impl { | ||||||
|  |  | ||||||
| // Manages the single Python interpreter instance for PyTorch. |  | ||||||
| class TORCH_API PythonOpRegistrationTrampoline final { | class TORCH_API PythonOpRegistrationTrampoline final { | ||||||
|   static c10::impl::PyInterpreter* interpreter_; |   static std::atomic<c10::impl::PyInterpreter*> interpreter_; | ||||||
|  |  | ||||||
| public: | public: | ||||||
|   // Register the Python interpreter. Returns true on first registration, |   //  Returns true if you successfully registered yourself (that means | ||||||
|   // false if an interpreter was already registered. |   //  you are in the hot seat for doing the operator registrations!) | ||||||
|   static bool registerInterpreter(c10::impl::PyInterpreter*); |   static bool registerInterpreter(c10::impl::PyInterpreter*); | ||||||
|  |  | ||||||
|   // Returns the registered interpreter via the global PyInterpreter hooks. |  | ||||||
|   // Returns nullptr if no interpreter has been registered yet. |   // Returns nullptr if no interpreter has been registered yet. | ||||||
|   static c10::impl::PyInterpreter* getInterpreter(); |   static c10::impl::PyInterpreter* getInterpreter(); | ||||||
| }; | }; | ||||||
|  | |||||||
| @ -1234,7 +1234,7 @@ struct TORCH_API TupleType : public NamedType { | |||||||
|   std::shared_ptr<FunctionSchema> schema_; |   std::shared_ptr<FunctionSchema> schema_; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| // the common supertype of all Enums, only used in operator registration. | // the common supertype of all Enums, only used in operator registration. | ||||||
| // EnumType <: AnyEnumType for all Enums | // EnumType <: AnyEnumType for all Enums | ||||||
| struct AnyEnumType; | struct AnyEnumType; | ||||||
| using AnyEnumTypePtr = SingletonTypePtr<AnyEnumType>; | using AnyEnumTypePtr = SingletonTypePtr<AnyEnumType>; | ||||||
|  | |||||||
| @ -149,105 +149,5 @@ static inline void pack_vnni4( | |||||||
| #endif | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
| // This is a helper function for transpose_pack_vnni4 |  | ||||||
| // Transform a [4, 16] block (with incontiguous output) |  | ||||||
| // Src: |  | ||||||
| // a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16 |  | ||||||
| // b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 b16 |  | ||||||
| // c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16 |  | ||||||
| // d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 d16 |  | ||||||
| // Dst: |  | ||||||
| // a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4 |  | ||||||
| // a5 a6 a7 a8 b5 b6 b7 b8 c5 c6 c7 c8 d5 d6 d7 d8 |  | ||||||
| // a9 a10 a11 a12 b9 b10 b11 b12 c9 c10 c11 c12 d9 d10 d11 d12 |  | ||||||
| // a13 a14 a15 a16 b13 b14 b15 b16 c13 c14 c15 c16 d13 d14 d15 d16 |  | ||||||
| template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>> |  | ||||||
| static inline void transpose_vnni4_pad_4x16_block( |  | ||||||
|     const scalar_t* src, |  | ||||||
|     scalar_t* dst, |  | ||||||
|     int64_t ld_src, |  | ||||||
|     int64_t ld_dst, |  | ||||||
|     int krem = 4) { |  | ||||||
| #if defined(CPU_CAPABILITY_AVX512) |  | ||||||
|   __m128i r[4]; |  | ||||||
|   for (int i = 0; i < krem; ++i) { |  | ||||||
|     r[i] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * ld_src)); |  | ||||||
|   } |  | ||||||
|   for (int i = krem; i < 4; ++i) { |  | ||||||
|     r[i] = _mm_setzero_si128(); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Transpose 4x16 bytes using unpack and shuffle |  | ||||||
|   __m128i t0 = _mm_unpacklo_epi32(r[0], r[1]); |  | ||||||
|   __m128i t1 = _mm_unpackhi_epi32(r[0], r[1]); |  | ||||||
|   __m128i t2 = _mm_unpacklo_epi32(r[2], r[3]); |  | ||||||
|   __m128i t3 = _mm_unpackhi_epi32(r[2], r[3]); |  | ||||||
|  |  | ||||||
|   __m128i r0 = _mm_unpacklo_epi64(t0, t2); |  | ||||||
|   __m128i r1 = _mm_unpackhi_epi64(t0, t2); |  | ||||||
|   __m128i r2 = _mm_unpacklo_epi64(t1, t3); |  | ||||||
|   __m128i r3 = _mm_unpackhi_epi64(t1, t3); |  | ||||||
|  |  | ||||||
|   // Store output |  | ||||||
|   if (krem == 4) { |  | ||||||
|     // normal case |  | ||||||
|     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), r0); |  | ||||||
|     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst), r1); |  | ||||||
|     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 2), r2); |  | ||||||
|     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 3), r3); |  | ||||||
|   } else { |  | ||||||
|     // masked case |  | ||||||
|     __mmask16 mask = (1ULL << (krem * 4)) - 1; |  | ||||||
|     _mm_mask_storeu_epi8(dst, mask, r0); |  | ||||||
|     _mm_mask_storeu_epi8(reinterpret_cast<__m128i*>(dst + ld_dst), mask, r1); |  | ||||||
|     _mm_mask_storeu_epi8( |  | ||||||
|         reinterpret_cast<__m128i*>(dst + ld_dst * 2), mask, r2); |  | ||||||
|     _mm_mask_storeu_epi8( |  | ||||||
|         reinterpret_cast<__m128i*>(dst + ld_dst * 3), mask, r3); |  | ||||||
|   } |  | ||||||
| #else |  | ||||||
|   TORCH_CHECK( |  | ||||||
|       false, |  | ||||||
|       "transpose_vnni4_pad_4x16_block is only supported when AVX-512 is supported") |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Do the transpose packing fusion with VNNI4 |  | ||||||
| // Reorder [K, N] → [N/4, K, 4] (VNNI4-style layout for bit8) |  | ||||||
| template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>> |  | ||||||
| static inline void transpose_pack_vnni4( |  | ||||||
|     const scalar_t* src, |  | ||||||
|     scalar_t* dst, |  | ||||||
|     int64_t ld_src, |  | ||||||
|     int64_t K, |  | ||||||
|     int64_t N) { |  | ||||||
| #if defined(CPU_CAPABILITY_AVX512) |  | ||||||
|   TORCH_CHECK( |  | ||||||
|       N % 16 == 0, "N needs to be multiple of 16 for transpose_pack_vnni4"); |  | ||||||
|   int64_t bk = 0; |  | ||||||
|   int64_t _K = K / 4 * 4; |  | ||||||
|   for (; bk < _K; bk += 4) { |  | ||||||
|     int64_t bn = 0; |  | ||||||
|     for (; bn < N; bn += 16) { |  | ||||||
|       transpose_vnni4_pad_4x16_block( |  | ||||||
|           src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Handle leftover K rows (< 4) |  | ||||||
|   if (K % 4 != 0) { |  | ||||||
|     int krem = K - bk; |  | ||||||
|     int64_t bn = 0; |  | ||||||
|     for (; bn < N; bn += 16) { |  | ||||||
|       transpose_vnni4_pad_4x16_block( |  | ||||||
|           src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4, krem); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| #else |  | ||||||
|   TORCH_CHECK( |  | ||||||
|       false, "transpose_pack_vnni4 is only supported when AVX-512 is supported") |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
|  |  | ||||||
| } // namespace CPU_CAPABILITY | } // namespace CPU_CAPABILITY | ||||||
| } // namespace at::vec | } // namespace at::vec | ||||||
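A plain scalar sketch of the [K, N] -> [N/4, K, 4] reordering the removed AVX-512 kernel performs, useful as a reference when checking the intrinsics path; it assumes N is a multiple of 4 (the vector kernel above additionally requires N % 16 == 0 and handles the K tail with masked stores):

#include <cstdint>

// dst must hold (N / 4) * K * 4 elements; element (k, n) of the K x N source
// lands at [n / 4][k][n % 4] in the packed output.
static void transpose_pack_vnni4_ref(const int8_t* src, int8_t* dst,
                                     int64_t ld_src, int64_t K, int64_t N) {
  for (int64_t n = 0; n < N; ++n) {
    for (int64_t k = 0; k < K; ++k) {
      dst[(n / 4) * K * 4 + k * 4 + (n % 4)] = src[k * ld_src + n];
    }
  }
}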
|  | |||||||
| @ -1637,7 +1637,9 @@ bool gemm_and_bias( | |||||||
|   if (activation == GEMMAndBiasActivationEpilogue::RELU) { |   if (activation == GEMMAndBiasActivationEpilogue::RELU) { | ||||||
|     epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; |     epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; | ||||||
|   } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { |   } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { | ||||||
|  | #if CUDA_VERSION >= 11040 || defined(USE_ROCM) | ||||||
|     epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; |     epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; | ||||||
|  | #endif | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (bias != nullptr) { |   if (bias != nullptr) { | ||||||
| @ -1929,6 +1931,7 @@ void scaled_gemm( | |||||||
|     bool use_fast_accum) { |     bool use_fast_accum) { | ||||||
|   // Note: see `cublasCommonArgs` for various non-intuitive manipulations |   // Note: see `cublasCommonArgs` for various non-intuitive manipulations | ||||||
|   // of input arguments to this function. |   // of input arguments to this function. | ||||||
|  | #if CUDA_VERSION >= 11080 || defined(USE_ROCM) | ||||||
|   const auto computeType = CUBLAS_COMPUTE_32F; |   const auto computeType = CUBLAS_COMPUTE_32F; | ||||||
|   const auto scaleType = CUDA_R_32F; |   const auto scaleType = CUDA_R_32F; | ||||||
|   const float alpha_val = 1.0; |   const float alpha_val = 1.0; | ||||||
| @ -2130,6 +2133,8 @@ void scaled_gemm( | |||||||
|       " scaleType ", |       " scaleType ", | ||||||
|       scaleType); |       scaleType); | ||||||
|   return; |   return; | ||||||
|  | #endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM) | ||||||
|  |   TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above"); | ||||||
| } | } | ||||||
|  |  | ||||||
| void int8_gemm( | void int8_gemm( | ||||||
|  | |||||||
| @ -281,9 +281,6 @@ bool CUDAHooks::compiledWithMIOpen() const { | |||||||
|  |  | ||||||
| bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { | bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { | ||||||
| #if AT_CUDNN_ENABLED() | #if AT_CUDNN_ENABLED() | ||||||
|   if (!hasCUDA()) { |  | ||||||
|     return false; |  | ||||||
|   } |  | ||||||
|   // NOTE: extra parenthesis around numbers disable clang warnings about |   // NOTE: extra parenthesis around numbers disable clang warnings about | ||||||
|   // dead code |   // dead code | ||||||
|   return true; |   return true; | ||||||
| @ -294,9 +291,6 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { | |||||||
|  |  | ||||||
| bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { | bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { | ||||||
| #if AT_CUDNN_ENABLED() | #if AT_CUDNN_ENABLED() | ||||||
|   if (!hasCUDA()) { |  | ||||||
|     return false; |  | ||||||
|   } |  | ||||||
|   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); |   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); | ||||||
|   // Check for Volta cores |   // Check for Volta cores | ||||||
|   if (prop->major >= 7) { |   if (prop->major >= 7) { | ||||||
| @ -311,9 +305,6 @@ bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { | |||||||
|  |  | ||||||
| bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const { | bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const { | ||||||
| #if AT_CUDNN_ENABLED() | #if AT_CUDNN_ENABLED() | ||||||
|   if (!hasCUDA()) { |  | ||||||
|     return false; |  | ||||||
|   } |  | ||||||
|   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); |   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); | ||||||
|   // Check for Volta cores |   // Check for Volta cores | ||||||
|   if (prop->major >= 8) { |   if (prop->major >= 8) { | ||||||
|  | |||||||
| @ -465,11 +465,8 @@ inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor | |||||||
|     return false; |     return false; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   auto is_channel_last = [](const at::Tensor& t) { |   auto fmt = input.suggest_memory_format(); | ||||||
|     auto fmt = t.suggest_memory_format(); |   return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; | ||||||
|     return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; |  | ||||||
|   }; |  | ||||||
|   return is_channel_last(input) || is_channel_last(weight); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| } // namespace at::native | } // namespace at::native | ||||||
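For reference, a tiny sketch of the suggest_memory_format() check this helper is built on (illustrative only):

#include <ATen/ATen.h>
#include <iostream>

int main() {
  auto nchw = at::randn({2, 3, 8, 8});
  auto nhwc = nchw.contiguous(at::MemoryFormat::ChannelsLast);
  // Prints 0 for the contiguous tensor and 1 for the channels-last one.
  std::cout << (nchw.suggest_memory_format() == at::MemoryFormat::ChannelsLast) << "\n";
  std::cout << (nhwc.suggest_memory_format() == at::MemoryFormat::ChannelsLast) << "\n";
  return 0;
}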
|  | |||||||
| @ -32,6 +32,10 @@ | |||||||
| #include <ATen/native/mkldnn/Utils.h> | #include <ATen/native/mkldnn/Utils.h> | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | #ifdef USE_MPS | ||||||
|  | #include <ATen/mps/MPSDevice.h> | ||||||
|  | #endif | ||||||
|  |  | ||||||
| #ifndef AT_PER_OPERATOR_HEADERS | #ifndef AT_PER_OPERATOR_HEADERS | ||||||
| #include <ATen/Functions.h> | #include <ATen/Functions.h> | ||||||
| #include <ATen/NativeFunctions.h> | #include <ATen/NativeFunctions.h> | ||||||
| @ -406,23 +410,11 @@ struct ConvParams { | |||||||
|   // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest |   // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest | ||||||
|   // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how) |   // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how) | ||||||
| #if !defined(C10_MOBILE) | #if !defined(C10_MOBILE) | ||||||
|     if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) { |     if (!detail::getCUDAHooks().compiledWithCuDNN()) { | ||||||
|       return false; |       return false; | ||||||
|     } |     } | ||||||
|     static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); |  | ||||||
|     // broken on cuDNN 9.8 |  | ||||||
|     if (cudnn_version >= 90800) { |  | ||||||
|       if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous && |  | ||||||
|           (input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) && |  | ||||||
|           weight.dim() == 5) { |  | ||||||
|         for (int i = 2; i < weight.dim(); i++) { |  | ||||||
|           if (weight.size(i) != 1) { |  | ||||||
|             return false; |  | ||||||
|           } |  | ||||||
|         } |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     if (needs_64bit_indexing_no_split(input, weight)) { |     if (needs_64bit_indexing_no_split(input, weight)) { | ||||||
|  |       static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); | ||||||
|       if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { |       if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { | ||||||
|         TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" |         TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" | ||||||
|                         " if the V8 API is not enabled or before cuDNN version 9.3+." |                         " if the V8 API is not enabled or before cuDNN version 9.3+." | ||||||
| @ -430,6 +422,9 @@ struct ConvParams { | |||||||
|         return false; |         return false; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |     if (!input.is_cuda() || !cudnn_enabled) { | ||||||
|  |       return false; | ||||||
|  |     } | ||||||
|     if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { |     if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { | ||||||
|       if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) { |       if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) { | ||||||
|         return false; |         return false; | ||||||
| @ -448,19 +443,16 @@ struct ConvParams { | |||||||
|  |  | ||||||
|   // Use cudnn for FP16 depthwise convolutions |   // Use cudnn for FP16 depthwise convolutions | ||||||
|   bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const  { |   bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const  { | ||||||
|     if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) { |     if (!detail::getCUDAHooks().compiledWithCuDNN()) { | ||||||
|       return false; |       return false; | ||||||
|     } |     } | ||||||
|  |     if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) { | ||||||
|  |       // always use cudnn_depthwise for channels_last format | ||||||
|  |       return true; | ||||||
|  |     } | ||||||
|     // native kernel doesn't support 64-bit non-splittable case |     // native kernel doesn't support 64-bit non-splittable case | ||||||
|     if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { |     if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { | ||||||
|       static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; |       static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; | ||||||
|       // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x |  | ||||||
|       if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { |  | ||||||
|         if (cudnn_version < 0 || cudnn_version > 91000) { |  | ||||||
|           return false; |  | ||||||
|         } |  | ||||||
|       } |  | ||||||
|  |  | ||||||
|       if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { |       if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { | ||||||
|         TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" |         TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" | ||||||
|                         " if the V8 API is not enabled or before cuDNN version 9.3+." |                         " if the V8 API is not enabled or before cuDNN version 9.3+." | ||||||
| @ -470,10 +462,6 @@ struct ConvParams { | |||||||
|         return true; |         return true; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { |  | ||||||
|       // always use cudnn_depthwise for channels_last format |  | ||||||
|       return true; |  | ||||||
|     } |  | ||||||
|     if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { |     if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { | ||||||
|       bool kernel_cond =  (use_cudnn(input, weight) && |       bool kernel_cond =  (use_cudnn(input, weight) && | ||||||
|                            input.scalar_type() == kHalf && // only for FP16 |                            input.scalar_type() == kHalf && // only for FP16 | ||||||
| @ -1441,8 +1429,12 @@ static inline at::MemoryFormat determine_backend_memory_format( | |||||||
|       } |       } | ||||||
|       break; |       break; | ||||||
|     case ConvBackend::Mps: |     case ConvBackend::Mps: | ||||||
|     case ConvBackend::MpsTranspose: |  | ||||||
|       if (mps_conv_use_channels_last(input, weight)) { |       if (mps_conv_use_channels_last(input, weight)) { | ||||||
|  | #ifdef USE_MPS | ||||||
|  |         if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) { | ||||||
|  |           break; | ||||||
|  |         } | ||||||
|  | #endif | ||||||
|         backend_memory_format = (k == 5) ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast; |         backend_memory_format = (k == 5) ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast; | ||||||
|       } |       } | ||||||
|       break; |       break; | ||||||
|  | |||||||
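For reference, the cuDNN gates in the hunks above compare a runtime version number against thresholds such as 90300 and 90800; cuDNN 9.x encodes its version as major*10000 + minor*100 + patch, so 90300 means 9.3.0. A minimal standalone sketch of that encoding and of the V8-API gate, with hypothetical helper and parameter names rather than the ATen hooks:

    #include <cstdio>

    // cuDNN 9.x reports its version as major*10000 + minor*100 + patch.
    constexpr long encode_cudnn_version(int major, int minor, int patch) {
      return major * 10000L + minor * 100L + patch;
    }

    // Mirrors the shape of the gate above: large non-batch-splittable
    // convolutions need cuDNN >= 9.3 and the V8 API enabled.
    bool allows_large_nonsplittable_conv(long cudnn_version, bool v8_api_enabled) {
      return cudnn_version >= encode_cudnn_version(9, 3, 0) && v8_api_enabled;
    }

    int main() {
      std::printf("%d\n", allows_large_nonsplittable_conv(encode_cudnn_version(9, 2, 1), true));  // 0
      std::printf("%d\n", allows_large_nonsplittable_conv(encode_cudnn_version(9, 8, 0), true));  // 1
    }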
| @ -9,7 +9,6 @@ | |||||||
| #include <ATen/native/TransposeType.h> | #include <ATen/native/TransposeType.h> | ||||||
| #include <ATen/native/Unfold3d.h> | #include <ATen/native/Unfold3d.h> | ||||||
| #include <c10/util/irange.h> | #include <c10/util/irange.h> | ||||||
| #include <c10/util/safe_numerics.h> |  | ||||||
|  |  | ||||||
| #ifndef AT_PER_OPERATOR_HEADERS | #ifndef AT_PER_OPERATOR_HEADERS | ||||||
| #include <ATen/Functions.h> | #include <ATen/Functions.h> | ||||||
| @ -175,23 +174,6 @@ static inline void slow_conv3d_shape_check( | |||||||
|   const int64_t input_height = input.size(dim_height); |   const int64_t input_height = input.size(dim_height); | ||||||
|   const int64_t input_width = input.size(dim_width); |   const int64_t input_width = input.size(dim_width); | ||||||
|  |  | ||||||
|   constexpr int64_t MAX_SAFE_PAD = (1LL << 61); |  | ||||||
|  |  | ||||||
|   TORCH_CHECK_VALUE( |  | ||||||
|     pad_height <= MAX_SAFE_PAD, |  | ||||||
|     "Padding height too large: pad_height=", |  | ||||||
|     pad_height); |  | ||||||
|  |  | ||||||
|   TORCH_CHECK_VALUE( |  | ||||||
|     pad_width <= MAX_SAFE_PAD, |  | ||||||
|     "Padding width too large: pad_width=", |  | ||||||
|     pad_width); |  | ||||||
|  |  | ||||||
|   TORCH_CHECK_VALUE( |  | ||||||
|     pad_depth <= MAX_SAFE_PAD, |  | ||||||
|     "Padding depth too large: pad_depth=", |  | ||||||
|     pad_depth); |  | ||||||
|  |  | ||||||
|   const int64_t exact_input_depth = input_depth + 2 * pad_depth; |   const int64_t exact_input_depth = input_depth + 2 * pad_depth; | ||||||
|   const int64_t exact_input_height = input_height + 2 * pad_height; |   const int64_t exact_input_height = input_height + 2 * pad_height; | ||||||
|   const int64_t exact_input_width = input_width + 2 * pad_width; |   const int64_t exact_input_width = input_width + 2 * pad_width; | ||||||
| @ -239,14 +221,6 @@ static inline void slow_conv3d_shape_check( | |||||||
|       output_width, |       output_width, | ||||||
|       "). Output size is too small"); |       "). Output size is too small"); | ||||||
|  |  | ||||||
|   uint64_t kernel_product; |  | ||||||
|   TORCH_CHECK( |  | ||||||
|     !c10::mul_overflows(kernel_height, kernel_width, &kernel_product), |  | ||||||
|     "Kernel height x width product is too large: kernel_height=", |  | ||||||
|     kernel_height, |  | ||||||
|     ", kernel_width=", |  | ||||||
|     kernel_width); |  | ||||||
|  |  | ||||||
|   if (weight.defined()) { |   if (weight.defined()) { | ||||||
|     int64_t n_input_plane = weight.size(1); |     int64_t n_input_plane = weight.size(1); | ||||||
|     if (weight.dim() == 2) { |     if (weight.dim() == 2) { | ||||||
|  | |||||||
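One side of the slow_conv3d shape-check hunks above carries guards that bound the padding values and verify that kernel_height * kernel_width does not overflow. A standalone sketch of such an overflow-checked multiply, using the GCC/Clang builtin rather than the c10::mul_overflows helper:

    #include <cstdint>
    #include <cstdio>
    #include <stdexcept>

    uint64_t checked_mul(uint64_t a, uint64_t b) {
      uint64_t out = 0;
      // __builtin_mul_overflow returns true when the product does not fit in out.
      if (__builtin_mul_overflow(a, b, &out)) {
        throw std::overflow_error("kernel height x width product is too large");
      }
      return out;
    }

    int main() {
      std::printf("%llu\n", (unsigned long long)checked_mul(1u << 16, 1u << 16));  // 4294967296
    }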
| @ -97,38 +97,43 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { | |||||||
|   int64_t nDims = self.dim(); |   int64_t nDims = self.dim(); | ||||||
|   TORCH_CHECK(nDims >= 2, "dimensions must larger than 1"); |   TORCH_CHECK(nDims >= 2, "dimensions must larger than 1"); | ||||||
|  |  | ||||||
|   auto height = self.sym_size(0); |   int64_t height = self.size(0); | ||||||
|   auto width = self.sym_size(1); |   int64_t width = self.size(1); | ||||||
|  |  | ||||||
|   if (nDims > 2) { |   if (nDims > 2) { | ||||||
|  |     int64_t dim1 = height; | ||||||
|     for (const auto i : c10::irange(1, nDims)) { |     for (const auto i : c10::irange(1, nDims)) { | ||||||
|       if (self.sym_size(i) != height) { |       if (self.size(i) != dim1) { | ||||||
|         TORCH_CHECK(false, "all dimensions of input must be of equal length"); |         TORCH_CHECK(false, "all dimensions of input must be of equal length"); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   auto storage_offset = self.sym_storage_offset(); |   int64_t storage_offset = self.storage_offset(); | ||||||
|   auto size = std::min(height, width); |   std::vector<int64_t> sizes; | ||||||
|  |   std::vector<int64_t> strides; | ||||||
|  |   int64_t size = std::min(height, width); | ||||||
|  |  | ||||||
|   int64_t stride = 0; |   int64_t stride = 0; | ||||||
|   for (const auto i : c10::irange(nDims)) { |   for (const auto i : c10::irange(nDims)) { | ||||||
|     stride += self.stride(i); |     stride += self.stride(i); | ||||||
|   } |   } | ||||||
|   std::vector<SymInt> strides{stride}; |   strides.push_back(stride); | ||||||
|   std::vector<SymInt> sizes{size}; |   sizes.push_back(size); | ||||||
|  |  | ||||||
|   auto main_diag = self.as_strided_symint(sizes, strides, storage_offset); |   auto main_diag = self.as_strided(sizes, strides, storage_offset); | ||||||
|   main_diag.fill_(fill_value); |   main_diag.fill_(fill_value); | ||||||
|  |  | ||||||
|   if (wrap && nDims == 2 && height > width + 1) { |   if (wrap && nDims == 2 && height > width + 1) { | ||||||
|     auto step = width + 1; |     std::vector<int64_t> wrap_sizes; | ||||||
|     auto wrap_size = ((self.numel() + step - 1) / step) - size; |  | ||||||
|     std::vector<SymInt> wrap_sizes{wrap_size}; |  | ||||||
|  |  | ||||||
|     auto offset = self.stride(0) * (width + 1); |     int64_t step = width + 1; | ||||||
|  |     int64_t wrap_size = ((self.numel() + step - 1) / step) - size; | ||||||
|  |     wrap_sizes.push_back(wrap_size); | ||||||
|  |  | ||||||
|     auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset); |     int64_t offset = self.stride(0) * (width + 1); | ||||||
|  |  | ||||||
|  |     auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset); | ||||||
|     wrap_diag.fill_(fill_value); |     wrap_diag.fill_(fill_value); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  | |||||||
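Both sides of the fill_diagonal_ hunk above build the diagonal as a strided view: it has min(height, width) elements, its stride is the sum of all of the tensor's strides, and when wrap is set a second view covers the diagonals that restart every width + 1 flattened elements. A standalone sketch of that index arithmetic for a row-major h x w matrix (plain C++, no ATen):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t h = 7, w = 3;                        // tall matrix, so wrap applies
      const int64_t numel = h * w;
      const int64_t stride0 = w, stride1 = 1;            // row-major strides

      const int64_t diag_size = std::min(h, w);          // main diagonal length
      const int64_t diag_stride = stride0 + stride1;     // step between diagonal elements

      const int64_t step = w + 1;                        // wrapped diagonals restart every w+1 elements
      const int64_t wrap_size = (numel + step - 1) / step - diag_size;
      const int64_t wrap_offset = stride0 * (w + 1);     // storage offset of the wrapped part

      std::printf("diag: size=%lld stride=%lld\n", (long long)diag_size, (long long)diag_stride);
      std::printf("wrap: size=%lld offset=%lld\n", (long long)wrap_size, (long long)wrap_offset);
    }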
| @ -23,7 +23,6 @@ | |||||||
| #include <ATen/ops/linspace.h> | #include <ATen/ops/linspace.h> | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| #include <cmath> |  | ||||||
| #include <numeric> | #include <numeric> | ||||||
| #include <tuple> | #include <tuple> | ||||||
| #include <vector> | #include <vector> | ||||||
| @ -203,46 +202,6 @@ select_outer_bin_edges(const Tensor& input, std::optional<c10::ArrayRef<double>> | |||||||
|     return std::make_pair(leftmost_edges, rightmost_edges); |     return std::make_pair(leftmost_edges, rightmost_edges); | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| /* Bin edges correction based on the precision representation. |  | ||||||
|  * To maintain the backward compatibility we take max(std::nextafter<>, +1) |  | ||||||
|  * and min(std::nextafter<>, -1) for scalar types. For other types +/- 1 as usual. |  | ||||||
|  */ |  | ||||||
| void bins_edges_correction(const ScalarType& t, double &leftmost_edge, double &rightmost_edge) |  | ||||||
| { |  | ||||||
| #define UPDATE_WITH_LIMIT(real_type, scalartype) \ |  | ||||||
|   case ScalarType::scalartype:                   \ |  | ||||||
|     leftmost_edge = std::min(                    \ |  | ||||||
|         static_cast<double>(                     \ |  | ||||||
|             std::nexttoward(                     \ |  | ||||||
|                 static_cast<real_type>(leftmost_edge),   \ |  | ||||||
|                 std::numeric_limits<real_type>::lowest() \ |  | ||||||
|             )                                    \ |  | ||||||
|         ),                                       \ |  | ||||||
|         leftmost_edge - 1.                       \ |  | ||||||
|     );                                           \ |  | ||||||
|     rightmost_edge = std::max(                   \ |  | ||||||
|         static_cast<double>(                     \ |  | ||||||
|             std::nexttoward(                     \ |  | ||||||
|                 static_cast<real_type>(rightmost_edge), \ |  | ||||||
|                 std::numeric_limits<real_type>::max()   \ |  | ||||||
|             )                                    \ |  | ||||||
|         ),                                       \ |  | ||||||
|         rightmost_edge + 1.                      \ |  | ||||||
|     );                                           \ |  | ||||||
|     break; |  | ||||||
|  |  | ||||||
|     switch (t) { |  | ||||||
|         UPDATE_WITH_LIMIT(double, Double) |  | ||||||
|         UPDATE_WITH_LIMIT(float, Float) |  | ||||||
|         default: |  | ||||||
|             // Fallback to the default behavior for other types |  | ||||||
|             leftmost_edge -= 1; |  | ||||||
|             rightmost_edge += 1; |  | ||||||
|     } |  | ||||||
| #undef UPDATE_WITH_LIMIT |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /* histc's version of the logic for outermost bin edges. | /* histc's version of the logic for outermost bin edges. | ||||||
|  */ |  */ | ||||||
| std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input, | std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input, | ||||||
| @ -257,7 +216,8 @@ std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input, | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (leftmost_edge == rightmost_edge) { |     if (leftmost_edge == rightmost_edge) { | ||||||
|         bins_edges_correction(input.dtype().toScalarType(), leftmost_edge, rightmost_edge); |         leftmost_edge -= 1; | ||||||
|  |         rightmost_edge += 1; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     TORCH_CHECK(!(std::isinf(leftmost_edge) || std::isinf(rightmost_edge) || |     TORCH_CHECK(!(std::isinf(leftmost_edge) || std::isinf(rightmost_edge) || | ||||||
|  | |||||||
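The bins_edges_correction routine in the hunk above widens a degenerate [x, x] histogram range by at least 1.0, falling back to one ULP via std::nexttoward when the magnitude is so large that adding or subtracting 1 would not change the value. A standalone double-precision sketch of that correction (plain C++, not the dispatch macro):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <limits>

    void widen_degenerate_edges(double& left, double& right) {
      // Move each edge outward by 1.0, or by at least one ULP when +/- 1.0
      // would round away to no change at very large magnitudes.
      left  = std::min(std::nextafter(left,  std::numeric_limits<double>::lowest()), left - 1.0);
      right = std::max(std::nextafter(right, std::numeric_limits<double>::max()),    right + 1.0);
    }

    int main() {
      double lo = 5.0, hi = 5.0;
      widen_degenerate_edges(lo, hi);
      std::printf("[%g, %g]\n", lo, hi);  // [4, 6]
    }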
| @ -23,6 +23,8 @@ Tensor& max_unpooling2d_forward_out_cpu( | |||||||
|   // Nondeterministic with duplicate indices |   // Nondeterministic with duplicate indices | ||||||
|   at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); |   at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); | ||||||
|  |  | ||||||
|  |   auto oheight = output_size[0]; | ||||||
|  |   auto owidth = output_size[1]; | ||||||
|   TORCH_CHECK( |   TORCH_CHECK( | ||||||
|       indices_.scalar_type() == at::ScalarType::Long, |       indices_.scalar_type() == at::ScalarType::Long, | ||||||
|       "elements in indices should be type int64 but got: ", indices_.scalar_type()); |       "elements in indices should be type int64 but got: ", indices_.scalar_type()); | ||||||
| @ -43,9 +45,6 @@ Tensor& max_unpooling2d_forward_out_cpu( | |||||||
|                 self_.sizes(), " with dimension ", i , " being empty."); |                 self_.sizes(), " with dimension ", i , " being empty."); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   auto oheight = output_size[0]; |  | ||||||
|   auto owidth = output_size[1]; |  | ||||||
|  |  | ||||||
|   auto memory_format = self_.suggest_memory_format(); |   auto memory_format = self_.suggest_memory_format(); | ||||||
|   auto self = self_.contiguous(memory_format); |   auto self = self_.contiguous(memory_format); | ||||||
|   auto indices = indices_.contiguous(memory_format); |   auto indices = indices_.contiguous(memory_format); | ||||||
|  | |||||||
| @ -1,5 +1,3 @@ | |||||||
| #include <ATen/core/ATen_fwd.h> |  | ||||||
| #include <c10/core/ScalarType.h> |  | ||||||
| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS | #define TORCH_ASSERT_ONLY_METHOD_OPERATORS | ||||||
| #include <ATen/AccumulateType.h> | #include <ATen/AccumulateType.h> | ||||||
| #include <ATen/Dispatch.h> | #include <ATen/Dispatch.h> | ||||||
| @ -1880,18 +1878,19 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { | |||||||
|  |  | ||||||
|   Tensor xtensor = self.expand(padded_size); |   Tensor xtensor = self.expand(padded_size); | ||||||
|  |  | ||||||
|   Tensor urtensor; |   Tensor result; | ||||||
|   if (self.is_quantized()) { |   if (self.is_quantized()) { | ||||||
|     urtensor = at::empty_quantized(target_size, self); |     result = at::empty_quantized(target_size, self); | ||||||
|   } else { |   } else { | ||||||
|     urtensor = at::empty(target_size, self.options()); |     result = at::empty(target_size, self.options()); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // return an empty tensor if one of the repeat dimensions is zero |   // return an empty tensor if one of the repeat dimensions is zero | ||||||
|   if (zero_tensor) { |   if (zero_tensor) { | ||||||
|     return urtensor; |     return result; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   Tensor urtensor = at::alias(result); | ||||||
|   for (const auto i : c10::irange(xtensor.dim())) { |   for (const auto i : c10::irange(xtensor.dim())) { | ||||||
|     // can't unfold with step 0, so make sure step is at least 1 |     // can't unfold with step 0, so make sure step is at least 1 | ||||||
|     // (it doesn't matter what it is in that case, because the size is 0). |     // (it doesn't matter what it is in that case, because the size is 0). | ||||||
| @ -1901,22 +1900,7 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { | |||||||
|  |  | ||||||
|   urtensor.copy_(xtensor.expand_as(urtensor)); |   urtensor.copy_(xtensor.expand_as(urtensor)); | ||||||
|  |  | ||||||
|   // Combine the dimensions to produce the target_size. |   return result; | ||||||
|   // xtensor dims: [a0, ..., ad-1] |  | ||||||
|   // urtensor dims: [a0, ..., ad-1, b0, ..., bd-1] |  | ||||||
|   // b dims are produced by unfold. |  | ||||||
|   // Transform urtensor to [a0 * b0, ..., ad-1 * bd-1] |  | ||||||
|   const int64_t n_dims = xtensor.dim(); |  | ||||||
|   auto range_a = at::arange(xtensor.dim(), at::TensorOptions(at::kLong)); |  | ||||||
|   auto range_b = range_a + n_dims; |  | ||||||
|   auto stacked = stack({std::move(range_a), std::move(range_b)}, 1).flatten(); |  | ||||||
|   auto permutation = IntArrayRef(stacked.data_ptr<int64_t>(), n_dims * 2); |  | ||||||
|   // Permute from [a0, ..., ad-1, b0, ..., bd-1] to [a0, b0, ..., ad-1, bd-1] |  | ||||||
|   urtensor = urtensor.permute(permutation); |  | ||||||
|   // Reshape from [a0, b0, ..., ad-1, bd-1] to [a0 * b0, ..., ad-1 * bd-1] |  | ||||||
|   urtensor = urtensor.reshape(target_size); |  | ||||||
|  |  | ||||||
|   return urtensor; |  | ||||||
| } | } | ||||||
|  |  | ||||||
| Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) { | Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) { | ||||||
|  | |||||||
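One side of the repeat() hunk above materializes the result through unfold, which leaves the tensor with dimensions [a0, ..., ad-1, b0, ..., bd-1]; a permutation then interleaves them as [a0, b0, ..., ad-1, bd-1] before the final reshape to [a0*b0, ..., ad-1*bd-1]. A standalone sketch of that interleaving permutation (the arange/stack/flatten trick written out directly):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    std::vector<int64_t> interleave_permutation(int64_t n_dims) {
      std::vector<int64_t> perm;
      perm.reserve(2 * n_dims);
      for (int64_t i = 0; i < n_dims; ++i) {
        perm.push_back(i);            // original dim a_i
        perm.push_back(i + n_dims);   // matching unfolded dim b_i
      }
      return perm;
    }

    int main() {
      for (int64_t p : interleave_permutation(3)) {
        std::printf("%lld ", (long long)p);   // prints: 0 3 1 4 2 5
      }
      std::printf("\n");
    }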
| @ -999,41 +999,12 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { | |||||||
|       dtypes[i] = iter.dtype(i); |       dtypes[i] = iter.dtype(i); | ||||||
|     } |     } | ||||||
|     auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter); |     auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter); | ||||||
| #ifdef USE_ROCM |  | ||||||
|     constexpr int grp_sz = 128; |  | ||||||
|     launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) { |  | ||||||
|       if (unrl) { |  | ||||||
|         auto offsets0 = offset_calc.get(idx); |  | ||||||
|         auto offsets1 = offset_calc.get(idx + grp_sz); |  | ||||||
|         auto offsets2 = offset_calc.get(idx + grp_sz * 2); |  | ||||||
|         auto offsets3 = offset_calc.get(idx + grp_sz * 3); |  | ||||||
|         void* out0 = data[0] + offsets0[0]; |  | ||||||
|         void* out1 = data[0] + offsets1[0]; |  | ||||||
|         void* out2 = data[0] + offsets2[0]; |  | ||||||
|         void* out3 = data[0] + offsets3[0]; |  | ||||||
|         arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1); |  | ||||||
|         arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1); |  | ||||||
|         arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1); |  | ||||||
|         arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1); |  | ||||||
|         c10::cast_and_store<arg0_t>(dtypes[0], out0, result0); |  | ||||||
|         c10::cast_and_store<arg0_t>(dtypes[0], out1, result1); |  | ||||||
|         c10::cast_and_store<arg0_t>(dtypes[0], out2, result2); |  | ||||||
|         c10::cast_and_store<arg0_t>(dtypes[0], out3, result3); |  | ||||||
|       } else { |  | ||||||
|         auto offsets = offset_calc.get(idx); |  | ||||||
|         void* out = data[0] + offsets[0]; |  | ||||||
|         arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); |  | ||||||
|         c10::cast_and_store<arg0_t>(dtypes[0], out, result); |  | ||||||
|       } |  | ||||||
|     }); |  | ||||||
| #else |  | ||||||
|     launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { |     launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { | ||||||
|       auto offsets = offset_calc.get(idx); |       auto offsets = offset_calc.get(idx); | ||||||
|       void* out = data[0] + offsets[0]; |       void* out = data[0] + offsets[0]; | ||||||
|       arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); |       arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); | ||||||
|       c10::cast_and_store<arg0_t>(dtypes[0], out, result); |       c10::cast_and_store<arg0_t>(dtypes[0], out, result); | ||||||
|     }); |     }); | ||||||
| #endif |  | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | |||||||
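One side of the gpu_kernel_impl hunk above unrolls the dynamic-casting elementwise loop by four on ROCm, with the four elements spaced one block width (128 threads) apart, while the other side launches the plain <128, 4> kernel. A CPU-side sketch of the unroll-plus-scalar-tail shape, using a contiguous stride for simplicity rather than the block-strided GPU indexing:

    #include <cstdio>
    #include <vector>

    void add_one_unrolled(const float* in, float* out, int n) {
      constexpr int grp = 4;
      int i = 0;
      for (; i + grp <= n; i += grp) {   // unrolled body: 4 independent elements
        out[i + 0] = in[i + 0] + 1.0f;
        out[i + 1] = in[i + 1] + 1.0f;
        out[i + 2] = in[i + 2] + 1.0f;
        out[i + 3] = in[i + 3] + 1.0f;
      }
      for (; i < n; ++i) {               // scalar tail, mirrors the non-unrolled path
        out[i] = in[i] + 1.0f;
      }
    }

    int main() {
      std::vector<float> in(10, 2.0f), out(10);
      add_one_unrolled(in.data(), out.data(), 10);
      std::printf("%g %g\n", out[0], out[9]);  // 3 3
    }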
| @ -42,19 +42,6 @@ void bfloat16_copy_kernel_cuda(TensorIteratorBase &iter) { | |||||||
|     }); |     }); | ||||||
| } | } | ||||||
|  |  | ||||||
| #ifdef USE_ROCM |  | ||||||
| void bfloat16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) { |  | ||||||
|     gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::BFloat16 value) { |  | ||||||
|         return static_cast<float>(value); |  | ||||||
|     }); |  | ||||||
| } |  | ||||||
| void float16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) { |  | ||||||
|     gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::Half value) { |  | ||||||
|         return static_cast<float>(value); |  | ||||||
|     }); |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| void float8_copy_kernel_cuda(TensorIteratorBase &iter) { | void float8_copy_kernel_cuda(TensorIteratorBase &iter) { | ||||||
|   ScalarType dtype = iter.dtype(0); |   ScalarType dtype = iter.dtype(0); | ||||||
|   ScalarType other_dtype = iter.dtype(1); |   ScalarType other_dtype = iter.dtype(1); | ||||||
| @ -200,17 +187,7 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) { | |||||||
|      } else { |      } else { | ||||||
|        float16_copy_kernel_cuda(iter); |        float16_copy_kernel_cuda(iter); | ||||||
|      } |      } | ||||||
|   } |   } else if (isBitsType(dtype)) { | ||||||
| #ifdef USE_ROCM |  | ||||||
|   else if ((iter.dtype(1) == kBFloat16 || iter.dtype(1) == kHalf) && dtype == kFloat) { |  | ||||||
|     if (iter.dtype(1) == kBFloat16) { |  | ||||||
|       bfloat16tofloat32_copy_kernel_cuda(iter); |  | ||||||
|     } else { |  | ||||||
|       float16tofloat32_copy_kernel_cuda(iter); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| #endif |  | ||||||
|   else if (isBitsType(dtype)) { |  | ||||||
|     TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting " |     TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting " | ||||||
|       "bits types to different bits types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype); |       "bits types to different bits types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype); | ||||||
|     AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] { |     AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] { | ||||||
|  | |||||||
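The ROCm-only copy kernels in the hunk above widen BFloat16 (and Half) sources to float during copy_. BFloat16 is simply the top 16 bits of an IEEE-754 single, so the widening is a 16-bit left shift of the raw bits; a standalone sketch:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    float bfloat16_bits_to_float(uint16_t bits) {
      const uint32_t widened = static_cast<uint32_t>(bits) << 16;  // zero-fill the low mantissa bits
      float out;
      std::memcpy(&out, &widened, sizeof(out));
      return out;
    }

    int main() {
      // 0x3FC0 = sign 0, exponent 127, mantissa 0b1000000 -> 1.5
      std::printf("%g\n", bfloat16_bits_to_float(0x3FC0));  // 1.5
    }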
| @ -5,7 +5,6 @@ | |||||||
| #include <array> | #include <array> | ||||||
| #include <type_traits> | #include <type_traits> | ||||||
| #include <ATen/core/TensorBase.h> | #include <ATen/core/TensorBase.h> | ||||||
| #include <ATen/ceil_div.h> |  | ||||||
| #include <ATen/Dispatch.h> | #include <ATen/Dispatch.h> | ||||||
| #include <ATen/Dispatch_v2.h> | #include <ATen/Dispatch_v2.h> | ||||||
| #include <ATen/cuda/CUDAContext.h> | #include <ATen/cuda/CUDAContext.h> | ||||||
| @ -84,17 +83,11 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co | |||||||
|         auto ind_dim_size = index_size[0]; |         auto ind_dim_size = index_size[0]; | ||||||
|         auto inp_stride_bytes = index_stride[0]; |         auto inp_stride_bytes = index_stride[0]; | ||||||
|         auto out_stride_bytes = iter.strides(0)[1]; |         auto out_stride_bytes = iter.strides(0)[1]; | ||||||
|         // avoid grid overflow in the fast kernel |         if (iter.numel() == 0) return; | ||||||
|         const int64_t vec_chunks = ceil_div(slice_size, alignment); |         at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind, | ||||||
|         const int64_t blocks_per_slice_upper = ceil_div(vec_chunks, (int64_t)launch_size_nd); |         slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true); | ||||||
|         const int max_grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; |         return; | ||||||
|         // if it's an eligible grid we use the fast path, otherwise default to slower path |       } | ||||||
|         if (blocks_per_slice_upper <= max_grid_y) { |  | ||||||
|           at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind, |  | ||||||
|           slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true); |  | ||||||
|           return; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   auto sizes = std::array<int64_t, MAX_DIMS>{}; |   auto sizes = std::array<int64_t, MAX_DIMS>{}; | ||||||
|  | |||||||
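The guard in the gpu_index_kernel hunk above decides whether the vectorized gather fast path is safe: each slice is tiled over grid.y, and grid.y is capped (65535 on current NVIDIA GPUs), so the upper bound on blocks per slice has to fit. A standalone sketch of that arithmetic with illustrative constants:

    #include <cstdint>
    #include <cstdio>

    constexpr int64_t ceil_div(int64_t a, int64_t b) { return (a + b - 1) / b; }

    bool fits_fast_gather(int64_t slice_size_bytes, int64_t alignment,
                          int64_t threads_per_block, int64_t max_grid_y) {
      const int64_t vec_chunks = ceil_div(slice_size_bytes, alignment);
      const int64_t blocks_per_slice = ceil_div(vec_chunks, threads_per_block);
      return blocks_per_slice <= max_grid_y;
    }

    int main() {
      std::printf("%d\n", fits_fast_gather(1 << 20, 16, 128, 65535));            // 1: fits
      std::printf("%d\n", fits_fast_gather(int64_t(1) << 40, 16, 128, 65535));   // 0: too large
    }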
| @ -125,6 +125,8 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_, | |||||||
|   TORCH_CHECK( |   TORCH_CHECK( | ||||||
|       indices_.scalar_type() == at::ScalarType::Long, |       indices_.scalar_type() == at::ScalarType::Long, | ||||||
|       "elements in indices should be type int64 but got: ", indices_.scalar_type()); |       "elements in indices should be type int64 but got: ", indices_.scalar_type()); | ||||||
|  |   auto oheight = output_size[0]; | ||||||
|  |   auto owidth = output_size[1]; | ||||||
|  |  | ||||||
|   TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2}, |   TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2}, | ||||||
|       indices_arg{indices_, "indices_", 3}; |       indices_arg{indices_, "indices_", 3}; | ||||||
| @ -147,9 +149,6 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_, | |||||||
|       output_size.size() == 2, |       output_size.size() == 2, | ||||||
|       "There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements."); |       "There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements."); | ||||||
|  |  | ||||||
|   auto oheight = output_size[0]; |  | ||||||
|   auto owidth = output_size[1]; |  | ||||||
|  |  | ||||||
|   int64_t dimw = 2; |   int64_t dimw = 2; | ||||||
|   int64_t dimh = 1; |   int64_t dimh = 1; | ||||||
|   int64_t numBatch = 1; |   int64_t numBatch = 1; | ||||||
| @ -218,6 +217,9 @@ static void max_unpooling3d_shape_check( | |||||||
|     IntArrayRef stride, |     IntArrayRef stride, | ||||||
|     IntArrayRef padding, |     IntArrayRef padding, | ||||||
|     const char *fn_name) { |     const char *fn_name) { | ||||||
|  |   int64_t oT = output_size[0]; | ||||||
|  |   int64_t oH = output_size[1]; | ||||||
|  |   int64_t oW = output_size[2]; | ||||||
|   TORCH_CHECK( |   TORCH_CHECK( | ||||||
|       indices.scalar_type() == at::ScalarType::Long, |       indices.scalar_type() == at::ScalarType::Long, | ||||||
|       "elements in indices should be type int64 but got: ", indices.scalar_type()); |       "elements in indices should be type int64 but got: ", indices.scalar_type()); | ||||||
| @ -248,10 +250,6 @@ static void max_unpooling3d_shape_check( | |||||||
|       "strides should be greater than zero, but got stride: ", |       "strides should be greater than zero, but got stride: ", | ||||||
|       stride); |       stride); | ||||||
|  |  | ||||||
|   int64_t oT = output_size[0]; |  | ||||||
|   int64_t oH = output_size[1]; |  | ||||||
|   int64_t oW = output_size[2]; |  | ||||||
|  |  | ||||||
|   int dimw = 3; |   int dimw = 3; | ||||||
|   int dimh = 2; |   int dimh = 2; | ||||||
|   int dimt = 1; |   int dimt = 1; | ||||||
| @ -404,6 +402,8 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_, | |||||||
|     const Tensor& indices_, |     const Tensor& indices_, | ||||||
|     IntArrayRef output_size, |     IntArrayRef output_size, | ||||||
|     Tensor& grad_input) { |     Tensor& grad_input) { | ||||||
|  |   int64_t oheight = output_size[0]; | ||||||
|  |   int64_t owidth = output_size[1]; | ||||||
|   TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); |   TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); | ||||||
|   TORCH_CHECK( |   TORCH_CHECK( | ||||||
|       indices_.scalar_type() == at::ScalarType::Long, |       indices_.scalar_type() == at::ScalarType::Long, | ||||||
| @ -426,9 +426,6 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_, | |||||||
|  |  | ||||||
|   TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size()); |   TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size()); | ||||||
|  |  | ||||||
|   int64_t oheight = output_size[0]; |  | ||||||
|   int64_t owidth = output_size[1]; |  | ||||||
|  |  | ||||||
|   int64_t nInputCols, nInputRows, nInputPlane; |   int64_t nInputCols, nInputRows, nInputPlane; | ||||||
|  |  | ||||||
|   int dimw = 2; |   int dimw = 2; | ||||||
| @ -508,14 +505,13 @@ at::Tensor& max_unpooling3d_backward_out_cuda(const Tensor& grad_output_, | |||||||
|     IntArrayRef padding, |     IntArrayRef padding, | ||||||
|     Tensor& grad_input) { |     Tensor& grad_input) { | ||||||
|   TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); |   TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); | ||||||
|  |  | ||||||
|   max_unpooling3d_shape_check( |  | ||||||
|     self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()"); |  | ||||||
|  |  | ||||||
|   int64_t oT = output_size[0]; |   int64_t oT = output_size[0]; | ||||||
|   int64_t oH = output_size[1]; |   int64_t oH = output_size[1]; | ||||||
|   int64_t oW = output_size[2]; |   int64_t oW = output_size[2]; | ||||||
|  |  | ||||||
|  |   max_unpooling3d_shape_check( | ||||||
|  |     self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()"); | ||||||
|  |  | ||||||
|   int batchSize = 0; |   int batchSize = 0; | ||||||
|   int inputSlices = 0; |   int inputSlices = 0; | ||||||
|   int inputTime = 0; |   int inputTime = 0; | ||||||
|  | |||||||
| @ -300,6 +300,8 @@ void nonzero_static_cuda_out_impl( | |||||||
|     int64_t size, |     int64_t size, | ||||||
|     int64_t fill_value, |     int64_t fill_value, | ||||||
|     Tensor& out) { |     Tensor& out) { | ||||||
|  | #if defined(CUDA_VERSION) || defined(USE_ROCM) | ||||||
|  |  | ||||||
|   Tensor self_contiguous_ = self.contiguous(); |   Tensor self_contiguous_ = self.contiguous(); | ||||||
|   // see comment in nonzero_cuda_out_impl on reqs for out |   // see comment in nonzero_cuda_out_impl on reqs for out | ||||||
|   bool out_correct_size = |   bool out_correct_size = | ||||||
| @ -375,6 +377,9 @@ void nonzero_static_cuda_out_impl( | |||||||
|   if (need_to_copy) { |   if (need_to_copy) { | ||||||
|     out.copy_(out_temp); |     out.copy_(out_temp); | ||||||
|   } |   } | ||||||
|  | #else | ||||||
|  |   TORCH_CHECK(false, "Nonzero_static is not supported for cuda <= 11.4"); | ||||||
|  | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
| Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) { | Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) { | ||||||
|  | |||||||
| @ -221,9 +221,22 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_ | |||||||
|   std::optional<CuFFTConfig> uncached_plan; |   std::optional<CuFFTConfig> uncached_plan; | ||||||
|   const CuFFTConfig * config = nullptr; |   const CuFFTConfig * config = nullptr; | ||||||
|  |  | ||||||
|  |   // Workaround for gh-63152, gh-58724 | ||||||
|  |   // Bluestein plans in CUDA 11.1 (cufft 10.3) cannot be re-used | ||||||
|   // Bluestein's algorithm is only used when a size has large prime factors, |   // Bluestein's algorithm is only used when a size has large prime factors, | ||||||
|   // sizes with only small prime factors can still be cached |   // sizes with only small prime factors can still be cached | ||||||
|   if (plan_cache.max_size() > 0) { |   bool use_caching = true; | ||||||
|  | #ifdef CUFFT_VERSION | ||||||
|  |   if constexpr (10300 <= CUFFT_VERSION && CUFFT_VERSION < 10400) { | ||||||
|  |     // Only cache plans for transforms with small prime factors | ||||||
|  |     use_caching = std::none_of( | ||||||
|  |         signal_size.begin() + 1, signal_size.end(), [](int64_t dim_size) { | ||||||
|  |       return has_large_prime_factor(dim_size); | ||||||
|  |     }); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   if (use_caching && plan_cache.max_size() > 0) { | ||||||
|     guard.lock(); |     guard.lock(); | ||||||
|     if (plan_cache.max_size() > 0) {  // check again after acquiring the lock |     if (plan_cache.max_size() > 0) {  // check again after acquiring the lock | ||||||
|       config = &plan_cache.lookup(Params); |       config = &plan_cache.lookup(Params); | ||||||
|  | |||||||
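The cuFFT workaround in the hunk above only caches plans whose signal sizes have no large prime factors, because Bluestein's algorithm is used exactly when a size has a prime factor outside cuFFT's native radices. A standalone sketch of such a test; the radix set {2, 3, 5, 7} is an assumption here and the real has_large_prime_factor helper may differ:

    #include <cstdint>
    #include <cstdio>

    bool has_large_prime_factor(int64_t n) {
      for (int64_t p : {2, 3, 5, 7}) {
        while (n % p == 0) {
          n /= p;
        }
      }
      return n > 1;  // whatever remains is a prime factor larger than 7
    }

    int main() {
      std::printf("%d %d\n", has_large_prime_factor(1024), has_large_prime_factor(1021));  // 0 1
    }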
| @ -1238,7 +1238,7 @@ Tensor _cholesky_solve_helper_cuda_magma(const Tensor& self, const Tensor& A, bo | |||||||
| // Todo: cusolverDn<T>potrsBatched only supports nrhs == 1 and does not have good performance. | // Todo: cusolverDn<T>potrsBatched only supports nrhs == 1 and does not have good performance. | ||||||
| //     Batched cholesky_solve is dispatched to magma. | //     Batched cholesky_solve is dispatched to magma. | ||||||
| Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) { | Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) { | ||||||
| #if defined(USE_LINALG_SOLVER) | #if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM) | ||||||
|   auto preferred_backend = at::globalContext().linalgPreferredBackend(); |   auto preferred_backend = at::globalContext().linalgPreferredBackend(); | ||||||
|   switch (preferred_backend) { |   switch (preferred_backend) { | ||||||
|     case at::LinalgBackend::Cusolver: |     case at::LinalgBackend::Cusolver: | ||||||
| @ -1352,7 +1352,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info) | |||||||
| } | } | ||||||
|  |  | ||||||
| static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) { | static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) { | ||||||
| #if defined(USE_LINALG_SOLVER) | #if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM) | ||||||
|   auto preferred_backend = at::globalContext().linalgPreferredBackend(); |   auto preferred_backend = at::globalContext().linalgPreferredBackend(); | ||||||
|   switch (preferred_backend) { |   switch (preferred_backend) { | ||||||
|     case at::LinalgBackend::Cusolver: |     case at::LinalgBackend::Cusolver: | ||||||
| @ -2709,7 +2709,7 @@ void linalg_lstsq_gels(const Tensor& A, const Tensor& B, const Tensor& /*infos*/ | |||||||
| } | } | ||||||
|  |  | ||||||
| void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) { | void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) { | ||||||
| #if defined(USE_LINALG_SOLVER) | #if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM) | ||||||
|   auto preferred_backend = at::globalContext().linalgPreferredBackend(); |   auto preferred_backend = at::globalContext().linalgPreferredBackend(); | ||||||
|   switch (preferred_backend) { |   switch (preferred_backend) { | ||||||
|     case at::LinalgBackend::Magma: |     case at::LinalgBackend::Magma: | ||||||
| @ -2733,7 +2733,7 @@ void lstsq_kernel(const Tensor& a, Tensor& b, Tensor& /*rank*/, Tensor& /*singul | |||||||
|   // first handle the underdetermined case (m < n) |   // first handle the underdetermined case (m < n) | ||||||
|   // this case is not supported by MAGMA or cuBLAS |   // this case is not supported by MAGMA or cuBLAS | ||||||
|   if (m < n) { |   if (m < n) { | ||||||
| #if defined(USE_LINALG_SOLVER) | #if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM) | ||||||
|     linalg_lstsq_gels(a, b, infos); |     linalg_lstsq_gels(a, b, infos); | ||||||
| #else | #else | ||||||
|     TORCH_CHECK( |     TORCH_CHECK( | ||||||
|  | |||||||
| @ -362,11 +362,7 @@ Tensor rms_norm_symint( | |||||||
|     return std::get<0>(rms_norm_composite(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps)); |     return std::get<0>(rms_norm_composite(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps)); | ||||||
|   } |   } | ||||||
|   #endif |   #endif | ||||||
|   if (input.device().type() == DeviceType::CUDA) { |   return std::get<0>(at::_fused_rms_norm(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps)); | ||||||
|     return std::get<0>(at::_fused_rms_norm(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps)); |  | ||||||
|   } else { |  | ||||||
|     return std::get<0>(rms_norm_composite(input, IntArrayRef(reinterpret_cast<const int64_t*>(normalized_shape.data()), normalized_shape.size()), weight_opt, eps)); |  | ||||||
|   } |  | ||||||
| } | } | ||||||
|  |  | ||||||
| } // namespace at::native | } // namespace at::native | ||||||
|  | |||||||
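Both branches in the rms_norm_symint hunk above compute the same RMSNorm, y_i = x_i * w_i / sqrt(mean_j(x_j^2) + eps), just through different kernels (fused vs. composite). A standalone scalar sketch of that formula over the normalized dimension:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    std::vector<double> rms_norm(const std::vector<double>& x,
                                 const std::vector<double>& w, double eps) {
      double mean_sq = 0.0;
      for (double v : x) {
        mean_sq += v * v;
      }
      mean_sq /= static_cast<double>(x.size());
      const double inv_rms = 1.0 / std::sqrt(mean_sq + eps);

      std::vector<double> y(x.size());
      for (size_t i = 0; i < x.size(); ++i) {
        y[i] = x[i] * inv_rms * w[i];
      }
      return y;
    }

    int main() {
      auto y = rms_norm({1.0, 2.0, 2.0}, {1.0, 1.0, 1.0}, 0.0);
      std::printf("%g %g %g\n", y[0], y[1], y[2]);  // ~0.577 ~1.155 ~1.155
    }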
| @ -14,7 +14,7 @@ struct EmbeddingBagParams { | |||||||
|   ::c10::metal::array<idx_type_t, 2> output_strides; |   ::c10::metal::array<idx_type_t, 2> output_strides; | ||||||
|   ::c10::metal::array<idx_type_t, 2> max_indices_strides; |   ::c10::metal::array<idx_type_t, 2> max_indices_strides; | ||||||
|  |  | ||||||
|   idx_type_t per_sample_weights_stride; |   idx_type_t per_sample_weights_strides; | ||||||
|  |  | ||||||
|   idx_type_t num_indices; |   idx_type_t num_indices; | ||||||
|   idx_type_t num_bags; |   idx_type_t num_bags; | ||||||
|  | |||||||
| @ -23,72 +23,54 @@ struct ReductionOpInit<EmbeddingBagMode::MAX, T> { | |||||||
| template <EmbeddingBagMode M, typename T> | template <EmbeddingBagMode M, typename T> | ||||||
| struct ReductionOp { | struct ReductionOp { | ||||||
|   inline opmath_t<T> operator()( |   inline opmath_t<T> operator()( | ||||||
|       opmath_t<T> weight_val, |       T weight_val, | ||||||
|       opmath_t<T> out_val, |       opmath_t<T> out_val, | ||||||
|       bool is_first) { |       uint32_t per_sample_weights_index, | ||||||
|     return weight_val + out_val; |       constant T* per_sample_weights, | ||||||
|  |       uint32_t per_sample_weights_strides); | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | template <typename T> | ||||||
|  | struct ReductionOp<EmbeddingBagMode::SUM, T> { | ||||||
|  |   inline opmath_t<T> operator()( | ||||||
|  |       T weight_val, | ||||||
|  |       opmath_t<T> out_val, | ||||||
|  |       uint32_t per_sample_weights_index, | ||||||
|  |       constant T* per_sample_weights, | ||||||
|  |       uint32_t per_sample_weights_strides) { | ||||||
|  |     if (per_sample_weights_strides) { | ||||||
|  |       T per_sample_weight = per_sample_weights | ||||||
|  |           [per_sample_weights_strides * per_sample_weights_index]; | ||||||
|  |       return static_cast<opmath_t<T>>(per_sample_weight) * | ||||||
|  |           static_cast<opmath_t<T>>(weight_val) + | ||||||
|  |           out_val; | ||||||
|  |     } else { | ||||||
|  |       return static_cast<opmath_t<T>>(weight_val) + out_val; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | template <typename T> | ||||||
|  | struct ReductionOp<EmbeddingBagMode::MEAN, T> { | ||||||
|  |   inline opmath_t<T> operator()( | ||||||
|  |       T weight_val, | ||||||
|  |       opmath_t<T> out_val, | ||||||
|  |       uint32_t, | ||||||
|  |       constant T*, | ||||||
|  |       uint32_t) { | ||||||
|  |     return static_cast<opmath_t<T>>(weight_val) + out_val; | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| template <typename T> | template <typename T> | ||||||
| struct ReductionOp<EmbeddingBagMode::MAX, T> { | struct ReductionOp<EmbeddingBagMode::MAX, T> { | ||||||
|   inline opmath_t<T> operator()( |   inline opmath_t<T> operator()( | ||||||
|       opmath_t<T> weight_val, |       T weight_val, | ||||||
|       opmath_t<T> out_val, |       opmath_t<T> out_val, | ||||||
|       bool is_first) { |       uint32_t, | ||||||
|     return (is_first || weight_val > out_val) ? weight_val : out_val; |       constant T*, | ||||||
|   } |       uint32_t) { | ||||||
| }; |     return max(static_cast<opmath_t<T>>(weight_val), out_val); | ||||||
|  |  | ||||||
| template <EmbeddingBagMode M, typename T> |  | ||||||
| struct MaybeApplyPerSampleWeight { |  | ||||||
|   inline opmath_t<T> operator()( |  | ||||||
|       opmath_t<T> weight_val, |  | ||||||
|       uint32_t per_sample_weights_index, |  | ||||||
|       constant T* per_sample_weights, |  | ||||||
|       uint32_t per_sample_weights_stride) { |  | ||||||
|     return weight_val; |  | ||||||
|   } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| template <typename T> |  | ||||||
| struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> { |  | ||||||
|   inline opmath_t<T> operator()( |  | ||||||
|       opmath_t<T> weight_val, |  | ||||||
|       uint32_t per_sample_weights_index, |  | ||||||
|       constant T* per_sample_weights, |  | ||||||
|       uint32_t per_sample_weights_stride) { |  | ||||||
|     if (per_sample_weights_stride) { |  | ||||||
|       T per_sample_weight = per_sample_weights |  | ||||||
|           [per_sample_weights_stride * per_sample_weights_index]; |  | ||||||
|       return static_cast<opmath_t<T>>(per_sample_weight) * weight_val; |  | ||||||
|     } else { |  | ||||||
|       return weight_val; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| template <EmbeddingBagMode M, typename T, typename I> |  | ||||||
| struct MaybeCalcMaxIndex { |  | ||||||
|   inline void operator()( |  | ||||||
|       opmath_t<T> weight_val, |  | ||||||
|       opmath_t<T> out_val, |  | ||||||
|       bool is_first, |  | ||||||
|       thread I& max_idx, |  | ||||||
|       I weight_idx, |  | ||||||
|       bool pad) {} |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| template <typename T, typename I> |  | ||||||
| struct MaybeCalcMaxIndex<EmbeddingBagMode::MAX, T, I> { |  | ||||||
|   inline void operator()( |  | ||||||
|       opmath_t<T> weight_val, |  | ||||||
|       opmath_t<T> out_val, |  | ||||||
|       bool is_first, |  | ||||||
|       thread I& max_idx, |  | ||||||
|       I weight_idx, |  | ||||||
|       bool pad) { |  | ||||||
|     max_idx = !pad && (is_first || weight_val > out_val) ? weight_idx : max_idx; |  | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| @ -114,30 +96,6 @@ struct ReductionOpFinal<EmbeddingBagMode::MAX, T> { | |||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| template <EmbeddingBagMode M, typename I> |  | ||||||
| struct MaybeWriteMaxIndex { |  | ||||||
|   inline void operator()( |  | ||||||
|       device I*, |  | ||||||
|       const constant ::c10::metal::array<uint32_t, 2>&, |  | ||||||
|       uint32_t, |  | ||||||
|       uint32_t, |  | ||||||
|       I) {} |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| template <typename I> |  | ||||||
| struct MaybeWriteMaxIndex<EmbeddingBagMode::MAX, I> { |  | ||||||
|   inline void operator()( |  | ||||||
|       device I* max_indices, |  | ||||||
|       const constant ::c10::metal::array<uint32_t, 2>& max_indices_strides, |  | ||||||
|       uint32_t bag_idx, |  | ||||||
|       uint32_t feature_idx, |  | ||||||
|       I max_idx) { |  | ||||||
|     max_indices |  | ||||||
|         [bag_idx * max_indices_strides[0] + |  | ||||||
|          feature_idx * max_indices_strides[1]] = max_idx; |  | ||||||
|   } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| template <EmbeddingBagMode M, typename T, typename I> | template <EmbeddingBagMode M, typename T, typename I> | ||||||
| void embedding_bag_impl( | void embedding_bag_impl( | ||||||
|     constant T* weight, |     constant T* weight, | ||||||
| @ -154,7 +112,7 @@ void embedding_bag_impl( | |||||||
|   auto num_bags = params.num_bags; |   auto num_bags = params.num_bags; | ||||||
|   auto feature_size = params.feature_size; |   auto feature_size = params.feature_size; | ||||||
|   auto padding_idx = params.padding_idx; |   auto padding_idx = params.padding_idx; | ||||||
|   auto per_sample_weights_stride = params.per_sample_weights_stride; |   auto per_sample_weights_strides = params.per_sample_weights_strides; | ||||||
|   constant auto& output_strides = params.output_strides; |   constant auto& output_strides = params.output_strides; | ||||||
|   constant auto& weight_strides = params.weight_strides; |   constant auto& weight_strides = params.weight_strides; | ||||||
|   constant auto& max_indices_strides = params.max_indices_strides; |   constant auto& max_indices_strides = params.max_indices_strides; | ||||||
| @ -162,6 +120,8 @@ void embedding_bag_impl( | |||||||
|   auto bag_idx = tid / feature_size; |   auto bag_idx = tid / feature_size; | ||||||
|   auto feature_idx = tid % feature_size; |   auto feature_idx = tid % feature_size; | ||||||
|  |  | ||||||
|  |   output += bag_idx * output_strides[0] + feature_idx * output_strides[1]; | ||||||
|  |  | ||||||
|   uint32_t offsets_end = min(bag_idx + 1, num_bags - 1); |   uint32_t offsets_end = min(bag_idx + 1, num_bags - 1); | ||||||
|   bool is_last_bag = bag_idx + 1 == num_bags; |   bool is_last_bag = bag_idx + 1 == num_bags; | ||||||
|   uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]); |   uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]); | ||||||
| @ -171,37 +131,28 @@ void embedding_bag_impl( | |||||||
|   auto out_val = ReductionOpInit<M, T>()(); |   auto out_val = ReductionOpInit<M, T>()(); | ||||||
|  |  | ||||||
|   uint32_t bag_size_ = 0; |   uint32_t bag_size_ = 0; | ||||||
|   I max_idx = 0; |  | ||||||
|  |  | ||||||
|   for (uint32_t indices_idx = indices_start; indices_idx < indices_end; |   for (uint32_t indices_idx = indices_start; indices_idx < indices_end; | ||||||
|        indices_idx++) { |        indices_idx++) { | ||||||
|     I weight_idx = indices[indices_idx]; |     I weight_idx = indices[indices_idx]; | ||||||
|     bool pad = (weight_idx == padding_idx); |     bool pad = (weight_idx == padding_idx); | ||||||
|     auto weight_val = static_cast<opmath_t<T>>( |     T weight_val = weight | ||||||
|         weight |         [static_cast<uint32_t>(weight_idx) * weight_strides[0] + | ||||||
|             [static_cast<uint32_t>(weight_idx) * weight_strides[0] + |          feature_idx * weight_strides[1]]; | ||||||
|              feature_idx * weight_strides[1]]); |  | ||||||
|  |  | ||||||
|     weight_val = MaybeApplyPerSampleWeight<M, T>()( |  | ||||||
|         weight_val, indices_idx, per_sample_weights, per_sample_weights_stride); |  | ||||||
|  |  | ||||||
|     auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0); |  | ||||||
|  |  | ||||||
|     MaybeCalcMaxIndex<M, T, I>()( |  | ||||||
|         weight_val, out_val, bag_size_ == 0, max_idx, weight_idx, pad); |  | ||||||
|  |  | ||||||
|     out_val = pad ? out_val : new_out_val; |  | ||||||
|     offset2bag[indices_idx] = bag_idx; |  | ||||||
|     bag_size_ += static_cast<uint32_t>(!pad); |     bag_size_ += static_cast<uint32_t>(!pad); | ||||||
|  |  | ||||||
|  |     auto tmp_val = ReductionOp<M, T>()( | ||||||
|  |         weight_val, | ||||||
|  |         out_val, | ||||||
|  |         indices_idx, | ||||||
|  |         per_sample_weights, | ||||||
|  |         per_sample_weights_strides); | ||||||
|  |  | ||||||
|  |     out_val = pad ? out_val : tmp_val; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   output[bag_idx * output_strides[0] + feature_idx * output_strides[1]] = |   *output = ReductionOpFinal<M, T>()(out_val, bag_size_); | ||||||
|       ReductionOpFinal<M, T>()(out_val, bag_size_); |  | ||||||
|  |  | ||||||
|   bag_size[bag_idx] = bag_size_; |  | ||||||
|  |  | ||||||
|   MaybeWriteMaxIndex<M, I>()( |  | ||||||
|       max_indices, max_indices_strides, bag_idx, feature_idx, max_idx); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| #define DISPATCH_IMPL(MODE)        \ | #define DISPATCH_IMPL(MODE)        \ | ||||||
|  | |||||||
| @ -223,6 +223,9 @@ void grid_sampler_single_element( | |||||||
|     auto input_size = input_sizes[input_dim]; |     auto input_size = input_sizes[input_dim]; | ||||||
|     auto coord = static_cast<opmath_t<T>>(coords[coord_dim]); |     auto coord = static_cast<opmath_t<T>>(coords[coord_dim]); | ||||||
|  |  | ||||||
|  |     // Interpret nan as -1 | ||||||
|  |     coord = isnan(coord) ? -1 : coord; | ||||||
|  |  | ||||||
|     if (!align_corners) { |     if (!align_corners) { | ||||||
|       // Map unaligned grid space to aligned grid space |       // Map unaligned grid space to aligned grid space | ||||||
|       auto corner_alignment_factor = static_cast<opmath_t<T>>(input_size) / |       auto corner_alignment_factor = static_cast<opmath_t<T>>(input_size) / | ||||||
|  | |||||||
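The grid-sampler hunk above maps normalized grid coordinates in [-1, 1] to input indices, treating NaN as -1 and adjusting for align_corners. A standalone sketch of that unnormalization in the usual closed form (the kernel above factors the same mapping differently):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    double unnormalize(double coord, int64_t size, bool align_corners) {
      if (std::isnan(coord)) {
        coord = -1.0;  // interpret NaN as the leftmost coordinate, as in the hunk above
      }
      if (align_corners) {
        return (coord + 1.0) / 2.0 * (size - 1);   // -1 -> 0, +1 -> size-1 (pixel centers)
      }
      return ((coord + 1.0) * size - 1.0) / 2.0;   // -1 -> -0.5, +1 -> size-0.5 (pixel edges)
    }

    int main() {
      std::printf("%g %g\n", unnormalize(-1.0, 8, true),  unnormalize(1.0, 8, true));   // 0 7
      std::printf("%g %g\n", unnormalize(-1.0, 8, false), unnormalize(1.0, 8, false));  // -0.5 7.5
    }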
| @ -52,7 +52,9 @@ static void fill_depthwise_conv_desc(MPSGraphDepthwiseConvolution3DOpDescriptor* | |||||||
|                                      NSUInteger dilationRateInX, |                                      NSUInteger dilationRateInX, | ||||||
|                                      NSUInteger dilationRateInY, |                                      NSUInteger dilationRateInY, | ||||||
|                                      NSUInteger paddingHorizontal, |                                      NSUInteger paddingHorizontal, | ||||||
|                                      NSUInteger paddingVertical) { |                                      NSUInteger paddingVertical, | ||||||
|  |                                      c10::MemoryFormat memory_format, | ||||||
|  |                                      NSUInteger groups) { | ||||||
|   descriptor_.strides = |   descriptor_.strides = | ||||||
|       @[ @1, [[NSNumber alloc] initWithInteger:strideInY], [[NSNumber alloc] initWithInteger:strideInX] ]; |       @[ @1, [[NSNumber alloc] initWithInteger:strideInY], [[NSNumber alloc] initWithInteger:strideInX] ]; | ||||||
|   descriptor_.dilationRates = |   descriptor_.dilationRates = | ||||||
| @ -101,7 +103,7 @@ static void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_, | |||||||
|   descriptor_.groups = groups; |   descriptor_.groups = groups; | ||||||
| } | } | ||||||
|  |  | ||||||
| static Tensor _mps_convolution_impl(const Tensor& input_t, | static Tensor _mps_convolution_impl(const Tensor& input_t_, | ||||||
|                                     const Tensor& weight_t, |                                     const Tensor& weight_t, | ||||||
|                                     const std::optional<Tensor>& bias_opt, |                                     const std::optional<Tensor>& bias_opt, | ||||||
|                                     IntArrayRef padding, |                                     IntArrayRef padding, | ||||||
| @ -109,15 +111,12 @@ static Tensor _mps_convolution_impl(const Tensor& input_t, | |||||||
|                                     IntArrayRef dilation, |                                     IntArrayRef dilation, | ||||||
|                                     int64_t groups, |                                     int64_t groups, | ||||||
|                                     std::optional<IntArrayRef> input_shape) { |                                     std::optional<IntArrayRef> input_shape) { | ||||||
|   constexpr auto kChannelsLast = MemoryFormat::ChannelsLast; |   const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS); | ||||||
|   constexpr auto kContiguous = MemoryFormat::Contiguous; |   Tensor input_t = input_t_; | ||||||
|   const bool is_macos_15_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS); |   bool is3DConv = input_t.dim() == 5; | ||||||
|  |   if (!is_macOS_15_0_or_newer || is3DConv) { | ||||||
|   const bool is3DConv = input_t.dim() == 5; |     input_t = input_t.contiguous(); | ||||||
|   const auto memory_format = input_t.suggest_memory_format(); |   } | ||||||
|   const auto input_suggested_layout = memory_format == kChannelsLast && is_macos_15_plus ? kChannelsLast : kContiguous; |  | ||||||
|   const bool is_channels_last = mps_conv_use_channels_last(input_t, weight_t) && !is3DConv; |  | ||||||
|   const bool bias_defined = bias_opt ? bias_opt->defined() : false; |  | ||||||
|  |  | ||||||
|   TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types"); |   TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types"); | ||||||
|  |  | ||||||
| @ -127,6 +126,15 @@ static Tensor _mps_convolution_impl(const Tensor& input_t, | |||||||
|   checkAllSameType(c, {input, weight}); |   checkAllSameType(c, {input, weight}); | ||||||
|   checkAllSameGPU(c, {input, weight}); |   checkAllSameGPU(c, {input, weight}); | ||||||
|  |  | ||||||
|  |   bool bias_defined; | ||||||
|  |  | ||||||
|  |   if (bias_opt == std::nullopt) | ||||||
|  |     bias_defined = false; | ||||||
|  |   else | ||||||
|  |     bias_defined = bias_opt->defined(); | ||||||
|  |  | ||||||
|  |   auto memory_format = input_t.suggest_memory_format(); | ||||||
|  |   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast) && !is3DConv; | ||||||
|   auto output_t = |   auto output_t = | ||||||
|       at::empty(input_shape.has_value() ? input_shape.value() |       at::empty(input_shape.has_value() ? input_shape.value() | ||||||
|                                         : conv_output_size(input->sizes(), weight->sizes(), padding, stride, dilation), |                                         : conv_output_size(input->sizes(), weight->sizes(), padding, stride, dilation), | ||||||
| @ -134,18 +142,12 @@ static Tensor _mps_convolution_impl(const Tensor& input_t, | |||||||
|                 std::nullopt, |                 std::nullopt, | ||||||
|                 kMPS, |                 kMPS, | ||||||
|                 std::nullopt, |                 std::nullopt, | ||||||
|                 is_channels_last ? kChannelsLast : kContiguous); |                 is_macOS_15_0_or_newer ? memory_format : MemoryFormat::Contiguous); | ||||||
|   if (output_t.numel() == 0) { |   if (output_t.numel() == 0) { | ||||||
|     return output_t; |     return output_t; | ||||||
|   } |   } | ||||||
|   TensorArg output{output_t, "result", 0}; |   TensorArg output{output_t, "result", 0}; | ||||||
|  |  | ||||||
|   // TODO: Remove me when MacOS-14 is no longer supported |  | ||||||
|   std::optional<Tensor> output_c; |  | ||||||
|   if (!is_macos_15_plus && is_channels_last) { |  | ||||||
|     output_c = at::empty_like(output_t, output_t.options().memory_format(kContiguous)); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_1_PLUS)) { |   if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_1_PLUS)) { | ||||||
|     // On macOS < 15.1, MPS convolution kernel does not support output channels > 2^16 |     // On macOS < 15.1, MPS convolution kernel does not support output channels > 2^16 | ||||||
|     for (auto elem : output_t.sizes()) { |     for (auto elem : output_t.sizes()) { | ||||||
| @ -184,22 +186,32 @@ static Tensor _mps_convolution_impl(const Tensor& input_t, | |||||||
|                                   getArrayRefString(dilation), |                                   getArrayRefString(dilation), | ||||||
|                                   getArrayRefString(padding), |                                   getArrayRefString(padding), | ||||||
|                                   groups, |                                   groups, | ||||||
|                                   input_suggested_layout == kChannelsLast, |                                   is_channels_last, | ||||||
|                                   mps::getTensorsStringKey({input_t, weight_t}), |                                   mps::getTensorsStringKey({input_t, weight_t}), | ||||||
|                                   bias_defined, |                                   bias_defined, | ||||||
|                                   bias_shape_key); |                                   bias_shape_key); | ||||||
|  |  | ||||||
|     auto inputShape = mps::getMPSShape(input_t, input_suggested_layout); |     MPSShape* inputShape = mps::getMPSShape(input_t, memory_format); | ||||||
|     auto outputShape = mps::getMPSShape(output_t, input_suggested_layout); |     MPSShape* outputShape = mps::getMPSShape(output_t, memory_format); | ||||||
|     auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { |     MPSNDArray* inputNDArray = nil; | ||||||
|       bool isDepthwiseConv = |     MPSNDArray* outputNDArray = nil; | ||||||
|           (groups > 1 && weight_t.size(1) == 1) && input_t.dim() >= 4 && weight_t.dim() >= 4 && !is_channels_last; |  | ||||||
|  |  | ||||||
|       auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(input_t), inputShape); |     if (input_t.is_contiguous(memory_format) && output_t.is_contiguous(memory_format) && is_macOS_15_0_or_newer) { | ||||||
|       auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t); |       inputNDArray = getMPSNDArray(input_t, inputShape); | ||||||
|       MPSGraphTensor* outputTensor = nil; |       outputNDArray = getMPSNDArray(*output, outputShape); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { | ||||||
|  |       MPSShape* weightShape = mps::getMPSShape(weight_t); | ||||||
|  |       bool isDepthwiseConv = ((groups > 1 && (weightShape[1].intValue == 1)) && inputShape.count >= 4 && | ||||||
|  |                               weightShape.count >= 4 && !is_channels_last); | ||||||
|  |  | ||||||
|  |       MPSGraphTensor* inputTensor = | ||||||
|  |           mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(input_t.scalar_type()), inputShape); | ||||||
|  |       MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t); | ||||||
|  |       MPSGraphTensor* outputTensor; | ||||||
|       if (is3DConv) { |       if (is3DConv) { | ||||||
|         auto conv3dDescriptor_ = [[MPSGraphConvolution3DOpDescriptor new] autorelease]; |         MPSGraphConvolution3DOpDescriptor* conv3dDescriptor_ = [[MPSGraphConvolution3DOpDescriptor new] autorelease]; | ||||||
|         fill_conv3d_desc(conv3dDescriptor_, |         fill_conv3d_desc(conv3dDescriptor_, | ||||||
|                          stride[2], |                          stride[2], | ||||||
|                          stride[1], |                          stride[1], | ||||||
| @ -217,9 +229,17 @@ static Tensor _mps_convolution_impl(const Tensor& input_t, | |||||||
|                                                     descriptor:conv3dDescriptor_ |                                                     descriptor:conv3dDescriptor_ | ||||||
|                                                           name:nil]; |                                                           name:nil]; | ||||||
|       } else if (isDepthwiseConv) { |       } else if (isDepthwiseConv) { | ||||||
|         auto depthWiseConv3dDescriptor_ = [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease]; |         MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ = | ||||||
|         fill_depthwise_conv_desc( |             [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease]; | ||||||
|             depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]); |         fill_depthwise_conv_desc(depthWiseConv3dDescriptor_, | ||||||
|  |                                  stride[1], | ||||||
|  |                                  stride[0], | ||||||
|  |                                  dilation[1], | ||||||
|  |                                  dilation[0], | ||||||
|  |                                  padding[1], | ||||||
|  |                                  padding[0], | ||||||
|  |                                  memory_format, | ||||||
|  |                                  groups); | ||||||
|  |  | ||||||
|         MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor |         MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor | ||||||
|                                                                 dimension:-3 |                                                                 dimension:-3 | ||||||
| @ -238,7 +258,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t, | |||||||
|                        dilation[0], |                        dilation[0], | ||||||
|                        padding[1], |                        padding[1], | ||||||
|                        padding[0], |                        padding[0], | ||||||
|                        input_suggested_layout, |                        memory_format, | ||||||
|                        groups); |                        groups); | ||||||
|  |  | ||||||
|         outputTensor = [mpsGraph convolution2DWithSourceTensor:inputTensor |         outputTensor = [mpsGraph convolution2DWithSourceTensor:inputTensor | ||||||
| @ -250,6 +270,13 @@ static Tensor _mps_convolution_impl(const Tensor& input_t, | |||||||
|       MPSGraphTensor* biasTensor = nil; |       MPSGraphTensor* biasTensor = nil; | ||||||
|       if (bias_defined) { |       if (bias_defined) { | ||||||
|         biasTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(bias_opt.value())); |         biasTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(bias_opt.value())); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       if (is_channels_last && !is_macOS_15_0_or_newer) { | ||||||
|  |         outputTensor = mps::convertNHWCtoNCHW(mpsGraph, outputTensor); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       if (bias_defined) { | ||||||
|         outputTensor = [mpsGraph additionWithPrimaryTensor:outputTensor secondaryTensor:biasTensor name:nil]; |         outputTensor = [mpsGraph additionWithPrimaryTensor:outputTensor secondaryTensor:biasTensor name:nil]; | ||||||
|       } |       } | ||||||
|       newCachedGraph->inputTensor_ = inputTensor; |       newCachedGraph->inputTensor_ = inputTensor; | ||||||
| @ -258,26 +285,27 @@ static Tensor _mps_convolution_impl(const Tensor& input_t, | |||||||
|       newCachedGraph->outputTensor_ = outputTensor; |       newCachedGraph->outputTensor_ = outputTensor; | ||||||
|     }); |     }); | ||||||
|  |  | ||||||
|     auto inputPlaceholder = input_suggested_layout == kContiguous |     auto inputPlaceholder = inputNDArray ? Placeholder(cachedGraph->inputTensor_, inputNDArray) | ||||||
|         ? Placeholder(cachedGraph->inputTensor_, output_c || is3DConv ? input_t.contiguous() : input_t) |                                          : Placeholder(cachedGraph->inputTensor_, input_t, inputShape); | ||||||
|         : Placeholder(cachedGraph->inputTensor_, getMPSNDArray(input_t, inputShape)); |     auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t); | ||||||
|     auto outputPlaceholder = input_suggested_layout == kContiguous |  | ||||||
|         ? Placeholder(cachedGraph->outputTensor_, output_c ? *output_c : output_t) |  | ||||||
|         : Placeholder(cachedGraph->outputTensor_, getMPSNDArray(output_t, outputShape)); |  | ||||||
|     auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, output_c ? weight_t.contiguous() : weight_t); |  | ||||||
|     auto biasPlaceholder = Placeholder(); |     auto biasPlaceholder = Placeholder(); | ||||||
|     // Reshape the bias to be broadcastable with output of conv2d or conv3d |     // Reshape the bias to be broadcastable with output of conv2d or conv3d | ||||||
|     if (bias_defined) { |     if (bias_defined) { | ||||||
|       if (is3DConv) { |       if (is3DConv) { | ||||||
|         biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, bias_shape[0], 1, 1, 1})); |         biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, bias_shape[0], 1, 1, 1})); | ||||||
|       } else if (input_suggested_layout == kChannelsLast) { |  | ||||||
|         biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, 1, 1, bias_shape[0]})); |  | ||||||
|       } else { |       } else { | ||||||
|         biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, bias_shape[0], 1, 1})); |         if (is_channels_last && is_macOS_15_0_or_newer) { | ||||||
|  |           biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, 1, 1, bias_shape[0]})); | ||||||
|  |         } else { | ||||||
|  |           biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, bias_shape[0], 1, 1})); | ||||||
|  |         } | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |     auto outputPlaceholder = outputNDArray ? Placeholder(cachedGraph->outputTensor_, outputNDArray) | ||||||
|  |                                            : Placeholder(cachedGraph->outputTensor_, *output); | ||||||
|  |  | ||||||
|     auto feeds = [[[NSMutableDictionary alloc] initWithCapacity:3] autorelease]; |     NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = | ||||||
|  |         [[[NSMutableDictionary alloc] initWithCapacity:3] autorelease]; | ||||||
|     feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); |     feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); | ||||||
|     feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData(); |     feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData(); | ||||||
|     if (bias_defined) { |     if (bias_defined) { | ||||||
| @ -287,11 +315,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t, | |||||||
|     runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); |     runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (output_c) { |   return *output; | ||||||
|     output_t.copy_(*output_c); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   return output_t; |  | ||||||
| } | } | ||||||
|  |  | ||||||
| Tensor _mps_convolution(const Tensor& input_t, | Tensor _mps_convolution(const Tensor& input_t, | ||||||
| @ -327,21 +351,14 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size, | |||||||
|   TensorArg grad_output{grad_output_t, "grad_output", 1}, weight{weight_t, "weight", 2}; |   TensorArg grad_output{grad_output_t, "grad_output", 1}, weight{weight_t, "weight", 2}; | ||||||
|   checkAllSameType(c, {grad_output, weight}); |   checkAllSameType(c, {grad_output, weight}); | ||||||
|   checkAllSameGPU(c, {grad_output, weight}); |   checkAllSameGPU(c, {grad_output, weight}); | ||||||
|   constexpr auto kChannelsLast = at::MemoryFormat::ChannelsLast; |   auto memory_format = grad_output_t.suggest_memory_format(); | ||||||
|   bool is_channels_last = mps_conv_use_channels_last(grad_output_t, weight_t) && !is3DConv; |   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast) && !is3DConv; | ||||||
|   auto grad_input_t = |   auto grad_input_t = at::empty(input_size, grad_output_t.options(), std::nullopt); | ||||||
|       at::empty(input_size, grad_output_t.options(), is_channels_last ? std::optional(kChannelsLast) : std::nullopt); |  | ||||||
|  |  | ||||||
|   // Avoid "grad_input" when this is being used as transposed convolution |   // Avoid "grad_input" when this is being used as transposed convolution | ||||||
|   TensorArg grad_input{grad_input_t, "result", 0}; |   TensorArg grad_input{grad_input_t, "result", 0}; | ||||||
|   convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); |   convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); | ||||||
|  |  | ||||||
|   // TODO: Remove me when MacOS-14 is no longer supported |  | ||||||
|   std::optional<Tensor> grad_input_c; |  | ||||||
|   if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_channels_last) { |  | ||||||
|     grad_input_c = at::empty_like(grad_input_t, grad_input_t.options().memory_format(MemoryFormat::Contiguous)); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Derive from MPSCachedGraph |   // Derive from MPSCachedGraph | ||||||
|   struct CachedGraph : public MPSCachedGraph { |   struct CachedGraph : public MPSCachedGraph { | ||||||
|     CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} |     CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} | ||||||
| @ -353,6 +370,7 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size, | |||||||
|   // Add backward with input |   // Add backward with input | ||||||
|   @autoreleasepool { |   @autoreleasepool { | ||||||
|     MPSStream* stream = getCurrentMPSStream(); |     MPSStream* stream = getCurrentMPSStream(); | ||||||
|  |  | ||||||
|     MPSShape* mps_input_shape = getMPSShape(input_size); |     MPSShape* mps_input_shape = getMPSShape(input_size); | ||||||
|     std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}", |     std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}", | ||||||
|                                   is3DConv ? "3d_" : "", |                                   is3DConv ? "3d_" : "", | ||||||
| @ -393,8 +411,15 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size, | |||||||
|       } else if (isDepthwiseConv) { |       } else if (isDepthwiseConv) { | ||||||
|         MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ = |         MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ = | ||||||
|             [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease]; |             [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease]; | ||||||
|         fill_depthwise_conv_desc( |         fill_depthwise_conv_desc(depthWiseConv3dDescriptor_, | ||||||
|             depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]); |                                  stride[1], | ||||||
|  |                                  stride[0], | ||||||
|  |                                  dilation[1], | ||||||
|  |                                  dilation[0], | ||||||
|  |                                  padding[1], | ||||||
|  |                                  padding[0], | ||||||
|  |                                  at::MemoryFormat::Contiguous, | ||||||
|  |                                  groups); | ||||||
|         MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor |         MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor | ||||||
|                                                                 dimension:-3 |                                                                 dimension:-3 | ||||||
|                                                             withDimension:-4 |                                                             withDimension:-4 | ||||||
| @ -429,18 +454,14 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size, | |||||||
|       newCachedGraph->gradInputTensor_ = gradInputTensor; |       newCachedGraph->gradInputTensor_ = gradInputTensor; | ||||||
|     }); |     }); | ||||||
|  |  | ||||||
|     auto gradOutputPlaceholder = |     auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t); | ||||||
|         Placeholder(cachedGraph->gradOutputTensor_, grad_input_c ? grad_output_t.contiguous() : grad_output_t); |     auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t); | ||||||
|     auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, grad_input_c ? weight_t.contiguous() : weight_t); |     auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input); | ||||||
|     auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input_c ? *grad_input_c : grad_input_t); |  | ||||||
|  |  | ||||||
|     auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, weightsPlaceholder); |     auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, weightsPlaceholder); | ||||||
|     runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); |     runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); | ||||||
|   } |   } | ||||||
|   if (grad_input_c) { |   return *grad_input; | ||||||
|     grad_input_t.copy_(*grad_input_c); |  | ||||||
|   } |  | ||||||
|   return grad_input_t; |  | ||||||
| } | } | ||||||
|  |  | ||||||
| static Tensor mps_convolution_backward_weights(IntArrayRef weight_size, | static Tensor mps_convolution_backward_weights(IntArrayRef weight_size, | ||||||
| @ -453,11 +474,9 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size, | |||||||
|                                                bool bias_defined) { |                                                bool bias_defined) { | ||||||
|   using namespace at::native::mps; |   using namespace at::native::mps; | ||||||
|   using namespace mps; |   using namespace mps; | ||||||
|   const bool is3DConv = input_t.dim() == 5; |   bool is3DConv = input_t.dim() == 5; | ||||||
|   TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types"); |   TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types"); | ||||||
|   CheckedFrom c = "mps_convolution_backward_weights"; |   CheckedFrom c = "mps_convolution_backward_weights"; | ||||||
|   constexpr auto kChannelsLast = at::MemoryFormat::ChannelsLast; |  | ||||||
|   bool is_channels_last = mps_conv_use_channels_last(input_t, grad_output_t) && !is3DConv; |  | ||||||
|  |  | ||||||
|   // For uniformity with everything else, although it seems grad_weight |   // For uniformity with everything else, although it seems grad_weight | ||||||
|   // would be unambiguous too. |   // would be unambiguous too. | ||||||
| @ -468,8 +487,7 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size, | |||||||
|   checkAllSameGPU(c, {grad_output, input}); |   checkAllSameGPU(c, {grad_output, input}); | ||||||
|  |  | ||||||
|   auto grad_weight_t = |   auto grad_weight_t = | ||||||
|       at::empty(weight_size, grad_output_t.options(), is_channels_last ? std::optional(kChannelsLast) : std::nullopt); |       at::empty(weight_size, grad_output_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt); | ||||||
|  |  | ||||||
|   TensorArg grad_weight{grad_weight_t, "result", 0}; |   TensorArg grad_weight{grad_weight_t, "result", 0}; | ||||||
|  |  | ||||||
|   convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups); |   convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups); | ||||||
| @ -482,23 +500,16 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size, | |||||||
|     MPSGraphTensor* gradWeightTensor_ = nil; |     MPSGraphTensor* gradWeightTensor_ = nil; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   // TODO: Remove me when MacOS-14 is no longer supported |  | ||||||
|   std::optional<Tensor> grad_weight_c; |  | ||||||
|   if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_channels_last) { |  | ||||||
|     grad_weight_c = at::empty_like(grad_weight_t, grad_weight_t.options().memory_format(MemoryFormat::Contiguous)); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   @autoreleasepool { |   @autoreleasepool { | ||||||
|     MPSStream* stream = getCurrentMPSStream(); |     MPSStream* stream = getCurrentMPSStream(); | ||||||
|  |  | ||||||
|     MPSShape* mps_weight_shape = getMPSShape(weight_size); |     MPSShape* mps_weight_shape = getMPSShape(weight_size); | ||||||
|     std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}:{}", |     std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}", | ||||||
|                                   is3DConv ? "3d_" : "", |                                   is3DConv ? "3d_" : "", | ||||||
|                                   getArrayRefString(stride), |                                   getArrayRefString(stride), | ||||||
|                                   getArrayRefString(dilation), |                                   getArrayRefString(dilation), | ||||||
|                                   getArrayRefString(padding), |                                   getArrayRefString(padding), | ||||||
|                                   groups, |                                   groups, | ||||||
|                                   is_channels_last, |  | ||||||
|                                   getTensorsStringKey({grad_output_t, input_t, grad_weight_t})); |                                   getTensorsStringKey({grad_output_t, input_t, grad_weight_t})); | ||||||
|     auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { |     auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { | ||||||
|       MPSShape* inputShape = getMPSShape(input_t); |       MPSShape* inputShape = getMPSShape(input_t); | ||||||
| @ -530,8 +541,15 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size, | |||||||
|       } else if (isDepthwiseConv) { |       } else if (isDepthwiseConv) { | ||||||
|         MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ = |         MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ = | ||||||
|             [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease]; |             [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease]; | ||||||
|         fill_depthwise_conv_desc( |         fill_depthwise_conv_desc(depthWiseConv3dDescriptor_, | ||||||
|             depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]); |                                  stride[1], | ||||||
|  |                                  stride[0], | ||||||
|  |                                  dilation[1], | ||||||
|  |                                  dilation[0], | ||||||
|  |                                  padding[1], | ||||||
|  |                                  padding[0], | ||||||
|  |                                  at::MemoryFormat::Contiguous, | ||||||
|  |                                  groups); | ||||||
|         NSNumber* outputFeatChannelDim = mps_weight_shape[0]; |         NSNumber* outputFeatChannelDim = mps_weight_shape[0]; | ||||||
|         MPSShape* weightShapeTranspose = @[ @1, outputFeatChannelDim, mps_weight_shape[2], mps_weight_shape[3] ]; |         MPSShape* weightShapeTranspose = @[ @1, outputFeatChannelDim, mps_weight_shape[2], mps_weight_shape[3] ]; | ||||||
|         MPSGraphTensor* gradWeightTensorTranspose = |         MPSGraphTensor* gradWeightTensorTranspose = | ||||||
| @ -565,19 +583,14 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size, | |||||||
|       newCachedGraph->gradWeightTensor_ = gradWeightTensor; |       newCachedGraph->gradWeightTensor_ = gradWeightTensor; | ||||||
|     }); |     }); | ||||||
|  |  | ||||||
|     auto gradOutputPlaceholder = |     auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t); | ||||||
|         Placeholder(cachedGraph->gradOutputTensor_, grad_weight_c ? grad_output_t.contiguous() : grad_output_t); |     auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); | ||||||
|     auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, grad_weight_c ? input_t.contiguous() : input_t); |     auto outputPlaceholder = Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t); | ||||||
|     auto outputPlaceholder = |  | ||||||
|         Placeholder(cachedGraph->gradWeightTensor_, grad_weight_c ? *grad_weight_c : grad_weight_t); |  | ||||||
|  |  | ||||||
|     auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder); |     auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder); | ||||||
|     runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); |     runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (grad_weight_c) { |  | ||||||
|     grad_weight_t.copy_(*grad_weight_c); |  | ||||||
|   } |  | ||||||
|   return grad_weight_t; |   return grad_weight_t; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | |||||||
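The Convolution.mm hunks above differ mainly in how the forward pass picks a memory format (contiguous vs. channels-last) per macOS version and whether tensors are fed to the graph as MPSNDArrays. As a rough illustration of what that layout choice looks like from the Python side, here is a hedged sketch (not part of this diff; it assumes a machine where the mps backend is available and otherwise falls back to CPU):

    import torch
    import torch.nn.functional as F

    # Use the mps backend when present; the point of the sketch is the
    # channels-last layout that the convolution code above special-cases.
    device = "mps" if torch.backends.mps.is_available() else "cpu"

    x = torch.randn(2, 8, 32, 32, device=device).to(memory_format=torch.channels_last)
    w = torch.randn(16, 8, 3, 3, device=device)
    b = torch.randn(16, device=device)

    y = F.conv2d(x, w, b, stride=1, padding=1)
    print(y.shape, y.is_contiguous(memory_format=torch.channels_last))

Passing groups=8 with a weight of shape (8, 1, 3, 3) would instead exercise the depthwise branch, which both sides of the diff key off weight.size(1) == 1 with groups > 1.
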
| @ -66,12 +66,11 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl( | |||||||
|   int64_t num_indices = indices.size(0); |   int64_t num_indices = indices.size(0); | ||||||
|   int64_t num_bags = offsets.size(0); |   int64_t num_bags = offsets.size(0); | ||||||
|   if (include_last_offset) { |   if (include_last_offset) { | ||||||
|     TORCH_CHECK(num_bags >= 1, "include_last_offset: number of offsets should be at least 1"); |  | ||||||
|     num_bags -= 1; |     num_bags -= 1; | ||||||
|   } |   } | ||||||
|   int64_t feature_size = weight.size(1); |   int64_t feature_size = weight.size(1); | ||||||
|  |  | ||||||
|   auto bag_size = at::empty({num_bags}, indices.options()); |   auto bag_size = at::empty(offsets.sizes(), indices.options()); | ||||||
|   auto offset2bag = at::empty({indices.size(0)}, indices.options()); |   auto offset2bag = at::empty({indices.size(0)}, indices.options()); | ||||||
|   auto output = at::empty({num_bags, feature_size}, weight.options()); |   auto output = at::empty({num_bags, feature_size}, weight.options()); | ||||||
|  |  | ||||||
| @ -95,7 +94,7 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl( | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined(); |   bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined(); | ||||||
|   params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0; |   params.per_sample_weights_strides = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0; | ||||||
|  |  | ||||||
|   params.num_indices = num_indices; |   params.num_indices = num_indices; | ||||||
|   params.num_bags = num_bags; |   params.num_bags = num_bags; | ||||||
|  | |||||||
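For the EmbeddingBag hunk: when include_last_offset is set, the offsets tensor carries one trailing entry that only closes the last bag, so the number of bags is offsets.size(0) - 1; one side of the diff additionally keeps a TORCH_CHECK that at least one offset exists before decrementing. A minimal sketch of that calling convention (backend-agnostic, shown on CPU, and assuming only the public torch.nn.functional.embedding_bag API):

    import torch
    import torch.nn.functional as F

    weight = torch.randn(10, 4)              # embedding table: 10 rows, feature_size 4
    indices = torch.tensor([1, 2, 4, 5, 7])
    # With include_last_offset=True the final offset (5) marks the end of the
    # last bag, so these offsets describe 3 bags: [0, 2), [2, 3) and [3, 5).
    offsets = torch.tensor([0, 2, 3, 5])

    out = F.embedding_bag(indices, weight, offsets, mode="sum", include_last_offset=True)
    print(out.shape)  # torch.Size([3, 4]), i.e. num_bags = offsets.numel() - 1
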
Some files were not shown because too many files have changed in this diff.