Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-29 19:24:55 +08:00
Compare commits: flash_deco ... gh/fffrog/ (173 commits)
| SHA1 | Author | Date | |
|---|---|---|---|
| 5ac16f2d93 | |||
| 6ba83e06a5 | |||
| 960290d629 | |||
| b1a4efc302 | |||
| 96182faf96 | |||
| dcb8af7501 | |||
| 280e712c13 | |||
| 254d2864d6 | |||
| 9dac6437da | |||
| 8a0e8cad5f | |||
| 3a115da3e6 | |||
| b48a3d0a38 | |||
| 8d474bdc14 | |||
| 008051b13c | |||
| e4ffd718ec | |||
| ed3085814a | |||
| e2817ac204 | |||
| 1d138e658d | |||
| f9095fb285 | |||
| a0136f149c | |||
| 62b0ebd8f9 | |||
| 19f16a65b4 | |||
| 0ebfa3d7d2 | |||
| 0ea10f9912 | |||
| 48a852b7ae | |||
| f1260c9b9a | |||
| 28c7d11428 | |||
| a60c6ed99f | |||
| c257570e6c | |||
| 2f85de0b42 | |||
| e21b037756 | |||
| f8c7505855 | |||
| 425ea90f95 | |||
| 5b764267f4 | |||
| 50c0550f5a | |||
| d7491fb1c1 | |||
| 9534c59311 | |||
| 5880996b4c | |||
| 1d26eb0fcc | |||
| a05f6ecfec | |||
| c106ee8515 | |||
| 8aba513506 | |||
| 8c194a367e | |||
| 33f3413bd3 | |||
| d4e4f70768 | |||
| bfd21cd3e6 | |||
| 7441a1b9b1 | |||
| 6a2bd1f4ee | |||
| 4783e3ff49 | |||
| c8e5b7dabb | |||
| 04b51499f7 | |||
| 54461a53bd | |||
| d1403250c9 | |||
| b42e81def5 | |||
| 2a45f30ae7 | |||
| 11b4c0eb9e | |||
| fb93491ddc | |||
| 39df24fe04 | |||
| bbde16fe98 | |||
| 1b78ca2ef5 | |||
| 082eaf4aae | |||
| f1f2e3e4da | |||
| 67cc0e0ac9 | |||
| bbf8aa43ef | |||
| 5daa79fd6e | |||
| b776e0c71e | |||
| 5c2f09d1f9 | |||
| b4be380480 | |||
| 5b8fef3f17 | |||
| ff2f319e6e | |||
| 94195a37ae | |||
| c58e096cd0 | |||
| 2a6e6a9e3b | |||
| 6e6c899347 | |||
| 366961df78 | |||
| 520fca82c8 | |||
| 908bcfd403 | |||
| 96275dbf88 | |||
| b14a14a662 | |||
| 92f7361e27 | |||
| 6a6d838832 | |||
| 183dca423f | |||
| b8efa336d2 | |||
| 1cffa42d4d | |||
| ebfc87e303 | |||
| 21a41edd4f | |||
| 7bad9c5a64 | |||
| 151e66e50d | |||
| b61bdc7cc4 | |||
| 3dd89a079f | |||
| 6539537a59 | |||
| 3cbfbbd691 | |||
| 112e204797 | |||
| f9821b1be7 | |||
| c4312b443f | |||
| 7194d77550 | |||
| 22d5f5ff94 | |||
| c8e75c48b9 | |||
| e8f5f1b1a2 | |||
| 10e69a6e17 | |||
| 5fcde74aed | |||
| 6fa3715c12 | |||
| 98c4e35f14 | |||
| 00059db034 | |||
| 22fcc8b76b | |||
| ab2ce3c50e | |||
| 7d710403b0 | |||
| 29cbcbac42 | |||
| 5f90e8c7ae | |||
| eb7f4e0004 | |||
| 42928876eb | |||
| c44ec9f4c2 | |||
| 353991dd92 | |||
| 2b6a74abf1 | |||
| ad869c58f5 | |||
| d5afb9e31a | |||
| e7d6ea65ca | |||
| a6974195da | |||
| a213848703 | |||
| cde5c9aebd | |||
| 783a9dcb6d | |||
| ad2f7315ca | |||
| cc660d38ac | |||
| 00f96dd84d | |||
| 77b9aac6c2 | |||
| 7163dce1e0 | |||
| 4ac4a7351e | |||
| 65ddd91421 | |||
| 8c98aee436 | |||
| d927e55498 | |||
| 754c7e2e88 | |||
| 0ec946a052 | |||
| 2b1236de61 | |||
| bc8680c298 | |||
| 1495b35d29 | |||
| 90a282504e | |||
| 0dce2afd44 | |||
| 71eec6a0bf | |||
| 0456b23b77 | |||
| c414f75c8b | |||
| 768361e67f | |||
| 9341ede617 | |||
| 4c2c401ccf | |||
| 5d0f639234 | |||
| 9d0d98acfe | |||
| 3b73841f43 | |||
| 141fc7276e | |||
| b66aa1ade1 | |||
| 207f104594 | |||
| 3e1b1a30f2 | |||
| 2390d34c9b | |||
| a635505a99 | |||
| 6f34cc040f | |||
| ec0cd81c38 | |||
| 33aabdd8ac | |||
| 0bca77951d | |||
| bf0747c6c6 | |||
| 11a231ef52 | |||
| dad54ca7c0 | |||
| 2c5a3d7e60 | |||
| f68de58c9d | |||
| a8e9ed2407 | |||
| 0390798dad | |||
| 124dd364e9 | |||
| 20eeb54814 | |||
| 6f1d962d5b | |||
| 42e9902a0f | |||
| d746b987d8 | |||
| 6fa972796e | |||
| ca512af3e7 | |||
| c261c71f3e | |||
| e2ce79e4cc | |||
| be6c127927 |
@@ -1004,7 +1004,7 @@ if __name__ == "__main__":
        install_condaforge_python(host, args.python_version)
        sys.exit(0)

    python_version = args.python_version if args.python_version is not None else "3.9"
    python_version = args.python_version if args.python_version is not None else "3.10"

    if args.use_torch_from_pypi:
        configure_system(host, compiler=args.compiler, python_version=python_version)
@@ -69,7 +69,8 @@ RUN bash ./install_cuda.sh 13.0
ENV DESIRED_CUDA=13.0

FROM ${ROCM_IMAGE} as rocm
ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
ARG PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh
ENV MKLROOT /opt/intel
@@ -36,6 +36,12 @@ case ${DOCKER_TAG_PREFIX} in
        ;;
    rocm*)
        BASE_TARGET=rocm
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
        # add gfx950 conditionally starting in ROCm 7.0
        if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
        fi
        EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
        ;;
    *)
        echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"
@@ -12,8 +12,8 @@ function do_install() {
    rocm_version_nodot=${rocm_version//./}

    # Version 2.7.2 + ROCm related updates
    MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
    # https://github.com/icl-utk-edu/magma/pull/65
    MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"

    rocm_dir="/opt/rocm"
@@ -40,12 +40,16 @@ case ${DOCKER_TAG_PREFIX} in
        ;;
    rocm*)
        # we want the patch version of 6.4 instead
        if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
        if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
            GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
        fi
        BASE_TARGET=rocm
        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
        # add gfx950 conditionally starting in ROCm 7.0
        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
        fi
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
        ;;
    *)
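The switch from the `ver`-based numeric comparison to a glob match drops the helper dependency and also matches patch releases such as 6.4.x. A minimal sketch of both styles, assuming a typical zero-padding `ver` helper (the repo's actual definition may differ):

```bash
# Sketch only: `ver` is assumed to zero-pad each dot-separated component so the
# results compare as plain integers; this definition is not copied from the repo.
ver() { printf "%03d%03d%03d" $(echo "$1" | tr '.' ' '); }

GPU_ARCH_VERSION="6.4"

# Old style: exact numeric equality against 6.4
if [[ $(ver "$GPU_ARCH_VERSION") -eq $(ver 6.4) ]]; then
    echo "matched by numeric compare"
fi

# New style: substring glob, no helper needed, also matches 6.4.1, 6.4.2, ...
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
    GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"   # pin to the 6.4.2 patch release
fi
echo "$GPU_ARCH_VERSION"   # -> 6.4.2
```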
@@ -82,7 +82,7 @@ case ${image} in
        ;;
    manylinux2_28-builder:rocm*)
        # we want the patch version of 6.4 instead
        if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
        if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
            GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
        fi
        TARGET=rocm_final
@@ -90,6 +90,10 @@ case ${image} in
        DEVTOOLSET_VERSION="11"
        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
        # add gfx950 conditionally starting in ROCm 7.0
        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
        fi
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
    manylinux2_28-builder:xpu)
@@ -112,8 +112,6 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py, test_determination.py

numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#Description: Just-In-Time Compiler for Numerical Functions
@@ -134,7 +132,7 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
#test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
#test_binary_ufuncs.py
numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
numpy==1.22.4; python_version == "3.10"
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
numpy==2.1.2; python_version >= "3.13"
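The `python_version` environment markers are what let a single requirements file serve every interpreter in CI; with the 3.9 pins removed, 3.10 is the oldest version that still resolves a pin. A quick way to see which pin applies (sketch; assumes a `python3.12` interpreter on PATH and pip new enough for `--dry-run`):

```bash
# Only the requirement whose marker matches the running interpreter is resolved;
# pip reports the other one as ignored because its marker does not match.
python3.12 -m pip install --dry-run \
    "numpy==1.22.4; python_version == '3.10'" \
    "numpy==1.26.2; python_version == '3.11' or python_version == '3.12'"
```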
@@ -326,8 +324,6 @@ pywavelets==1.7.0 ; python_version >= "3.12"
lxml==5.3.0
#Description: This is a requirement of unittest-xml-reporting

# Python-3.9 binaries

PyGithub==2.3.0

sympy==1.13.3
@@ -1,8 +1,15 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2

standard-imghdr==3.13.0; python_version >= "3.13"
#Description: This is needed by Sphinx, so it needs to be added here.
# The reasons are as follows:
# 1) This module was removed from the Python standard library in Python 3.13 (https://peps.python.org/pep-0594/#imghdr);
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.

-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering,
# but it doesn't seem to work and hangs around idly. The initial thought is that it is probably
# something related to the Docker setup. We can investigate this later.
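The `standard-imghdr` pin exists because the pinned Sphinx 5.3.0 still imports `imghdr`, which PEP 594 removed from the standard library in Python 3.13. A sketch of the failure and workaround, assuming a `python3.13` interpreter and that the backport reinstates the original import name:

```bash
# Without the backport this import fails on 3.13+ (ModuleNotFoundError);
# the PyPI package is assumed to restore the removed stdlib module name.
python3.13 -c "import imghdr" || python3.13 -m pip install standard-imghdr==3.13.0
python3.13 -c "import imghdr; print('imghdr available again')"
```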
@@ -1,11 +1,11 @@
SHELL=/usr/bin/env bash

DOCKER_CMD ?= docker
DESIRED_ROCM ?= 6.4
DESIRED_ROCM ?= 7.0
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
PACKAGE_NAME = magma-rocm
# inherit this from underlying docker image, do not pass this env var to docker
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201

DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
	magma-rocm/build_magma.sh

.PHONY: all
all: magma-rocm70
all: magma-rocm64
all: magma-rocm63

@@ -24,6 +25,11 @@ clean:
	$(RM) -r magma-*
	$(RM) -r output

.PHONY: magma-rocm70
magma-rocm70: DESIRED_ROCM := 7.0
magma-rocm70:
	$(DOCKER_RUN)

.PHONY: magma-rocm64
magma-rocm64: DESIRED_ROCM := 6.4
magma-rocm64:
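The new `magma-rocm70` target uses a target-specific variable to override `DESIRED_ROCM`, so each archive can still be built on its own. Rough usage, assuming this Makefile lives under `.ci/magma-rocm` and Docker is available (the path is an assumption, not taken from this diff):

```bash
cd .ci/magma-rocm          # assumed location of this Makefile
make magma-rocm70          # DESIRED_ROCM := 7.0 for just this target
make magma-rocm64          # the ROCm 6.4 build is unchanged
make                       # `all` now builds the 7.0, 6.4 and 6.3 archives in turn
```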
@@ -6,8 +6,8 @@ set -eou pipefail
# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Version 2.7.2 + ROCm related updates
MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
# https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec

# Folders for the build
PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE

# Fetch magma sources and verify checksum
pushd ${PACKAGE_DIR}
git clone https://bitbucket.org/icl/magma.git
git clone https://github.com/jeffdaily/magma
pushd magma
git checkout ${MAGMA_VERSION}
popd
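Because the clone URL and the pinned `MAGMA_VERSION` change together, a useful sanity check is that the new commit is actually reachable from the new remote before kicking off a build. A small sketch using the hash pinned above:

```bash
git clone https://github.com/jeffdaily/magma
pushd magma
# -e exits non-zero if the object does not exist in the clone
git cat-file -e d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec^{commit} && echo "pin is reachable"
git checkout d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
popd
```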
@@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \

# Build the docs
pushd docs/cpp
time make VERBOSE=1 html -j
time make VERBOSE=1 html

popd
popd
@@ -35,10 +35,11 @@ fi

print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
  USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
  # Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
  # NB: we always build with distributed; USE_DISTRIBUTED turns off all
  # backends (specifically the gloo backend), so test that this case works too
  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then
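A quick way to confirm what a given wheel was built with is to query the distributed package at runtime; a `USE_DISTRIBUTED=0` build reports it as unavailable, while the `distributed` CI build should report both the package and the gloo backend. Sketch, run against the installed wheel:

```bash
# First line is False for USE_DISTRIBUTED=0 builds, True otherwise.
python -c "import torch.distributed as dist; print('distributed available:', dist.is_available())"
# Gloo check only makes sense when distributed is available at all.
python -c "import torch.distributed as dist; print('gloo backend:', dist.is_available() and dist.is_gloo_available())"
```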
@@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd

python -mpip install -r requirements.txt

# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1

python -mpip install --no-input -r requirements.txt

setup_test_python() {
  # The CircleCI worker hostname doesn't resolve to an address.
  # This environment variable makes ProcessGroupGloo default to
@@ -59,7 +55,7 @@ test_python_shard() {

  setup_test_python

  time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS"
  time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS"

  assert_git_not_dirty
}
@@ -322,14 +322,14 @@ test_python_shard() {

  # modify LD_LIBRARY_PATH to ensure it has the conda env.
  # This set of tests has been shown to be buggy without it for the split-build
  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running

  assert_git_not_dirty
}

test_python() {
  # shellcheck disable=SC2086
  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
  assert_git_not_dirty
}

@@ -390,6 +390,7 @@ test_dynamo_wrapped_shard() {
    --exclude-distributed-tests \
    --exclude-torch-export-tests \
    --exclude-aot-dispatch-tests \
    --exclude-quantization-tests \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose \
    --upload-artifacts-while-running
@@ -1162,6 +1163,12 @@ test_distributed() {
  fi
}

test_quantization() {
  echo "Testing quantization"

  python test/test_quantization.py
}

test_rpc() {
  echo "Testing RPC C++ tests"
  # NB: the ending test_rpc must match the current function name for the current
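With the dedicated `quantization` test config, the quantization suite runs once in its own job while the other shards pass `--exclude-quantization-tests`. Reproducing both sides locally looks roughly like this (the shard numbers are illustrative):

```bash
# 1) What the new test_quantization function runs:
python test/test_quantization.py

# 2) What the regular python shards now run (quantization excluded):
python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests \
    --exclude-quantization-tests --shard 1 1
```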
@@ -1579,7 +1586,7 @@ test_executorch() {
test_linux_aarch64() {
  python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
    test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
    test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
    test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \
    distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
    --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
@@ -1655,6 +1662,8 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
  test_executorch
elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
  test_python_legacy_jit
elif [[ "$TEST_CONFIG" == 'quantization' ]]; then
  test_quantization
elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
  # TODO: run some C++ tests
  echo "no-op at the moment"
@@ -1785,6 +1794,8 @@ elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
  test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
  test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then
  test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
  test_h100_cutlass_backend
else
@@ -25,7 +25,7 @@ echo Copying over test times file
robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"

echo Run nn tests
python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
if ERRORLEVEL 1 goto fail

popd
@@ -63,7 +63,7 @@ if errorlevel 1 exit /b 1
call %CONDA_HOME%\condabin\activate.bat testenv
if errorlevel 1 exit /b 1

call conda install -y -q -c conda-forge libuv=1.39
call conda install -y -q -c conda-forge libuv=1.51
call conda install -y -q intel-openmp

echo "install and test libtorch"
@@ -177,8 +177,7 @@ source ~/${desired_python}-build/bin/activate
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
retry brew install libomp

# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is built as part of the tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of the tensorpipe submodule
export USE_DISTRIBUTED=1

export USE_MKLDNN=OFF
@@ -69,6 +69,8 @@ readability-string-compare,
'
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
WarningsAsErrors: '*'
LineFilter:
  - name: '/usr/include/.*'
CheckOptions:
  cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true
  cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true
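For reference, the new keys correspond to ordinary clang-tidy command-line options; invoked by hand (normally lintrunner drives this, so the file and compile flags below are only illustrative) it would look roughly like:

```bash
# --header-filter mirrors HeaderFilterRegex, --warnings-as-errors mirrors WarningsAsErrors;
# the trailing compile flags stand in for whatever the compilation database provides.
clang-tidy \
    --header-filter='^(aten/|c10/|torch/).*$' \
    --warnings-as-errors='*' \
    c10/util/Exception.cpp -- -std=c++17 -I.
```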
3
.github/actionlint.yaml
vendored
@@ -22,6 +22,9 @@ self-hosted-runner:
  - linux.arm64.m7g.4xlarge
  - linux.arm64.m7g.4xlarge.ephemeral
  - linux.arm64.r7g.12xlarge.memory
  - linux.aws.h100
  - linux.aws.h100.4
  - linux.aws.h100.8
  - linux.4xlarge.nvidia.gpu
  - linux.8xlarge.nvidia.gpu
  - linux.16xlarge.nvidia.gpu
2
.github/actions/setup-win/action.yml
vendored
@@ -59,7 +59,7 @@ runs:
      set -x

      # Create new py_tmp env with python-version
      ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp
      ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv

      PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
      EXIT_CODE=$?
2
.github/ci_commit_pins/vllm.txt
vendored
@@ -1 +1 @@
090197034faf3b193c4467cedeb9281e3078892d
8bf8f4582208ac7af230512ff5f3ac1dc36d5222
15
.github/merge_rules.yaml
vendored
@@ -525,6 +525,21 @@
    - Lint
    - pull

- name: typechecking
  patterns:
    - 'pyrefly.toml'
    - 'mypy.ini'
    - 'mypy-strict.ini'
  approved_by:
    - lolpack
    - maggiemoss
    - ndmitchell
    - kinto0
  mandatory_checks_name:
    - EasyCLA
    - Lint
    - pull

- name: superuser
  patterns:
    - '*'
36
.github/pytorch-probot.yml
vendored
@ -1,42 +1,44 @@
|
||||
tracking_issue: 24422
|
||||
ciflow_tracking_issue: 64124
|
||||
ciflow_push_tags:
|
||||
- ciflow/b200
|
||||
- ciflow/b200-symm-mem
|
||||
- ciflow/binaries
|
||||
- ciflow/binaries_libtorch
|
||||
- ciflow/binaries_wheel
|
||||
- ciflow/triton_binaries
|
||||
- ciflow/h100
|
||||
- ciflow/h100-cutlass-backend
|
||||
- ciflow/h100-distributed
|
||||
- ciflow/h100-symm-mem
|
||||
- ciflow/inductor
|
||||
- ciflow/inductor-periodic
|
||||
- ciflow/inductor-rocm
|
||||
- ciflow/inductor-perf-test-nightly-rocm
|
||||
- ciflow/inductor-perf-compare
|
||||
- ciflow/inductor-cu126
|
||||
- ciflow/inductor-micro-benchmark
|
||||
- ciflow/inductor-micro-benchmark-cpu-x86
|
||||
- ciflow/inductor-perf-compare
|
||||
- ciflow/inductor-perf-test-nightly-rocm
|
||||
- ciflow/inductor-perf-test-nightly-x86-zen
|
||||
- ciflow/inductor-cu126
|
||||
- ciflow/inductor-periodic
|
||||
- ciflow/inductor-rocm
|
||||
- ciflow/linux-aarch64
|
||||
- ciflow/mps
|
||||
- ciflow/nightly
|
||||
- ciflow/op-benchmark
|
||||
- ciflow/periodic
|
||||
- ciflow/periodic-rocm-mi300
|
||||
- ciflow/pull
|
||||
- ciflow/quantization-periodic
|
||||
- ciflow/riscv64
|
||||
- ciflow/rocm
|
||||
- ciflow/rocm-mi300
|
||||
- ciflow/s390
|
||||
- ciflow/riscv64
|
||||
- ciflow/slow
|
||||
- ciflow/torchbench
|
||||
- ciflow/triton_binaries
|
||||
- ciflow/trunk
|
||||
- ciflow/unstable
|
||||
- ciflow/xpu
|
||||
- ciflow/vllm
|
||||
- ciflow/torchbench
|
||||
- ciflow/op-benchmark
|
||||
- ciflow/pull
|
||||
- ciflow/h100
|
||||
- ciflow/h100-distributed
|
||||
- ciflow/win-arm64
|
||||
- ciflow/h100-symm-mem
|
||||
- ciflow/h100-cutlass-backend
|
||||
- ciflow/b200
|
||||
- ciflow/xpu
|
||||
retryable_workflows:
|
||||
- pull
|
||||
- trunk
|
||||
@ -45,4 +47,4 @@ retryable_workflows:
|
||||
- inductor-A100-perf-nightly
|
||||
labeler_config: labeler.yml
|
||||
label_to_label_config: label_to_label.yml
|
||||
mergebot: True
|
||||
mergebot: true
|
||||
|
||||
@@ -30,7 +30,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
}

# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
ROCM_ARCHES = ["6.3", "6.4"]
ROCM_ARCHES = ["6.4", "7.0"]

XPU_ARCHES = ["xpu"]
2
.github/scripts/generate_ci_workflows.py
vendored
@@ -155,7 +155,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
        package_type="manywheel",
        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
            OperatingSystem.LINUX,
            arches=["12.8"],
            arches=["13.0"],
            python_versions=["3.12"],
        ),
        branches="main",
@ -71,12 +71,15 @@ jobs:
|
||||
with:!{{ upload.binary_env_as_input(config) }}
|
||||
{%- if "aarch64" in build_environment %}
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
{%- elif "s390x" in build_environment %}
|
||||
runs_on: linux.s390x
|
||||
ALPINE_IMAGE: "docker.io/s390x/alpine"
|
||||
timeout-minutes: 420
|
||||
{%- elif config["gpu_arch_type"] == "rocm" %}
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
timeout-minutes: 300
|
||||
{%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.24xlarge.ephemeral
|
||||
|
||||
2
.github/workflows/_docs.yml
vendored
@@ -67,7 +67,7 @@ jobs:
    # an OOM issue when running the job, so this upgrades the runner from 4xlarge
    # to the next available tier of 12xlarge. So much memory just to generate cpp
    # doc
    runner: ${{ inputs.runner_prefix }}linux.12xlarge
    runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory
    # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now)
    # Let's try to figure out how this can be improved
    timeout-minutes: 360
60
.github/workflows/b200-symm-mem.yml
vendored
Normal file
@@ -0,0 +1,60 @@
name: Limited CI for symmetric memory tests on B200

on:
  pull_request:
    paths:
      - .github/workflows/b200-symm-mem.yml
  workflow_dispatch:
  push:
    tags:
      - ciflow/b200-symm-mem/*
  schedule:
    - cron: 22 8 * * * # about 1:22am PDT

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

permissions:
  id-token: write
  contents: read

jobs:

  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}

  linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm:
    name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '10.0'
      test-matrix: |
        { include: [
          { config: "b200-symm-mem", shard: 1, num_shards: 1, runner: "linux.dgx.b200.8" },
        ]}
    secrets: inherit

  linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
    name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
    uses: ./.github/workflows/_linux-test.yml
    needs:
      - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.test-matrix }}
      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
    secrets: inherit
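Besides the `ciflow/b200-symm-mem/*` tag and the nightly cron, the workflow declares `workflow_dispatch`, so it can also be started by hand. A sketch with the GitHub CLI (requires appropriate permissions on pytorch/pytorch):

```bash
gh workflow run b200-symm-mem.yml --repo pytorch/pytorch --ref main
gh run list --repo pytorch/pytorch --workflow b200-symm-mem.yml --limit 3
```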
2
.github/workflows/build-almalinux-images.yml
vendored
@@ -36,7 +36,7 @@ jobs:
    runs-on: linux.9xlarge.ephemeral
    strategy:
      matrix:
        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"]
        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "rocm7.0", "cpu"]
    steps:
      - name: Build docker image
        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
2
.github/workflows/build-libtorch-images.yml
vendored
@@ -52,8 +52,8 @@ jobs:
          { tag: "cuda12.9" },
          { tag: "cuda12.8" },
          { tag: "cuda12.6" },
          { tag: "rocm6.3" },
          { tag: "rocm6.4" },
          { tag: "rocm7.0" },
          { tag: "cpu" },
        ]
    steps:
2
.github/workflows/build-magma-rocm-linux.yml
vendored
@@ -34,7 +34,7 @@ jobs:
      id-token: write
    strategy:
      matrix:
        rocm_version: ["64", "63"]
        rocm_version: ["70", "64"]
    steps:
      - name: Checkout PyTorch
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
2
.github/workflows/build-manywheel-images.yml
vendored
@@ -52,8 +52,8 @@ jobs:
          { name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", runner: "linux.9xlarge.ephemeral" },
9
.github/workflows/build-triton-wheel.yml
vendored
@ -50,12 +50,12 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
|
||||
py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
|
||||
device: ["cuda", "rocm", "xpu", "aarch64"]
|
||||
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
|
||||
include:
|
||||
- device: "rocm"
|
||||
rocm_version: "6.4"
|
||||
rocm_version: "7.0"
|
||||
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
|
||||
- device: "cuda"
|
||||
rocm_version: ""
|
||||
@ -108,9 +108,6 @@ jobs:
|
||||
|
||||
# Determine python executable for given version
|
||||
case $PY_VERS in
|
||||
3.9)
|
||||
PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python
|
||||
;;
|
||||
3.10)
|
||||
PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python
|
||||
;;
|
||||
@ -194,7 +191,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
|
||||
py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
|
||||
device: ["xpu"]
|
||||
timeout-minutes: 40
|
||||
env:
|
||||
|
||||
59
.github/workflows/create_release.yml
vendored
@ -35,6 +35,7 @@ jobs:
|
||||
contents: write
|
||||
outputs:
|
||||
pt_release_name: ${{ steps.release_name.outputs.pt_release_name }}
|
||||
pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }}
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
with:
|
||||
@ -53,8 +54,12 @@ jobs:
|
||||
tag_or_branch="${tag_or_branch#refs/heads/}"
|
||||
# replace directory separators with _ in branch name
|
||||
tag_or_branch="${tag_or_branch//\//_}"
|
||||
echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV"
|
||||
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
|
||||
torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')"
|
||||
{
|
||||
echo "PT_RELEASE_NAME=pytorch-$tag_or_branch";
|
||||
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz";
|
||||
echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz";
|
||||
} >> "$GITHUB_ENV"
|
||||
- name: Checkout optional submodules
|
||||
run: python3 tools/optional_submodules.py
|
||||
- name: Copy docs requirements for inclusion
|
||||
@ -64,30 +69,47 @@ jobs:
|
||||
cp .ci/docker/requirements-docs.txt docs/requirements.txt
|
||||
- name: Create source distribution
|
||||
run: |
|
||||
# Create new folder with specified name so extracting the archive yields that
|
||||
rm -rf "/tmp/$PT_RELEASE_NAME"
|
||||
cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
|
||||
mv "/tmp/$PT_RELEASE_NAME" .
|
||||
# Cleanup
|
||||
rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
|
||||
find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
|
||||
# Create archive
|
||||
tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
|
||||
echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
|
||||
# Create new folder with specified name so extracting the archive yields that
|
||||
rm -rf "/tmp/$PT_RELEASE_NAME"
|
||||
cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
|
||||
mv "/tmp/$PT_RELEASE_NAME" .
|
||||
# Cleanup
|
||||
rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
|
||||
find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
|
||||
# Create archive
|
||||
tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
|
||||
echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
|
||||
- name: Create PEP 517 compatible source distribution
|
||||
run: |
|
||||
pip install build==1.2.2.post1 || exit 1
|
||||
python -m build --sdist || exit 1
|
||||
cd dist || exit 1
|
||||
- name: Upload source distribution for release
|
||||
if: ${{ github.event_name == 'release' }}
|
||||
uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2
|
||||
with:
|
||||
files: ${{env.PT_RELEASE_FILE}}
|
||||
- name: Upload source distribution to GHA artifacts for release tags
|
||||
files: |
|
||||
${{ env.PT_RELEASE_FILE }}
|
||||
${{ env.PT_PEP517_RELEASE_FILE }}
|
||||
- name: Upload source distribution to GHA artifacts # for release tags
|
||||
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
|
||||
with:
|
||||
name: ${{ env.PT_RELEASE_FILE }}
|
||||
path: ${{ env.PT_RELEASE_FILE }}
|
||||
- name: Upload PEP 517 source distribution to GHA artifacts # for release tags
|
||||
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
|
||||
with:
|
||||
name: ${{ env.PT_PEP517_RELEASE_FILE }}
|
||||
path: dist/${{ env.PT_PEP517_RELEASE_FILE }}
|
||||
- name: Set output
|
||||
id: release_name
|
||||
run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"
|
||||
run: |
|
||||
{
|
||||
echo "pt_release_name=${{ env.PT_RELEASE_FILE }}";
|
||||
echo "pt_pep517_release_name=${{ env.PT_PEP517_RELEASE_FILE }}";
|
||||
} >> "${GITHUB_OUTPUT}"
|
||||
|
||||
upload_source_code_to_s3:
|
||||
if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
|
||||
@ -103,6 +125,9 @@ jobs:
|
||||
- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
|
||||
with:
|
||||
name: ${{ needs.release.outputs.pt_release_name }}
|
||||
- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
|
||||
with:
|
||||
name: ${{ needs.release.outputs.pt_pep517_release_name }}
|
||||
- name: Configure AWS credentials(PyTorch account)
|
||||
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
|
||||
with:
|
||||
@ -113,7 +138,9 @@ jobs:
|
||||
s3-bucket: pytorch
|
||||
s3-prefix: source_code/test
|
||||
if-no-files-found: warn
|
||||
path: ${{ needs.release.outputs.pt_release_name }}
|
||||
path: |
|
||||
${{ needs.release.outputs.pt_release_name }}
|
||||
${{ needs.release.outputs.pt_pep517_release_name }}
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
|
||||
|
||||
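The new PEP 517 release artifact is produced with the standard `build` frontend, mirrored here as a standalone sketch runnable from a PyTorch checkout (the resulting file name depends on the version reported by `tools/generate_torch_version.py`):

```bash
pip install build==1.2.2.post1
python -m build --sdist        # writes dist/torch-<version>.tar.gz
ls dist/
```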
56
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
@ -62,7 +62,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -128,7 +128,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -174,7 +174,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -220,7 +220,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -265,7 +265,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -331,7 +331,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -377,7 +377,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -423,7 +423,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -468,7 +468,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -534,7 +534,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -580,7 +580,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -626,7 +626,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -671,7 +671,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -737,7 +737,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -783,7 +783,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -829,7 +829,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -874,7 +874,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -940,7 +940,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -986,7 +986,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -1032,7 +1032,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -1077,7 +1077,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
DESIRED_PYTHON: "3.14"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -1143,7 +1143,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
|
||||
DESIRED_PYTHON: "3.14"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -1189,7 +1189,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
|
||||
DESIRED_PYTHON: "3.14"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -1235,7 +1235,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.14"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -1280,7 +1280,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
|
||||
DESIRED_PYTHON: "3.14t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -1346,7 +1346,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
|
||||
DESIRED_PYTHON: "3.14t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -1392,7 +1392,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
|
||||
DESIRED_PYTHON: "3.14t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -1438,7 +1438,7 @@ jobs:
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.14t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
runs_on: linux.arm64.r7g.12xlarge.memory
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
|
||||
230
.github/workflows/generated-linux-binary-libtorch-nightly.yml
generated
vendored
@ -316,120 +316,6 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
libtorch-rocm6_3-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.3
|
||||
GPU_ARCH_VERSION: "6.3"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: libtorch-rocm6_3-shared-with-deps-release
|
||||
build_environment: linux-binary-libtorch
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
libtorch-rocm6_3-shared-with-deps-release-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- libtorch-rocm6_3-shared-with-deps-release-build
|
||||
- get-label-type
|
||||
runs-on: linux.rocm.gpu.mi250
|
||||
timeout-minutes: 240
|
||||
env:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.3
|
||||
GPU_ARCH_VERSION: "6.3"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
SKIP_ALL_TESTS: 1
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
steps:
|
||||
- name: Setup ROCm
|
||||
uses: ./.github/actions/setup-rocm
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
name: Download Build Artifacts
|
||||
with:
|
||||
name: libtorch-rocm6_3-shared-with-deps-release
|
||||
path: "${{ runner.temp }}/artifacts/"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
- name: ROCm set GPU_FLAG
|
||||
run: |
|
||||
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
aws-region: us-east-1
|
||||
role-duration-seconds: 18000
|
||||
- name: Calculate docker image
|
||||
id: calculate-docker-image
|
||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||
with:
|
||||
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
|
||||
docker-image-name: libtorch-cxx11-builder
|
||||
custom-tag-prefix: rocm6.3
|
||||
docker-build-dir: .ci/docker
|
||||
working-directory: pytorch
|
||||
- name: Pull Docker image
|
||||
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||
with:
|
||||
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Test Pytorch binary
|
||||
uses: ./pytorch/.github/actions/test-pytorch-binary
|
||||
env:
|
||||
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Teardown ROCm
|
||||
uses: ./.github/actions/teardown-rocm
|
||||
libtorch-rocm6_3-shared-with-deps-release-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: libtorch-rocm6_3-shared-with-deps-release-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.3
|
||||
GPU_ARCH_VERSION: "6.3"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
build_name: libtorch-rocm6_3-shared-with-deps-release
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml

libtorch-rocm6_4-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -447,6 +333,7 @@ jobs:
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm6_4-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
@@ -543,3 +430,118 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

libtorch-rocm7_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm7_0-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm7_0-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm7_0-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm7_0-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm7.0
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm7_0-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm7_0-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm7_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
24 .github/workflows/generated-linux-binary-manywheel-main.yml (generated, vendored)
@@ -42,7 +42,7 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_12-cuda12_8-build:
manywheel-py3_12-cuda13_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@@ -51,22 +51,22 @@ jobs:
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: "12.8"
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
manywheel-py3_12-cuda13_0-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_12-cuda12_8-build
- manywheel-py3_12-cuda13_0-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
@@ -74,13 +74,13 @@ jobs:
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: "12.8"
DESIRED_CUDA: cu130
GPU_ARCH_VERSION: "13.0"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_8
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
1568 .github/workflows/generated-linux-binary-manywheel-nightly.yml (generated, vendored)
File diff suppressed because it is too large
1 .github/workflows/generated-linux-binary-manywheel-rocm-main.yml (generated, vendored)
@@ -60,6 +60,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: manywheel-py3_10-rocm6_4
build_environment: linux-binary-manywheel-rocm
secrets:
2 .github/workflows/pull.yml (vendored)
@@ -127,6 +127,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
54 .github/workflows/quantization-periodic.yml (vendored, Normal file)
@@ -0,0 +1,54 @@
name: quantization-periodic

on:
push:
tags:
- ciflow/quantization-periodic/*
workflow_dispatch:
schedule:
# run weekly
- cron: "45 0 * * 0"

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true

permissions:
id-token: write
contents: read

jobs:
get-default-label-prefix:
name: get-default-label-prefix
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf

periodic-quantization-build:
name: periodic-quantization-build
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.9'
test-matrix: |
{ include: [
{ config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
periodic-test-quantization:
name: periodic-test-quantization
uses: ./.github/workflows/_linux-test.yml
needs: periodic-quantization-build
with:
build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }}
secrets: inherit
2 .github/workflows/slow.yml (vendored)
@@ -140,6 +140,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
1 .gitignore (vendored)
@@ -82,6 +82,7 @@ torch/return_types.pyi
torch/nn/functional.pyi
torch/utils/data/datapipes/datapipe.pyi
torch/csrc/autograd/generated/*
torch/csrc/functionalization/generated/*
torch/csrc/lazy/generated/*.[!m]*
torch_compile_debug/
# Listed manually because some files in this directory are not generated
@@ -49,7 +49,7 @@ init_command = [
'mccabe==0.7.0',
'pycodestyle==2.14.0',
'pyflakes==3.4.0',
'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
'torchfix==0.4.0 ; python_version >= "3.10" and python_version < "3.13"',
]

@@ -153,7 +153,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"',
'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
'numpy==2.1.0 ; python_version >= "3.12"',
'expecttest==0.3.0',
'mypy==1.16.0',
@@ -1453,7 +1453,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'usort==1.0.8.post1',
'isort==6.0.1',
'ruff==0.12.9', # sync with RUFF
'ruff==0.13.1', # sync with RUFF
]
is_formatter = true

@@ -1587,7 +1587,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.12.9', # sync with PYFMT
'ruff==0.13.1', # sync with PYFMT
]
is_formatter = true
37 BUILD.bazel
@@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",
@@ -90,6 +91,8 @@ generated_cpu_cpp = [
"aten/src/ATen/NativeMetaFunctions.h",
"aten/src/ATen/RegistrationDeclarations.h",
"aten/src/ATen/VmapGeneratedPlumbing.h",
"aten/src/ATen/ViewMetaClasses.h",
"aten/src/ATen/ViewMetaClasses.cpp",
"aten/src/ATen/core/aten_interned_strings.h",
"aten/src/ATen/core/enum_tag.h",
"aten/src/ATen/core/TensorBody.h",
@@ -810,7 +813,7 @@ cc_library(
name = "torch_python",
srcs = libtorch_python_core_sources
+ if_cuda(libtorch_python_cuda_sources)
+ libtorch_python_distributed_sources
+ if_cuda(libtorch_python_distributed_sources)
+ GENERATED_AUTOGRAD_PYTHON,
hdrs = glob([
"torch/csrc/generic/*.cpp",
@@ -832,36 +835,6 @@ pybind_extension(
],
)

cc_library(
name = "functorch",
hdrs = glob([
"functorch/csrc/dim/*.h",
]),
srcs = glob([
"functorch/csrc/dim/*.cpp",
]),
deps = [
":aten_nvrtc",
":torch_python",
"@pybind11",
],
)

pybind_extension(
name = "functorch/_C",
copts=[
"-DTORCH_EXTENSION_NAME=_C"
],
srcs = [
"functorch/csrc/init_dim_only.cpp",
],
deps = [
":functorch",
":torch_python",
":aten_nvrtc",
],
)

cc_binary(
name = "torch/bin/torch_shm_manager",
srcs = [
@@ -902,7 +875,6 @@ py_library(
],
data = [
":torch/_C.so",
":functorch/_C.so",
":torch/bin/torch_shm_manager",
],
)
@@ -1105,6 +1077,7 @@ test_suite(
"aten/src/ATen/templates/LazyNonNativeIr.h",
"aten/src/ATen/templates/RegisterDispatchKey.cpp",
"aten/src/ATen/templates/RegisterDispatchDefinitions.ini",
"aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp",
"aten/src/ATen/native/native_functions.yaml",
"aten/src/ATen/native/tags.yaml",
"aten/src/ATen/native/ts_native_functions.yaml",
@@ -180,9 +180,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()

# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@@ -262,11 +261,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@@ -438,11 +437,12 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.51' to install libuv."
)
else()
set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../)
@@ -1390,10 +1390,6 @@ endif()
include(cmake/Summary.cmake)
caffe2_print_configuration_summary()

if(BUILD_FUNCTORCH)
add_subdirectory(functorch)
endif()

# Parse custom debug info
if(DEFINED USE_CUSTOM_DEBINFO)
string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
105 MANIFEST.in
@@ -1,20 +1,61 @@
# Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html

# Include source files in SDist
include CMakeLists.txt
include *.bzl *.bazel .bazel* BUILD *.BUILD BUILD.* WORKSPACE
include BUCK BUCK.*
include requirements*.txt
include version.txt
include [Mm]akefile *.[Mm]akefile [Mm]akefile.*
include [Dd]ockerfile *.[Dd]ockerfile [Dd]ockerfile.* .dockerignore
# Include individual top-level files
include CITATION.cff
include CODEOWNERS
include Dockerfile
include LICENSE
include MANIFEST.in
include Makefile
include NOTICE
include .bc-linter.yml
include .clang-format .clang-tidy
include .cmakelintrc
include .coveragerc
include .dockerignore
include .editorconfig
include .flake8
include .gdbinit
include .lintrunner.toml
include .lldbinit
include codex_setup.sh
include docker.Makefile
include pyrefly.toml
include ubsan.supp

# Include bazel and BUCK related files
include BUILD.bazel BUCK.oss
include WORKSPACE
include *.bzl
include .bazelignore .bazelrc .bazelversion

# Include general configuration files
include *.ini
# Include important top-level information
include *.md
# Include technical text files at the moment, comprises
# version.txt, CMakeLists.txt, requirements.txt
include *.txt

# Include ctags configuration
include .ctags.d/*.ctags

# Include subfolders completely
graft .devcontainer
graft .vscode
graft android
graft aten
graft benchmarks
graft binaries
graft c10
graft caffe2
graft cmake
graft docs
graft functorch
graft ios
graft mypy_plugins
graft scripts
graft test
graft third_party
graft tools
graft torch
@@ -22,29 +63,37 @@ graft torchgen
# FIXME: torch-xla build during codegen will fail if include this file in wheel
exclude torchgen/BUILD.bazel

# Misc files and directories in SDist
include *.md
include CITATION.cff
include LICENSE NOTICE
include mypy*.ini
graft benchmarks
graft docs
graft mypy_plugins
graft scripts
# The following exclusions omit parts from third-party dependencies that
# contain invalid symlinks[1] and that are not needed for pytorch, such as
# bindings for unused languages
prune third_party/flatbuffers/java
prune third_party/flatbuffers/kotlin
prune third_party/ittapi/rust
prune third_party/nccl/pkg/debian
prune third_party/opentelemetry-cpp/third_party/prometheus-cpp/cmake/project-import-*

# The following document is also an invalid symlink[1] and superfluous
exclude third_party/flatbuffers/docs/source/CONTRIBUTING.md

# Omit autogenerated code
prune torchgen/packaged

# Omit caches, compiled, and scm related content
prune */__pycache__
prune **/.github
prune **/.gitlab
global-exclude *.o *.obj *.so *.dylib *.a *.pxd *.dll *.lib
global-exclude *.py[cod] *.swp *~
global-exclude .git .git-blame-ignore-revs .gitattributes .gitignore .gitmodules
global-exclude .gitlab-ci.yml

# Misc files needed for custom setuptools command
include .gitignore
include .gitmodules

# Include test suites in SDist
graft test
include pytest.ini
include .coveragerc
# [1] Invalid symlinks for the purposes of Python source distributions are,
# according to the source distribution format[2] links pointing outside the
# destination directory or links with a `..` component, which is those of
# concern here.

# Prune generated/compiled files
prune torchgen/packaged
prune */__pycache__
global-exclude *.o *.obj *.so *.a *.dylib *.pxd *.dll *.lib *.py[cod]

prune */.git
global-exclude .git *~ *.swp
# [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features
@@ -161,7 +161,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)

#### Prerequisites
If you are installing from source, you will need:
- Python 3.9 or later
- Python 3.10 or later
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
- Visual Studio or Visual Studio Build Tool (Windows only)

@@ -275,7 +275,7 @@ conda install pkg-config libuv
pip install mkl-static mkl-include
# Add these packages if torch.distributed is needed.
# Distributed package support on Windows is a prototype feature and is subject to changes.
conda install -c conda-forge libuv
conda install -c conda-forge libuv=1.51
```

#### Install PyTorch

@@ -468,7 +468,7 @@ inline Tensor _sum_to(
// if we assume no reduction due to unbacked we ensure that at runtime.
TORCH_MAYBE_SYM_CHECK(
sym_eq(shape[i - leading_dims], sizes[i]),
"non-reduction path was assumed due to unabcked symbols expected those two sizes to be the same:",
"non-reduction path was assumed due to unbacked symbols expected those two sizes to be the same:",
shape[i - leading_dims],
", ",
sizes[i])
@@ -9,11 +9,6 @@

namespace at::functionalization {

ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
if (out_idx == this->out_index) return *this;
return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx);
}

// Note [Functionalization: Alias Removal Part 2]
// See Note [Functionalization: Alias Removal] for more details.
// This function applies a single update from one of the views to the StorageImpl.
@@ -42,12 +37,12 @@ ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) {
at::Tensor t = update.new_val;
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
if (update.view_metas.empty()) return t;
if (update.view_metas.empty()) { return t; }

std::vector<at::Tensor> tmp_values({base});
tmp_values.reserve(update.view_metas.size());
for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back());
// NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided
// All of these ops require additional information to recover the sizes of the original tensor.
// If need to, we could probably apply this optimization and only bother computing tmp_values
@@ -55,9 +50,8 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co
tmp_values.push_back(std::move(next_view));
}
for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) {
int64_t out_idx = update.view_metas[i].out_index;
// Each view inverse is implemented in ViewInverses.cpp.
t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
t = update.view_metas[i]->reverse(tmp_values[i], t);
}
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
return t;
@@ -111,13 +105,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
}

void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) {
void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<std::shared_ptr<ViewMeta>>& metas) {
TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage");

if (metas.size() > 1) {
for (size_t i = 1; i < metas.size(); ++i) {
// Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI
TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided,
TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided,
"During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i,
" was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today,"
"so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you "
@@ -8,44 +8,89 @@ namespace at::functionalization {

// See Note [Functionalization Pass In Core]

enum class InverseReturnMode {
/// Specifies that functional inverses should always return a view.
AlwaysView,
/// Specifies that functional inverses should always return a non-view / copy.
NeverView,
/// Specifies that functional inverses should return a view unless a (copying)
/// scatter
/// inverse exists, in which case that will be used instead.
/// This avoids as_strided() calls that can be difficult for subclasses to
/// handle.
ViewOrScatterInverse,
};

#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \
static const char* name() { \
return #TYPE; \
}

#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \
using SerializableTuple = std::tuple<__VA_ARGS__>

// ViewMeta is a class used by the functionalization pass to navigate between
// a base tensor and a view tensor.
// For example, if I call `b = a.view1(...)`
// the functionalization pass will generate and store a ViewMeta on b that looks
// like:
// the functionalization pass will generate and store a ViewMeta specialization
// for `view1` operation on b that looks like:
//
// ViewMeta(
// [<captures>](const Tensor& base, int64_t mutated_view_idx) {
// return base.view1(...);
// },
// [<captures>](const at::Tensor& base, const at::Tensor& mutated_view,
// int64_t mutated_view_idx) -> at::Tensor {
// return at::functionalization::impl::view1_inverse(base, mutated_view,
// ...);
// struct TORCH_API view1_ViewMeta : public ViewMeta {
// FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta);
// FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
// bool /* reapply_views */,
// const std::vector<int64_t>&);
//
// view1_ViewMeta(const SerializableTuple& tpl)
// : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
//
// view1_ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
// : ViewMeta(/*has_symbolic_inputs=*/false),
// reapply_views(reapply_views),
// size(size) {}
//
// Tensor forward(const Tensor& base) override {
// return base.view1(...);
// }
//
// The forward_fn lambda describes how to replay view1 on a tensor.
// Tensor reverse(const Tensor& base, const Tensor& mutated_view) override {
// return at::functionalization::impl::view1_inverse(base, mutated_view,
// ...);
// }
//
// The reverse_fn lambda describes how, given a tensor that is already a view,
// SerializableTuple to_serializable_tuple() {
// return std::make_tuple(reapply_views, size);
// }
//
// bool reapply_views;
// std::vector<int64_t> size;
// };
//
// The forward function describes how to replay view1 on a tensor.
//
// The reverse function describes how, given a tensor that is already a view,
// how to get the corresponding base tensor. See Note [Functionalization Pass:
// View Inverses] for details.
//
// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type
// representing the `ViewMeta` instance state. Methods that take in/return such
// a type are used for supporting pickle serialization.
struct ViewMeta {
ViewMeta(
std::function<Tensor(const Tensor&, int64_t)> forward,
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
bool has_symbolic_inputs,
bool is_multi_output = false,
bool is_as_strided = false,
int64_t out_idx = 0)
: forward_fn(std::move(forward)),
reverse_fn(std::move(reverse)),
out_index(out_idx),
: out_index(out_idx),
is_multi_output(is_multi_output),
is_as_strided(is_as_strided),
has_symbolic_inputs(has_symbolic_inputs) {}

std::function<Tensor(const Tensor&, int64_t)> forward_fn;
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
virtual ~ViewMeta() = default;

virtual Tensor forward(const Tensor& base) = 0;
virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0;

// See Note [out_idx in ViewMeta]
int64_t out_index;

@@ -57,10 +102,17 @@ struct ViewMeta {
// Tells us if this view operation has any symbolic inputs
bool has_symbolic_inputs;

// Returns a copy of the current ViewMeta, if out_idx matches the current
// out_index. Otherwise, returns a new ViewMeta with the same forward/reverse
// Returns a new ViewMeta with the same forward/reverse
// functions, but a new out index.
ViewMeta to_out_idx(int64_t out_idx);
//
// This method should be implemented by those `ViewMeta` that have more than
// one output.
virtual std::shared_ptr<ViewMeta> to_out_index(int64_t out_index) {
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"ViewMeta::to_out_index not implemented. ",
"Likely because there's only one output.");
}
};

// FunctionalStorageImpl is a subclass of StorageImpl used by the
@@ -93,14 +145,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
// NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
const at::Tensor new_val;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
const std::vector<ViewMeta> view_metas;
const std::vector<std::shared_ptr<ViewMeta>> view_metas;
};

explicit FunctionalStorageImpl(const Tensor& value);

void add_update(
const Tensor& updated_val,
const std::vector<ViewMeta>& view_metas);
const std::vector<std::shared_ptr<ViewMeta>>& view_metas);
bool apply_updates();
const Tensor& base() {
return base_;

@@ -129,17 +129,19 @@ void FunctionalTensorWrapper::freeze_storage() const {
// - view_value: The output tensor that we need to wrap.
// - base: The "base" of the view that `view_value` was generated from.
// See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta)
: c10::TensorImpl(
c10::DispatchKeySet(DispatchKey::Functionalize),
view_value.dtype(),
base->storage().data_ptr().device()
),
value_(view_value),
is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
was_storage_changed_(base->was_storage_changed_),
is_symbolic_(base->is_symbolic_)
{
FunctionalTensorWrapper::FunctionalTensorWrapper(
const Tensor& view_value,
const FunctionalTensorWrapper* base,
const std::shared_ptr<functionalization::ViewMeta>& meta)
: c10::TensorImpl(
c10::DispatchKeySet(DispatchKey::Functionalize),
view_value.dtype(),
base->storage().data_ptr().device()),
value_(view_value),
is_multi_output_view_(
base->is_multi_output_view_ || meta->is_multi_output),
was_storage_changed_(base->was_storage_changed_),
is_symbolic_(base->is_symbolic_) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_));
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
set_constructor_metadata();
@@ -148,11 +150,10 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
view_metas_ = base->view_metas_; // copy
}
view_metas_.push_back(meta);
maybe_mark_symbolic(meta);
maybe_mark_symbolic(meta.get());
storage_ = base->storage_; // alias this tensor's storage with the base tensor's
}

functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
}
@@ -176,18 +177,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
}

// See Note [Functionalization Pass - Inplace View Ops]
void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr<at::functionalization::ViewMeta>& meta) {
view_metas_.push_back(meta);
// Manually track the fact that this tensor received a metadata mutation!
has_metadata_mutation_ = true;
// Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
maybe_mark_symbolic(meta);
maybe_mark_symbolic(meta.get());
// Note [Functionalization Pass - Inplace View Ops]
// So, these ops are special - they're mutation AND view ops. They get special codegen.
// An example is transpose_, e.g. `a.transpose_()`
// Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
at::AutoDispatchSkipFunctionalize guard;
value_ = meta.forward_fn(value_, meta.out_index);
value_ = meta->forward(value_);
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
}

@@ -368,15 +369,8 @@ void FunctionalTensorWrapper::sync_() {
regenerate_from_base();
}

Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) {
auto t = base;

// Reapply views to get the viewed tensor from the base in alias_
for (auto& view_meta: view_metas_) {
t = view_meta.forward_fn(t, view_meta.out_index);
}

return t;
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& FunctionalTensorWrapper::view_metas() const {
return view_metas_;
}

void FunctionalTensorWrapper::regenerate_from_base() {
@@ -385,7 +379,7 @@ void FunctionalTensorWrapper::regenerate_from_base() {
auto t = storage_impl->base();

TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
t = apply_view_metas(t);
t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_);
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));

replace_(t, /*from_lazy_regenerate=*/true);
@@ -727,11 +721,11 @@ bool isFunctionalTensor(const std::optional<Tensor>& t) {
}

bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
if (t_list.empty()) return false;
if (t_list.empty()) { return false; }
auto functional_count = 0;
for (const auto i : c10::irange(t_list.size())) {
auto const & e= t_list[i];
if (!e.has_value() || !e->defined()) continue;
if (!e.has_value() || !e->defined()) { continue; }
if (isFunctionalTensor(e)) {
++functional_count;
}
@@ -741,10 +735,10 @@ bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {

template <typename T>
static bool isFunctionalTensorIListRef(c10::IListRef<T> list) {
if (list.size() == 0) return false;
if (list.size() == 0) { return false; }
auto functional_count = 0;
for (const auto& tensor : list) {
if (!tensor.defined()) continue;
if (!tensor.defined()) { continue; }
if (isFunctionalTensor(tensor)) {
++functional_count;
}
@@ -762,20 +756,28 @@ void freeze_functional_tensor(const Tensor& tensor) {
functional_base_impl->freeze_storage();
}

Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
Tensor create_functional_tensor_with_view_meta(
const at::Tensor& view_to_wrap,
const at::Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta,
int64_t out_idx) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
auto meta_ = meta;
if (out_idx != 0) {
// Note [out_idx in ViewMeta]
// When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
// Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
meta = meta.to_out_idx(out_idx);
meta_ = meta->to_out_index(out_idx);
}
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta_);
}

std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) {
std::vector<Tensor> create_functional_tensor_with_view_meta(
ITensorListRef view_to_wrap,
const at::Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta) {
std::vector<Tensor> outputs(view_to_wrap.size());
int64_t i = 0;
for (const auto& tensor : view_to_wrap) {
@@ -785,12 +787,22 @@ std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_
return outputs;
}

void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) {
void mutate_view_meta(const at::Tensor& self, const std::shared_ptr<functionalization::ViewMeta>& meta) {
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
self_impl->mutate_view_meta(meta);
}

Tensor apply_view_meta_sequence(
const Tensor& base,
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence) {
Tensor r = base;
for (auto& vm : sequence) {
r = vm->forward(r);
}
return r;
}

// Note [Propagating strides in the functionalization pass]
// In order to properly compute stride information, the functionalization pass
// calls each {view} reference implementations with meta tensors.
@@ -884,7 +896,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s
const auto& ivalue = returns[idx];
if (ivalue.isTensor()) {
const auto& t = ivalue.toTensor();
if (!t.defined()) continue;
if (!t.defined()) { continue; }
at::functionalization::impl::sync(t);
auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new;
@@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
explicit FunctionalTensorWrapper(
const Tensor& view_value,
const FunctionalTensorWrapper* base,
const functionalization::ViewMeta& meta);
const std::shared_ptr<functionalization::ViewMeta>& meta);

// Get the underlying, actual tensor, that doesn't know anything about
// functionalization.
@@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
->are_all_mutations_under_no_grad_or_inference_mode();
}

void maybe_mark_symbolic(const functionalization::ViewMeta& meta) {
is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs;
void maybe_mark_symbolic(functionalization::ViewMeta* meta) {
is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs;
}

bool is_symbolic() const {
return is_symbolic_;
}

// Runs the forward_fn of every ViewMeta collected in the current instance
// to some other base.
Tensor apply_view_metas(const Tensor& base);
// Retrieves the ViewMeta sequence of this tensor.
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& view_metas()
const;

// Sync's the underlying tensor with its alias, if it's out of date. This
// involves two steps: 1) Apply any pending updates/mutations to the alias 2)
@@ -146,7 +146,8 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
// from the base tensor. This method is used by inplace-view ops like
// transpose_. It appends a ViewMeta to the existing stack, and refreshes the
// tensor by replaying the views off of the alias.
void mutate_view_meta(const at::functionalization::ViewMeta& meta);
void mutate_view_meta(
const std::shared_ptr<at::functionalization::ViewMeta>& meta);

// Custom implementation of self.set_(src)
void set__impl(const FunctionalTensorWrapper* other);
@@ -285,7 +286,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
bool is_symbolic_ = false;

size_t generation_ = 0;
std::vector<at::functionalization::ViewMeta> view_metas_;
std::vector<std::shared_ptr<at::functionalization::ViewMeta>> view_metas_;

protected:
static void copy_tensor_metadata(
@@ -377,16 +378,20 @@ TORCH_API void propagate_xla_data_direct(
Tensor create_functional_tensor_with_view_meta(
const Tensor& view_to_wrap,
const Tensor& base,
functionalization::ViewMeta meta,
const std::shared_ptr<functionalization::ViewMeta>& meta,
int64_t out_idx = 0);
std::vector<Tensor> create_functional_tensor_with_view_meta(
ITensorListRef view_to_wrap,
const Tensor& base,
const functionalization::ViewMeta& meta);
const std::shared_ptr<functionalization::ViewMeta>& meta);

void mutate_view_meta(
const Tensor& self,
const functionalization::ViewMeta& meta);
const std::shared_ptr<functionalization::ViewMeta>& meta);

TORCH_API Tensor apply_view_meta_sequence(
const Tensor& base,
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence);

void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
void set_sizes_strides_offset(

@@ -1,3 +1,5 @@
#include <ATen/FunctionalizeFallbackKernel.h>

#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/EmptyTensor.h>
@@ -7,7 +9,6 @@
#include <torch/library.h>
#include <c10/util/irange.h>
#include <c10/util/strides.h>
#include <ATen/EmptyTensor.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/ATen.h>
@@ -28,6 +29,31 @@
#include <utility>
#endif

namespace at::functionalization {

Tensor resize__ViewMeta::forward(const Tensor& base) {
if (reapply_views) {
return base.as_strided(size, c10::contiguous_strides(size));
} else {
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
}
}

Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
return base.as_strided_scatter(
mutated_view, size, c10::contiguous_strides(size));
}

Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) {
return at::_unsafe_view_symint(base, size);
}

Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
}

} // namespace at::functionalization

namespace {
void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) {
const auto& schema = op.schema();
@@ -106,7 +132,9 @@ namespace {
const auto& ivalue = returns[idx];
if (ivalue.isTensor() && should_wrap_outputs) {
const auto& t = ivalue.toTensor();
if (!t.defined()) continue;
if (!t.defined()) {
continue;
}
auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new;
} else if (ivalue.isTensorList() && should_wrap_outputs) {
@@ -169,19 +197,8 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch
// The output of resizing is equivalent to taking a slice of a larger tensor.
// We have to emulate this "slicing" with an as_strided call.
auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS();
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
[reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
if (reapply_views) {
return base.as_strided(size, c10::contiguous_strides(size));
} else {
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
}
},
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size));
},
/*has_symbolic_inputs=*/false
);
auto view_meta = std::make_shared<at::functionalization::resize__ViewMeta>(
reapply_views, size.vec());
at::functionalization::impl::mutate_view_meta(self, view_meta);
return self;
}
@@ -300,17 +317,11 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt
tmp_output = at::_unsafe_view_symint(self_, size);
}

bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });

at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
[size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return at::_unsafe_view_symint(base, size);
},
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
},
/*has_symbolic_inputs=*/has_symbolic_inputs
);
bool has_symbolic_inputs = std::any_of(
size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
auto view_meta =
std::make_shared<at::functionalization::_unsafe_view_ViewMeta>(
has_symbolic_inputs, size.vec());

auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta));
// See Note [Propagating strides in the functionalization pass]
58
aten/src/ATen/FunctionalizeFallbackKernel.h
Normal file
58
aten/src/ATen/FunctionalizeFallbackKernel.h
Normal file
@ -0,0 +1,58 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/FunctionalStorageImpl.h>
|
||||
|
||||
namespace at::functionalization {
|
||||
|
||||
// `ViewMeta` implementation for `resize_` operation.
|
||||
struct TORCH_API resize__ViewMeta : public ViewMeta {
|
||||
FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta)
|
||||
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
|
||||
bool /* reapply_views */,
|
||||
const std::vector<int64_t>&);
|
||||
|
||||
resize__ViewMeta(const SerializableTuple& tpl)
|
||||
: resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
|
||||
|
||||
resize__ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
|
||||
: ViewMeta(/*has_symbolic_inputs=*/false),
|
||||
reapply_views(reapply_views),
|
||||
size(size) {}
|
||||
|
||||
Tensor forward(const Tensor& base) override;
|
||||
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
|
||||
|
||||
SerializableTuple to_serializable_tuple() {
|
||||
return std::make_tuple(reapply_views, size);
|
||||
}
|
||||
|
||||
bool reapply_views;
|
||||
std::vector<int64_t> size;
|
||||
};
|
||||
|
||||
// `ViewMeta` implementation for `_unsafe_view` operation.
|
||||
struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta {
|
||||
FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta)
|
||||
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
|
||||
bool /* has_symbolic_inputs */,
|
||||
const std::vector<c10::SymInt>&);
|
||||
|
||||
_unsafe_view_ViewMeta(const SerializableTuple& tpl)
|
||||
: _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
|
||||
|
||||
_unsafe_view_ViewMeta(
|
||||
bool has_symbolic_inputs,
|
||||
const std::vector<c10::SymInt>& size)
|
||||
: ViewMeta(has_symbolic_inputs), size(size) {}
|
||||
|
||||
Tensor forward(const Tensor& base) override;
|
||||
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
|
||||
|
||||
SerializableTuple to_serializable_tuple() {
|
||||
return std::make_tuple(has_symbolic_inputs, size);
|
||||
}
|
||||
|
||||
std::vector<c10::SymInt> size;
|
||||
};
|
||||
|
||||
} // namespace at::functionalization
|
||||
@ -1,32 +1,22 @@
|
||||
#include <ATen/core/PythonOpRegistrationTrampoline.h>
|
||||
#include <c10/core/impl/PyInterpreterHooks.h>
|
||||
|
||||
// TODO: delete this
|
||||
namespace at::impl {
|
||||
|
||||
// The strategy is that all python interpreters attempt to register themselves
|
||||
// as the main interpreter, but only one wins. Only that interpreter is
|
||||
// allowed to interact with the C++ dispatcher. Furthermore, when we execute
|
||||
// logic on that interpreter, we do so hermetically, never setting pyobj field
|
||||
// on Tensor.
|
||||
|
||||
std::atomic<c10::impl::PyInterpreter*>
|
||||
PythonOpRegistrationTrampoline::interpreter_{nullptr};
|
||||
c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::interpreter_ = nullptr;
|
||||
|
||||
c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() {
|
||||
return PythonOpRegistrationTrampoline::interpreter_.load();
|
||||
return c10::impl::getGlobalPyInterpreter();
|
||||
}
|
||||
|
||||
bool PythonOpRegistrationTrampoline::registerInterpreter(
|
||||
c10::impl::PyInterpreter* interp) {
|
||||
c10::impl::PyInterpreter* expected = nullptr;
|
||||
interpreter_.compare_exchange_strong(expected, interp);
|
||||
if (expected != nullptr) {
|
||||
// This is the second (or later) Python interpreter, which means we need
|
||||
// non-trivial hermetic PyObject TLS
|
||||
c10::impl::HermeticPyObjectTLS::init_state();
|
||||
if (interpreter_ != nullptr) {
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
interpreter_ = interp;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace at::impl
|
||||
|
||||
@ -2,19 +2,21 @@
|
||||
|
||||
#include <ATen/core/dispatch/Dispatcher.h>
|
||||
|
||||
// TODO: this can probably live in c10
|
||||
// TODO: We can get rid of this
|
||||
|
||||
|
||||
namespace at::impl {
|
||||
|
||||
// Manages the single Python interpreter instance for PyTorch.
|
||||
class TORCH_API PythonOpRegistrationTrampoline final {
|
||||
static std::atomic<c10::impl::PyInterpreter*> interpreter_;
|
||||
static c10::impl::PyInterpreter* interpreter_;
|
||||
|
||||
public:
|
||||
// Returns true if you successfully registered yourself (that means
|
||||
// you are in the hot seat for doing the operator registrations!)
|
||||
// Register the Python interpreter. Returns true on first registration,
|
||||
// false if an interpreter was already registered.
|
||||
static bool registerInterpreter(c10::impl::PyInterpreter*);
|
||||
|
||||
// Returns the registered interpreter via the global PyInterpreter hooks.
|
||||
// Returns nullptr if no interpreter has been registered yet.
|
||||
static c10::impl::PyInterpreter* getInterpreter();
|
||||
};
|
||||
|
||||
@ -149,5 +149,105 @@ static inline void pack_vnni4(
|
||||
#endif
|
||||
}
|
||||
|
||||
// This is a helper function for transpose_pack_vnni4
|
||||
// Transform a [4, 16] block (with incontiguous output)
|
||||
// Src:
|
||||
// a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16
|
||||
// b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 b16
|
||||
// c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16
|
||||
// d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 d16
|
||||
// Dst:
|
||||
// a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4
|
||||
// a5 a6 a7 a8 b5 b6 b7 b8 c5 c6 c7 c8 d5 d6 d7 d8
|
||||
// a9 a10 a11 a12 b9 b10 b11 b12 c9 c10 c11 c12 d9 d10 d11 d12
|
||||
// a13 a14 a15 a16 b13 b14 b15 b16 c13 c14 c15 c16 d13 d14 d15 d16
|
||||
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
|
||||
static inline void transpose_vnni4_pad_4x16_block(
|
||||
const scalar_t* src,
|
||||
scalar_t* dst,
|
||||
int64_t ld_src,
|
||||
int64_t ld_dst,
|
||||
int krem = 4) {
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
__m128i r[4];
|
||||
for (int i = 0; i < krem; ++i) {
|
||||
r[i] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * ld_src));
|
||||
}
|
||||
for (int i = krem; i < 4; ++i) {
|
||||
r[i] = _mm_setzero_si128();
|
||||
}
|
||||
|
||||
// Transpose 4x16 bytes using unpack and shuffle
|
||||
__m128i t0 = _mm_unpacklo_epi32(r[0], r[1]);
|
||||
__m128i t1 = _mm_unpackhi_epi32(r[0], r[1]);
|
||||
__m128i t2 = _mm_unpacklo_epi32(r[2], r[3]);
|
||||
__m128i t3 = _mm_unpackhi_epi32(r[2], r[3]);
|
||||
|
||||
__m128i r0 = _mm_unpacklo_epi64(t0, t2);
|
||||
__m128i r1 = _mm_unpackhi_epi64(t0, t2);
|
||||
__m128i r2 = _mm_unpacklo_epi64(t1, t3);
|
||||
__m128i r3 = _mm_unpackhi_epi64(t1, t3);
|
||||
|
||||
// Store output
|
||||
if (krem == 4) {
|
||||
// normal case
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), r0);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst), r1);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 2), r2);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 3), r3);
|
||||
} else {
|
||||
// masked case
|
||||
__mmask16 mask = (1ULL << (krem * 4)) - 1;
|
||||
_mm_mask_storeu_epi8(dst, mask, r0);
|
||||
_mm_mask_storeu_epi8(reinterpret_cast<__m128i*>(dst + ld_dst), mask, r1);
|
||||
_mm_mask_storeu_epi8(
|
||||
reinterpret_cast<__m128i*>(dst + ld_dst * 2), mask, r2);
|
||||
_mm_mask_storeu_epi8(
|
||||
reinterpret_cast<__m128i*>(dst + ld_dst * 3), mask, r3);
|
||||
}
|
||||
#else
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"transpose_vnni4_pad_4x16_block is only supported when AVX-512 is supported")
|
||||
#endif
|
||||
}
|
||||
|
||||
// Do the transpose packing fusion with VNNI4
|
||||
// Reorder [K, N] → [N/4, K, 4] (VNNI4-style layout for bit8)
|
||||
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
|
||||
static inline void transpose_pack_vnni4(
|
||||
const scalar_t* src,
|
||||
scalar_t* dst,
|
||||
int64_t ld_src,
|
||||
int64_t K,
|
||||
int64_t N) {
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
TORCH_CHECK(
|
||||
N % 16 == 0, "N needs to be multiple of 16 for transpose_pack_vnni4");
|
||||
int64_t bk = 0;
|
||||
int64_t _K = K / 4 * 4;
|
||||
for (; bk < _K; bk += 4) {
|
||||
int64_t bn = 0;
|
||||
for (; bn < N; bn += 16) {
|
||||
transpose_vnni4_pad_4x16_block(
|
||||
src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle leftover K rows (< 4)
|
||||
if (K % 4 != 0) {
|
||||
int krem = K - bk;
|
||||
int64_t bn = 0;
|
||||
for (; bn < N; bn += 16) {
|
||||
transpose_vnni4_pad_4x16_block(
|
||||
src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4, krem);
|
||||
}
|
||||
}
|
||||
#else
|
||||
TORCH_CHECK(
|
||||
false, "transpose_pack_vnni4 is only supported when AVX-512 is supported")
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
||||
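For readers checking the intrinsics above, a plain scalar reference of the [K, N] → [N/4, K, 4] reorder may help. This is an assumed element-wise equivalent of what transpose_pack_vnni4 computes, not the production kernel:

#include <cstdint>

// Scalar reference for the VNNI4 transpose-pack: src is [K, N] with row
// stride ld_src, dst is laid out as [N/4, K, 4]. N must be a multiple of 16
// in the vectorized kernel; a multiple of 4 suffices for this reference.
template <typename T>
void transpose_pack_vnni4_ref(const T* src, T* dst, int64_t ld_src, int64_t K, int64_t N) {
  for (int64_t k = 0; k < K; ++k) {
    for (int64_t n = 0; n < N; ++n) {
      dst[(n / 4) * K * 4 + k * 4 + (n % 4)] = src[k * ld_src + n];
    }
  }
}

Comparing a reference like this against the AVX-512 path over random int8 data is a cheap way to validate the masked handling of the K remainder.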
@ -281,6 +281,9 @@ bool CUDAHooks::compiledWithMIOpen() const {
|
||||
|
||||
bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
|
||||
#if AT_CUDNN_ENABLED()
|
||||
if (!hasCUDA()) {
|
||||
return false;
|
||||
}
|
||||
// NOTE: extra parenthesis around numbers disable clang warnings about
|
||||
// dead code
|
||||
return true;
|
||||
@ -291,6 +294,9 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
|
||||
|
||||
bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const {
|
||||
#if AT_CUDNN_ENABLED()
|
||||
if (!hasCUDA()) {
|
||||
return false;
|
||||
}
|
||||
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
|
||||
// Check for Volta cores
|
||||
if (prop->major >= 7) {
|
||||
@ -305,6 +311,9 @@ bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const {
|
||||
|
||||
bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
|
||||
#if AT_CUDNN_ENABLED()
|
||||
if (!hasCUDA()) {
|
||||
return false;
|
||||
}
|
||||
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
|
||||
// Check for Volta cores
|
||||
if (prop->major >= 8) {
|
||||
|
||||
@ -465,8 +465,11 @@ inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor
|
||||
return false;
|
||||
}
|
||||
|
||||
auto fmt = input.suggest_memory_format();
|
||||
return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d;
|
||||
auto is_channel_last = [](const at::Tensor& t) {
|
||||
auto fmt = t.suggest_memory_format();
|
||||
return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d;
|
||||
};
|
||||
return is_channel_last(input) || is_channel_last(weight);
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
@ -32,10 +32,6 @@
|
||||
#include <ATen/native/mkldnn/Utils.h>
|
||||
#endif
|
||||
|
||||
#ifdef USE_MPS
|
||||
#include <ATen/mps/MPSDevice.h>
|
||||
#endif
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
@ -410,11 +406,23 @@ struct ConvParams {
|
||||
// cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
|
||||
// that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
|
||||
#if !defined(C10_MOBILE)
|
||||
if (!detail::getCUDAHooks().compiledWithCuDNN()) {
|
||||
if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
|
||||
return false;
|
||||
}
|
||||
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
|
||||
// broken on cuDNN 9.8
|
||||
if (cudnn_version >= 90800) {
|
||||
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
|
||||
(input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
|
||||
weight.dim() == 5) {
|
||||
for (int i = 2; i < weight.dim(); i++) {
|
||||
if (weight.size(i) != 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (needs_64bit_indexing_no_split(input, weight)) {
|
||||
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
|
||||
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
|
||||
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
|
||||
" if the V8 API is not enabled or before cuDNN version 9.3+."
|
||||
@ -422,9 +430,6 @@ struct ConvParams {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!input.is_cuda() || !cudnn_enabled) {
|
||||
return false;
|
||||
}
|
||||
if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) {
|
||||
if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) {
|
||||
return false;
|
||||
@ -443,16 +448,19 @@ struct ConvParams {
|
||||
|
||||
// Use cudnn for FP16 depthwise convolutions
|
||||
bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const {
|
||||
if (!detail::getCUDAHooks().compiledWithCuDNN()) {
|
||||
if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) {
|
||||
return false;
|
||||
}
|
||||
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) {
|
||||
// always use cudnn_depthwise for channels_last format
|
||||
return true;
|
||||
}
|
||||
// native kernel doesn't support 64-bit non-splittable case
|
||||
if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
|
||||
if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
|
||||
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
|
||||
// TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
|
||||
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
|
||||
if (cudnn_version < 0 || cudnn_version > 91000) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
|
||||
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
|
||||
" if the V8 API is not enabled or before cuDNN version 9.3+."
|
||||
@ -462,6 +470,10 @@ struct ConvParams {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
|
||||
// always use cudnn_depthwise for channels_last format
|
||||
return true;
|
||||
}
|
||||
if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) {
|
||||
bool kernel_cond = (use_cudnn(input, weight) &&
|
||||
input.scalar_type() == kHalf && // only for FP16
|
||||
@ -1429,12 +1441,8 @@ static inline at::MemoryFormat determine_backend_memory_format(
|
||||
}
|
||||
break;
|
||||
case ConvBackend::Mps:
|
||||
case ConvBackend::MpsTranspose:
|
||||
if (mps_conv_use_channels_last(input, weight)) {
|
||||
#ifdef USE_MPS
|
||||
if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) {
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
backend_memory_format = (k == 5) ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast;
|
||||
}
|
||||
break;
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
#include <ATen/native/TransposeType.h>
|
||||
#include <ATen/native/Unfold3d.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/safe_numerics.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
@ -174,6 +175,23 @@ static inline void slow_conv3d_shape_check(
|
||||
const int64_t input_height = input.size(dim_height);
|
||||
const int64_t input_width = input.size(dim_width);
|
||||
|
||||
constexpr int64_t MAX_SAFE_PAD = (1LL << 61);
|
||||
|
||||
TORCH_CHECK_VALUE(
|
||||
pad_height <= MAX_SAFE_PAD,
|
||||
"Padding height too large: pad_height=",
|
||||
pad_height);
|
||||
|
||||
TORCH_CHECK_VALUE(
|
||||
pad_width <= MAX_SAFE_PAD,
|
||||
"Padding width too large: pad_width=",
|
||||
pad_width);
|
||||
|
||||
TORCH_CHECK_VALUE(
|
||||
pad_depth <= MAX_SAFE_PAD,
|
||||
"Padding depth too large: pad_depth=",
|
||||
pad_depth);
|
||||
|
||||
const int64_t exact_input_depth = input_depth + 2 * pad_depth;
|
||||
const int64_t exact_input_height = input_height + 2 * pad_height;
|
||||
const int64_t exact_input_width = input_width + 2 * pad_width;
|
||||
@ -221,6 +239,14 @@ static inline void slow_conv3d_shape_check(
|
||||
output_width,
|
||||
"). Output size is too small");
|
||||
|
||||
uint64_t kernel_product;
|
||||
TORCH_CHECK(
|
||||
!c10::mul_overflows(kernel_height, kernel_width, &kernel_product),
|
||||
"Kernel height x width product is too large: kernel_height=",
|
||||
kernel_height,
|
||||
", kernel_width=",
|
||||
kernel_width);
|
||||
|
||||
if (weight.defined()) {
|
||||
int64_t n_input_plane = weight.size(1);
|
||||
if (weight.dim() == 2) {
|
||||
|
||||
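The new shape checks above guard against padding values and the kernel height/width product overflowing 64-bit arithmetic. A hedged sketch of an overflow-checked multiply, using the GCC/Clang builtin rather than the c10::mul_overflows helper the diff relies on:

#include <cstdint>
#include <stdexcept>

// Multiply two non-negative kernel extents, throwing if the product would
// not fit in uint64_t (mirrors the intent of the TORCH_CHECK in the diff).
// __builtin_mul_overflow is a GCC/Clang builtin and is not portable to MSVC.
inline uint64_t checked_kernel_product(uint64_t kernel_height, uint64_t kernel_width) {
  uint64_t product = 0;
  if (__builtin_mul_overflow(kernel_height, kernel_width, &product)) {
    throw std::overflow_error("kernel_height * kernel_width overflows uint64_t");
  }
  return product;
}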
@ -23,6 +23,7 @@
|
||||
#include <ATen/ops/linspace.h>
|
||||
#endif
|
||||
|
||||
#include <cmath>
|
||||
#include <numeric>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
@ -202,6 +203,46 @@ select_outer_bin_edges(const Tensor& input, std::optional<c10::ArrayRef<double>>
|
||||
return std::make_pair(leftmost_edges, rightmost_edges);
|
||||
}
|
||||
|
||||
|
||||
/* Bin edges correction based on the precision representation.
|
||||
* To maintain the backward compatibility we take max(std::nextafter<>, +1)
|
||||
* and min(std::nextafter<>, -1) for scalar types. For other types +/- 1 as usual.
|
||||
*/
|
||||
void bins_edges_correction(const ScalarType& t, double &leftmost_edge, double &rightmost_edge)
|
||||
{
|
||||
#define UPDATE_WITH_LIMIT(real_type, scalartype) \
|
||||
case ScalarType::scalartype: \
|
||||
leftmost_edge = std::min( \
|
||||
static_cast<double>( \
|
||||
std::nexttoward( \
|
||||
static_cast<real_type>(leftmost_edge), \
|
||||
std::numeric_limits<real_type>::lowest() \
|
||||
) \
|
||||
), \
|
||||
leftmost_edge - 1. \
|
||||
); \
|
||||
rightmost_edge = std::max( \
|
||||
static_cast<double>( \
|
||||
std::nexttoward( \
|
||||
static_cast<real_type>(rightmost_edge), \
|
||||
std::numeric_limits<real_type>::max() \
|
||||
) \
|
||||
), \
|
||||
rightmost_edge + 1. \
|
||||
); \
|
||||
break;
|
||||
|
||||
switch (t) {
|
||||
UPDATE_WITH_LIMIT(double, Double)
|
||||
UPDATE_WITH_LIMIT(float, Float)
|
||||
default:
|
||||
// Fallback to the default behavior for other types
|
||||
leftmost_edge -= 1;
|
||||
rightmost_edge += 1;
|
||||
}
|
||||
#undef UPDATE_WITH_LIMIT
|
||||
}
|
||||
|
||||
/* histc's version of the logic for outermost bin edges.
|
||||
*/
|
||||
std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input,
|
||||
@ -216,8 +257,7 @@ std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input,
|
||||
}
|
||||
|
||||
if (leftmost_edge == rightmost_edge) {
|
||||
leftmost_edge -= 1;
|
||||
rightmost_edge += 1;
|
||||
bins_edges_correction(input.dtype().toScalarType(), leftmost_edge, rightmost_edge);
|
||||
}
|
||||
|
||||
TORCH_CHECK(!(std::isinf(leftmost_edge) || std::isinf(rightmost_edge) ||
|
||||
|
||||
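When all inputs are equal, the histogram code above widens the degenerate [edge, edge] range; for float/double inputs it steps by at least one representable value and at least 1.0 in each direction. A small standalone sketch of that correction for float inputs, assumed to mirror the UPDATE_WITH_LIMIT macro above:

#include <algorithm>
#include <cmath>
#include <limits>
#include <utility>

// Widen a degenerate bin range the way bins_edges_correction does for float:
// take whichever of "one float ULP outward" and "one unit outward" is farther.
inline std::pair<double, double> widen_float_edges(double left, double right) {
  const double next_left = static_cast<double>(
      std::nexttoward(static_cast<float>(left), std::numeric_limits<float>::lowest()));
  const double next_right = static_cast<double>(
      std::nexttoward(static_cast<float>(right), std::numeric_limits<float>::max()));
  return {std::min(next_left, left - 1.0), std::max(next_right, right + 1.0)};
}

For small magnitudes this reduces to the old ±1 behavior; for very large float values, where one ULP exceeds 1.0, the ULP step wins and the edges stay distinct after casting back to float.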
@ -1,3 +1,5 @@
|
||||
#include <ATen/core/ATen_fwd.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/AccumulateType.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
@ -1878,19 +1880,18 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {
|
||||
|
||||
Tensor xtensor = self.expand(padded_size);
|
||||
|
||||
Tensor result;
|
||||
Tensor urtensor;
|
||||
if (self.is_quantized()) {
|
||||
result = at::empty_quantized(target_size, self);
|
||||
urtensor = at::empty_quantized(target_size, self);
|
||||
} else {
|
||||
result = at::empty(target_size, self.options());
|
||||
urtensor = at::empty(target_size, self.options());
|
||||
}
|
||||
|
||||
// return an empty tensor if one of the repeat dimensions is zero
|
||||
if (zero_tensor) {
|
||||
return result;
|
||||
return urtensor;
|
||||
}
|
||||
|
||||
Tensor urtensor = at::alias(result);
|
||||
for (const auto i : c10::irange(xtensor.dim())) {
|
||||
// can't unfold with step 0, so make sure step is at least 1
|
||||
// (it doesn't matter what it is in that case, because the size is 0).
|
||||
@ -1900,7 +1901,22 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {
|
||||
|
||||
urtensor.copy_(xtensor.expand_as(urtensor));
|
||||
|
||||
return result;
|
||||
// Combine the dimensions to produce the target_size.
|
||||
// xtensor dims: [a0, ..., ad-1]
|
||||
// urtensor dims: [a0, ..., ad-1, b0, ..., bd-1]
|
||||
// b dims are produced by unfold.
|
||||
// Transform urtensor to [a0 * b0, ..., ad-1 * bd-1]
|
||||
const int64_t n_dims = xtensor.dim();
|
||||
auto range_a = at::arange(xtensor.dim(), at::TensorOptions(at::kLong));
|
||||
auto range_b = range_a + n_dims;
|
||||
auto stacked = stack({std::move(range_a), std::move(range_b)}, 1).flatten();
|
||||
auto permutation = IntArrayRef(stacked.data_ptr<int64_t>(), n_dims * 2);
|
||||
// Permute from [a0, ..., ad-1, b0, ..., bd-1] to [a0, b0, ..., ad-1, bd-1]
|
||||
urtensor = urtensor.permute(permutation);
|
||||
// Reshape from [a0, b0, ..., ad-1, bd-1] to [a0 * b0, ..., ad-1 * bd-1]
|
||||
urtensor = urtensor.reshape(target_size);
|
||||
|
||||
return urtensor;
|
||||
}
|
||||
|
||||
Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) {
|
||||
|
||||
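The rewritten repeat() above interleaves the original dims with the unfold-produced dims before a single reshape to target_size. A small sketch of the permutation that the arange/stack/flatten construct builds, written in plain C++ independent of ATen:

#include <cstdint>
#include <cstdio>
#include <vector>

// For n_dims = d, urtensor has dims [a0, ..., ad-1, b0, ..., bd-1];
// the permutation interleaves them to [a0, b0, ..., ad-1, bd-1],
// so one reshape can merge each (ai, bi) pair into ai * bi.
std::vector<int64_t> interleaved_permutation(int64_t n_dims) {
  std::vector<int64_t> perm;
  perm.reserve(2 * n_dims);
  for (int64_t i = 0; i < n_dims; ++i) {
    perm.push_back(i);           // ai
    perm.push_back(i + n_dims);  // bi
  }
  return perm;
}

int main() {
  for (int64_t p : interleaved_permutation(3)) {
    std::printf("%lld ", static_cast<long long>(p));  // prints: 0 3 1 4 2 5
  }
  std::printf("\n");
}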
@ -42,6 +42,19 @@ void bfloat16_copy_kernel_cuda(TensorIteratorBase &iter) {
|
||||
});
|
||||
}
|
||||
|
||||
#ifdef USE_ROCM
|
||||
void bfloat16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) {
|
||||
gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::BFloat16 value) {
|
||||
return static_cast<float>(value);
|
||||
});
|
||||
}
|
||||
void float16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) {
|
||||
gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::Half value) {
|
||||
return static_cast<float>(value);
|
||||
});
|
||||
}
|
||||
#endif
|
||||
|
||||
void float8_copy_kernel_cuda(TensorIteratorBase &iter) {
|
||||
ScalarType dtype = iter.dtype(0);
|
||||
ScalarType other_dtype = iter.dtype(1);
|
||||
@ -187,7 +200,17 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) {
|
||||
} else {
|
||||
float16_copy_kernel_cuda(iter);
|
||||
}
|
||||
} else if (isBitsType(dtype)) {
|
||||
}
|
||||
#ifdef USE_ROCM
|
||||
else if ((iter.dtype(1) == kBFloat16 || iter.dtype(1) == kHalf) && dtype == kFloat) {
|
||||
if (iter.dtype(1) == kBFloat16) {
|
||||
bfloat16tofloat32_copy_kernel_cuda(iter);
|
||||
} else {
|
||||
float16tofloat32_copy_kernel_cuda(iter);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
else if (isBitsType(dtype)) {
|
||||
TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting "
|
||||
"bits types to different bits types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype);
|
||||
AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] {
|
||||
|
||||
@ -223,9 +223,6 @@ void grid_sampler_single_element(
|
||||
auto input_size = input_sizes[input_dim];
|
||||
auto coord = static_cast<opmath_t<T>>(coords[coord_dim]);
|
||||
|
||||
// Interpret nan as -1
|
||||
coord = isnan(coord) ? -1 : coord;
|
||||
|
||||
if (!align_corners) {
|
||||
// Map unaligned grid space to aligned grid space
|
||||
auto corner_alignment_factor = static_cast<opmath_t<T>>(input_size) /
|
||||
|
||||
@ -52,9 +52,7 @@ static void fill_depthwise_conv_desc(MPSGraphDepthwiseConvolution3DOpDescriptor*
|
||||
NSUInteger dilationRateInX,
|
||||
NSUInteger dilationRateInY,
|
||||
NSUInteger paddingHorizontal,
|
||||
NSUInteger paddingVertical,
|
||||
c10::MemoryFormat memory_format,
|
||||
NSUInteger groups) {
|
||||
NSUInteger paddingVertical) {
|
||||
descriptor_.strides =
|
||||
@[ @1, [[NSNumber alloc] initWithInteger:strideInY], [[NSNumber alloc] initWithInteger:strideInX] ];
|
||||
descriptor_.dilationRates =
|
||||
@ -103,7 +101,7 @@ static void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_,
|
||||
descriptor_.groups = groups;
|
||||
}
|
||||
|
||||
static Tensor _mps_convolution_impl(const Tensor& input_t_,
|
||||
static Tensor _mps_convolution_impl(const Tensor& input_t,
|
||||
const Tensor& weight_t,
|
||||
const std::optional<Tensor>& bias_opt,
|
||||
IntArrayRef padding,
|
||||
@ -111,12 +109,15 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
|
||||
IntArrayRef dilation,
|
||||
int64_t groups,
|
||||
std::optional<IntArrayRef> input_shape) {
|
||||
const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
|
||||
Tensor input_t = input_t_;
|
||||
bool is3DConv = input_t.dim() == 5;
|
||||
if (!is_macOS_15_0_or_newer || is3DConv) {
|
||||
input_t = input_t.contiguous();
|
||||
}
|
||||
constexpr auto kChannelsLast = MemoryFormat::ChannelsLast;
|
||||
constexpr auto kContiguous = MemoryFormat::Contiguous;
|
||||
const bool is_macos_15_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
|
||||
|
||||
const bool is3DConv = input_t.dim() == 5;
|
||||
const auto memory_format = input_t.suggest_memory_format();
|
||||
const auto input_suggested_layout = memory_format == kChannelsLast && is_macos_15_plus ? kChannelsLast : kContiguous;
|
||||
const bool is_channels_last = mps_conv_use_channels_last(input_t, weight_t) && !is3DConv;
|
||||
const bool bias_defined = bias_opt ? bias_opt->defined() : false;
|
||||
|
||||
TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types");
|
||||
|
||||
@ -126,15 +127,6 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
|
||||
checkAllSameType(c, {input, weight});
|
||||
checkAllSameGPU(c, {input, weight});
|
||||
|
||||
bool bias_defined;
|
||||
|
||||
if (bias_opt == std::nullopt)
|
||||
bias_defined = false;
|
||||
else
|
||||
bias_defined = bias_opt->defined();
|
||||
|
||||
auto memory_format = input_t.suggest_memory_format();
|
||||
bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast) && !is3DConv;
|
||||
auto output_t =
|
||||
at::empty(input_shape.has_value() ? input_shape.value()
|
||||
: conv_output_size(input->sizes(), weight->sizes(), padding, stride, dilation),
|
||||
@ -142,12 +134,18 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
|
||||
std::nullopt,
|
||||
kMPS,
|
||||
std::nullopt,
|
||||
is_macOS_15_0_or_newer ? memory_format : MemoryFormat::Contiguous);
|
||||
is_channels_last ? kChannelsLast : kContiguous);
|
||||
if (output_t.numel() == 0) {
|
||||
return output_t;
|
||||
}
|
||||
TensorArg output{output_t, "result", 0};
|
||||
|
||||
// TODO: Remove me when MacOS-14 is no longer supported
|
||||
std::optional<Tensor> output_c;
|
||||
if (!is_macos_15_plus && is_channels_last) {
|
||||
output_c = at::empty_like(output_t, output_t.options().memory_format(kContiguous));
|
||||
}
|
||||
|
||||
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_1_PLUS)) {
|
||||
// On macOS < 15.1, MPS convolution kernel does not support output channels > 2^16
|
||||
for (auto elem : output_t.sizes()) {
|
||||
@ -186,32 +184,22 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
|
||||
getArrayRefString(dilation),
|
||||
getArrayRefString(padding),
|
||||
groups,
|
||||
is_channels_last,
|
||||
input_suggested_layout == kChannelsLast,
|
||||
mps::getTensorsStringKey({input_t, weight_t}),
|
||||
bias_defined,
|
||||
bias_shape_key);
|
||||
|
||||
MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
|
||||
MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);
|
||||
MPSNDArray* inputNDArray = nil;
|
||||
MPSNDArray* outputNDArray = nil;
|
||||
|
||||
if (input_t.is_contiguous(memory_format) && output_t.is_contiguous(memory_format) && is_macOS_15_0_or_newer) {
|
||||
inputNDArray = getMPSNDArray(input_t, inputShape);
|
||||
outputNDArray = getMPSNDArray(output_t, outputShape);
|
||||
}
|
||||
|
||||
auto inputShape = mps::getMPSShape(input_t, input_suggested_layout);
|
||||
auto outputShape = mps::getMPSShape(output_t, input_suggested_layout);
|
||||
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
|
||||
MPSShape* weightShape = mps::getMPSShape(weight_t);
|
||||
bool isDepthwiseConv = ((groups > 1 && (weightShape[1].intValue == 1)) && inputShape.count >= 4 &&
|
||||
weightShape.count >= 4 && !is_channels_last);
|
||||
bool isDepthwiseConv =
|
||||
(groups > 1 && weight_t.size(1) == 1) && input_t.dim() >= 4 && weight_t.dim() >= 4 && !is_channels_last;
|
||||
|
||||
MPSGraphTensor* inputTensor =
|
||||
mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(input_t.scalar_type()), inputShape);
|
||||
MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
|
||||
MPSGraphTensor* outputTensor;
|
||||
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(input_t), inputShape);
|
||||
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
|
||||
MPSGraphTensor* outputTensor = nil;
|
||||
if (is3DConv) {
|
||||
MPSGraphConvolution3DOpDescriptor* conv3dDescriptor_ = [[MPSGraphConvolution3DOpDescriptor new] autorelease];
|
||||
auto conv3dDescriptor_ = [[MPSGraphConvolution3DOpDescriptor new] autorelease];
|
||||
fill_conv3d_desc(conv3dDescriptor_,
|
||||
stride[2],
|
||||
stride[1],
|
||||
@ -229,17 +217,9 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
|
||||
descriptor:conv3dDescriptor_
|
||||
name:nil];
|
||||
} else if (isDepthwiseConv) {
|
||||
MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ =
|
||||
[[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
|
||||
fill_depthwise_conv_desc(depthWiseConv3dDescriptor_,
|
||||
stride[1],
|
||||
stride[0],
|
||||
dilation[1],
|
||||
dilation[0],
|
||||
padding[1],
|
||||
padding[0],
|
||||
memory_format,
|
||||
groups);
|
||||
auto depthWiseConv3dDescriptor_ = [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
|
||||
fill_depthwise_conv_desc(
|
||||
depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]);
|
||||
|
||||
MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor
|
||||
dimension:-3
|
||||
@ -258,7 +238,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
|
||||
dilation[0],
|
||||
padding[1],
|
||||
padding[0],
|
||||
memory_format,
|
||||
input_suggested_layout,
|
||||
groups);
|
||||
|
||||
outputTensor = [mpsGraph convolution2DWithSourceTensor:inputTensor
|
||||
@ -270,13 +250,6 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
|
||||
MPSGraphTensor* biasTensor = nil;
|
||||
if (bias_defined) {
|
||||
biasTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(bias_opt.value()));
|
||||
}
|
||||
|
||||
if (is_channels_last && !is_macOS_15_0_or_newer) {
|
||||
outputTensor = mps::convertNHWCtoNCHW(mpsGraph, outputTensor);
|
||||
}
|
||||
|
||||
if (bias_defined) {
|
||||
outputTensor = [mpsGraph additionWithPrimaryTensor:outputTensor secondaryTensor:biasTensor name:nil];
|
||||
}
|
||||
newCachedGraph->inputTensor_ = inputTensor;
|
||||
@ -285,27 +258,26 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
|
||||
newCachedGraph->outputTensor_ = outputTensor;
|
||||
});
|
||||
|
||||
auto inputPlaceholder = inputNDArray ? Placeholder(cachedGraph->inputTensor_, inputNDArray)
|
||||
: Placeholder(cachedGraph->inputTensor_, input_t, inputShape);
|
||||
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
|
||||
auto inputPlaceholder = input_suggested_layout == kContiguous
|
||||
? Placeholder(cachedGraph->inputTensor_, output_c || is3DConv ? input_t.contiguous() : input_t)
|
||||
: Placeholder(cachedGraph->inputTensor_, getMPSNDArray(input_t, inputShape));
|
||||
auto outputPlaceholder = input_suggested_layout == kContiguous
|
||||
? Placeholder(cachedGraph->outputTensor_, output_c ? *output_c : output_t)
|
||||
: Placeholder(cachedGraph->outputTensor_, getMPSNDArray(output_t, outputShape));
|
||||
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, output_c ? weight_t.contiguous() : weight_t);
|
||||
auto biasPlaceholder = Placeholder();
|
||||
// Reshape the bias to be broadcastable with output of conv2d or conv3d
|
||||
if (bias_defined) {
|
||||
if (is3DConv) {
|
||||
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, bias_shape[0], 1, 1, 1}));
|
||||
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, bias_shape[0], 1, 1, 1}));
|
||||
} else if (input_suggested_layout == kChannelsLast) {
|
||||
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, 1, 1, bias_shape[0]}));
|
||||
} else {
|
||||
if (is_channels_last && is_macOS_15_0_or_newer) {
|
||||
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, 1, 1, bias_shape[0]}));
|
||||
} else {
|
||||
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, bias_shape[0], 1, 1}));
|
||||
}
|
||||
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, bias_shape[0], 1, 1}));
|
||||
}
|
||||
}
|
||||
auto outputPlaceholder = outputNDArray ? Placeholder(cachedGraph->outputTensor_, outputNDArray)
|
||||
: Placeholder(cachedGraph->outputTensor_, output_t);
|
||||
|
||||
NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =
|
||||
[[[NSMutableDictionary alloc] initWithCapacity:3] autorelease];
|
||||
auto feeds = [[[NSMutableDictionary alloc] initWithCapacity:3] autorelease];
|
||||
feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData();
|
||||
feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData();
|
||||
if (bias_defined) {
|
||||
@ -315,6 +287,10 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
|
||||
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
|
||||
}
|
||||
|
||||
if (output_c) {
|
||||
output_t.copy_(*output_c);
|
||||
}
|
||||
|
||||
return output_t;
|
||||
}
|
||||
|
||||
@ -351,14 +327,21 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
|
||||
TensorArg grad_output{grad_output_t, "grad_output", 1}, weight{weight_t, "weight", 2};
|
||||
checkAllSameType(c, {grad_output, weight});
|
||||
checkAllSameGPU(c, {grad_output, weight});
|
||||
auto memory_format = grad_output_t.suggest_memory_format();
|
||||
bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast) && !is3DConv;
|
||||
auto grad_input_t = at::empty(input_size, grad_output_t.options(), std::nullopt);
|
||||
constexpr auto kChannelsLast = at::MemoryFormat::ChannelsLast;
|
||||
bool is_channels_last = mps_conv_use_channels_last(grad_output_t, weight_t) && !is3DConv;
|
||||
auto grad_input_t =
|
||||
at::empty(input_size, grad_output_t.options(), is_channels_last ? std::optional(kChannelsLast) : std::nullopt);
|
||||
|
||||
// Avoid "grad_input" when this is being used as transposed convolution
|
||||
TensorArg grad_input{grad_input_t, "result", 0};
|
||||
convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups);
|
||||
|
||||
// TODO: Remove me when MacOS-14 is no longer supported
|
||||
std::optional<Tensor> grad_input_c;
|
||||
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_channels_last) {
|
||||
grad_input_c = at::empty_like(grad_input_t, grad_input_t.options().memory_format(MemoryFormat::Contiguous));
|
||||
}
|
||||
|
||||
// Derive from MPSCachedGraph
|
||||
struct CachedGraph : public MPSCachedGraph {
|
||||
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
|
||||
@ -370,7 +353,6 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
|
||||
// Add backward with input
|
||||
@autoreleasepool {
|
||||
MPSStream* stream = getCurrentMPSStream();
|
||||
|
||||
MPSShape* mps_input_shape = getMPSShape(input_size);
|
||||
std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
|
||||
is3DConv ? "3d_" : "",
|
||||
@ -411,15 +393,8 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
|
||||
} else if (isDepthwiseConv) {
|
||||
MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ =
|
||||
[[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
|
||||
fill_depthwise_conv_desc(depthWiseConv3dDescriptor_,
|
||||
stride[1],
|
||||
stride[0],
|
||||
dilation[1],
|
||||
dilation[0],
|
||||
padding[1],
|
||||
padding[0],
|
||||
at::MemoryFormat::Contiguous,
|
||||
groups);
|
||||
fill_depthwise_conv_desc(
|
||||
depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]);
|
||||
MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor
|
||||
dimension:-3
|
||||
withDimension:-4
|
||||
@ -454,14 +429,18 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
|
||||
newCachedGraph->gradInputTensor_ = gradInputTensor;
|
||||
});
|
||||
|
||||
auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t);
|
||||
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
|
||||
auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input);
|
||||
auto gradOutputPlaceholder =
|
||||
Placeholder(cachedGraph->gradOutputTensor_, grad_input_c ? grad_output_t.contiguous() : grad_output_t);
|
||||
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, grad_input_c ? weight_t.contiguous() : weight_t);
|
||||
auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input_c ? *grad_input_c : grad_input_t);
|
||||
|
||||
auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, weightsPlaceholder);
|
||||
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
|
||||
}
|
||||
return *grad_input;
|
||||
if (grad_input_c) {
|
||||
grad_input_t.copy_(*grad_input_c);
|
||||
}
|
||||
return grad_input_t;
|
||||
}
|
||||
|
||||
static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
|
||||
@ -474,9 +453,11 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
|
||||
bool bias_defined) {
|
||||
using namespace at::native::mps;
|
||||
using namespace mps;
|
||||
bool is3DConv = input_t.dim() == 5;
|
||||
const bool is3DConv = input_t.dim() == 5;
|
||||
TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types");
|
||||
CheckedFrom c = "mps_convolution_backward_weights";
|
||||
constexpr auto kChannelsLast = at::MemoryFormat::ChannelsLast;
|
||||
bool is_channels_last = mps_conv_use_channels_last(input_t, grad_output_t) && !is3DConv;
|
||||
|
||||
// For uniformity with everything else, although it seems grad_weight
|
||||
// would be unambiguous too.
|
||||
@ -487,7 +468,8 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
|
||||
checkAllSameGPU(c, {grad_output, input});
|
||||
|
||||
auto grad_weight_t =
|
||||
at::empty(weight_size, grad_output_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
|
||||
at::empty(weight_size, grad_output_t.options(), is_channels_last ? std::optional(kChannelsLast) : std::nullopt);
|
||||
|
||||
TensorArg grad_weight{grad_weight_t, "result", 0};
|
||||
|
||||
convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
|
||||
@ -500,16 +482,23 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
|
||||
MPSGraphTensor* gradWeightTensor_ = nil;
|
||||
};
|
||||
|
||||
// TODO: Remove me when MacOS-14 is no longer supported
|
||||
std::optional<Tensor> grad_weight_c;
|
||||
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_channels_last) {
|
||||
grad_weight_c = at::empty_like(grad_weight_t, grad_weight_t.options().memory_format(MemoryFormat::Contiguous));
|
||||
}
|
||||
|
||||
@autoreleasepool {
|
||||
MPSStream* stream = getCurrentMPSStream();
|
||||
|
||||
MPSShape* mps_weight_shape = getMPSShape(weight_size);
|
||||
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
|
||||
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}:{}",
|
||||
is3DConv ? "3d_" : "",
|
||||
getArrayRefString(stride),
|
||||
getArrayRefString(dilation),
|
||||
getArrayRefString(padding),
|
||||
groups,
|
||||
is_channels_last,
|
||||
getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
|
||||
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
|
||||
MPSShape* inputShape = getMPSShape(input_t);
|
||||
@ -541,15 +530,8 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
|
||||
} else if (isDepthwiseConv) {
|
||||
MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ =
|
||||
[[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
|
||||
fill_depthwise_conv_desc(depthWiseConv3dDescriptor_,
|
||||
stride[1],
|
||||
stride[0],
|
||||
dilation[1],
|
||||
dilation[0],
|
||||
padding[1],
|
||||
padding[0],
|
||||
at::MemoryFormat::Contiguous,
|
||||
groups);
|
||||
fill_depthwise_conv_desc(
|
||||
depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]);
|
||||
NSNumber* outputFeatChannelDim = mps_weight_shape[0];
|
||||
MPSShape* weightShapeTranspose = @[ @1, outputFeatChannelDim, mps_weight_shape[2], mps_weight_shape[3] ];
|
||||
MPSGraphTensor* gradWeightTensorTranspose =
|
||||
@ -583,14 +565,19 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
|
||||
newCachedGraph->gradWeightTensor_ = gradWeightTensor;
|
||||
});
|
||||
|
||||
auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t);
|
||||
auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t);
|
||||
auto outputPlaceholder = Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t);
|
||||
auto gradOutputPlaceholder =
|
||||
Placeholder(cachedGraph->gradOutputTensor_, grad_weight_c ? grad_output_t.contiguous() : grad_output_t);
|
||||
auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, grad_weight_c ? input_t.contiguous() : input_t);
|
||||
auto outputPlaceholder =
|
||||
Placeholder(cachedGraph->gradWeightTensor_, grad_weight_c ? *grad_weight_c : grad_weight_t);
|
||||
|
||||
auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder);
|
||||
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
|
||||
}
|
||||
|
||||
if (grad_weight_c) {
|
||||
grad_weight_t.copy_(*grad_weight_c);
|
||||
}
|
||||
return grad_weight_t;
|
||||
}
|
||||
|
||||
|
||||
@ -9,11 +9,22 @@
|
||||
#else
|
||||
#include <ATen/ops/_unique2.h>
|
||||
#include <ATen/ops/_unique2_native.h>
|
||||
#include <ATen/ops/arange.h>
|
||||
#include <ATen/ops/argsort.h>
|
||||
#include <ATen/ops/cat.h>
|
||||
#include <ATen/ops/cumsum.h>
|
||||
#include <ATen/ops/full.h>
|
||||
#include <ATen/ops/masked_select.h>
|
||||
#include <ATen/ops/nonzero.h>
|
||||
#include <ATen/ops/ones.h>
|
||||
#include <ATen/ops/ones_like.h>
|
||||
#include <ATen/ops/slice.h>
|
||||
#include <ATen/ops/unique_consecutive.h>
|
||||
#include <ATen/ops/unique_consecutive_native.h>
|
||||
#include <ATen/ops/unique_dim_consecutive.h>
|
||||
#include <ATen/ops/unique_dim_consecutive_native.h>
|
||||
#include <ATen/ops/unique_dim_native.h>
|
||||
#include <ATen/ops/zeros.h>
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
@ -305,4 +316,85 @@ std::tuple<Tensor, Tensor, Tensor> _unique2_mps(const Tensor& self,
|
||||
return _unique_impl_mps(self, return_inverse, return_counts, false, std::nullopt);
|
||||
}
|
||||
|
||||
static Tensor lexsort_rows_perm_mps(const Tensor& mat_2d) {
|
||||
const auto rows = mat_2d.size(0), cols = mat_2d.size(1);
|
||||
if (rows <= 1 || cols == 0) {
|
||||
return arange(rows, mat_2d.options().dtype(kLong));
|
||||
}
|
||||
|
||||
auto perm = arange(rows, mat_2d.options().dtype(kLong));
|
||||
for (auto c = cols - 1; c >= 0; --c) {
|
||||
auto keys = mat_2d.select(1, c).index_select(0, perm);
|
||||
const auto idx = argsort(keys, /*dim=*/0, /*descending=*/false);
|
||||
perm = perm.index_select(0, idx);
|
||||
}
|
||||
return perm;
|
||||
}
|
||||
|
||||
static std::tuple<Tensor, Tensor, Tensor> unique_dim_sorted_mps_impl(const Tensor& self,
|
||||
int64_t dim,
|
||||
bool return_inverse,
|
||||
bool return_counts) {
|
||||
dim = maybe_wrap_dim(dim, self.dim());
|
||||
|
||||
auto sizes = self.sizes().vec();
|
||||
auto num_zero_dims = std::count(sizes.begin(), sizes.end(), (int64_t)0);
|
||||
if (self.size(dim) == 0) {
|
||||
auto output = at::empty(sizes, self.options());
|
||||
auto inverse_indices = at::empty({0}, self.options().dtype(kLong));
|
||||
auto counts = at::empty({0}, self.options().dtype(kLong));
|
||||
return {output, inverse_indices, counts};
|
||||
}
|
||||
|
||||
auto transposed = self.moveaxis(dim, 0);
|
||||
auto orig_sizes = transposed.sizes().vec();
|
||||
auto rows = transposed.size(0);
|
||||
auto input_flat = transposed.contiguous().view({rows, -1});
|
||||
|
||||
auto perm = lexsort_rows_perm_mps(input_flat);
|
||||
auto input_sorted = input_flat.index_select(0, perm);
|
||||
|
||||
Tensor is_unique = at::zeros({rows}, self.options().dtype(kBool));
|
||||
if (rows > 0) {
|
||||
is_unique.narrow(0, 0, 1).fill_(true);
|
||||
}
|
||||
if (rows > 1) {
|
||||
auto a = input_sorted.narrow(0, 1, rows - 1);
|
||||
auto b = input_sorted.narrow(0, 0, rows - 1);
|
||||
auto row_changed = a.ne(b).any(1);
|
||||
is_unique.narrow(0, 1, rows - 1).copy_(row_changed);
|
||||
}
|
||||
|
||||
auto unique_pos = nonzero(is_unique).squeeze(1);
|
||||
auto group_id = cumsum(is_unique.to(kLong), 0).sub(1);
|
||||
|
||||
auto unique_rows_2d = input_sorted.index_select(0, unique_pos);
|
||||
|
||||
Tensor inverse_indices = empty({0}, self.options().dtype(kLong));
|
||||
if (return_inverse) {
|
||||
inverse_indices = empty({rows}, self.options().dtype(kLong));
|
||||
inverse_indices.index_copy_(0, perm, group_id);
|
||||
}
|
||||
|
||||
Tensor counts = empty({0}, self.options().dtype(kLong));
|
||||
if (return_counts) {
|
||||
const auto num_unique = unique_pos.size(0);
|
||||
counts = zeros({num_unique}, self.options().dtype(kLong));
|
||||
counts.scatter_add_(0, group_id, ones_like(group_id, group_id.options().dtype(kLong)));
|
||||
}
|
||||
|
||||
orig_sizes[0] = unique_rows_2d.size(0);
|
||||
auto output = unique_rows_2d.view(orig_sizes).moveaxis(0, dim);
|
||||
|
||||
return std::make_tuple(std::move(output), std::move(inverse_indices), std::move(counts));
|
||||
}
|
||||
|
||||
std::tuple<Tensor, Tensor, Tensor> unique_dim_mps(const Tensor& self,
|
||||
int64_t dim,
|
||||
const bool /*sorted*/,
|
||||
const bool return_inverse,
|
||||
const bool return_counts) {
|
||||
return unique_dim_sorted_mps_impl(self, dim, return_inverse, return_counts);
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
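unique_dim_sorted_mps_impl above orders rows lexicographically by running a stable per-column sort from the last column to the first. A standalone sketch of the same idea on a row-major int matrix (an assumed equivalent of the MPS path, which relies on argsort being stable):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Returns a permutation that orders the rows of a row-major [rows, cols]
// matrix lexicographically, by stably sorting on one column at a time,
// starting from the least-significant (last) column.
std::vector<int64_t> lexsort_rows(const std::vector<int>& mat, int64_t rows, int64_t cols) {
  std::vector<int64_t> perm(rows);
  std::iota(perm.begin(), perm.end(), 0);
  for (int64_t c = cols - 1; c >= 0; --c) {
    std::stable_sort(perm.begin(), perm.end(), [&](int64_t i, int64_t j) {
      return mat[i * cols + c] < mat[j * cols + c];
    });
  }
  return perm;
}

Once rows are in lexicographic order, a row differing from its predecessor marks the start of a new group, which is exactly what the is_unique/cumsum bookkeeping above exploits.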
@@ -1409,7 +1409,7 @@
- func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
  variants: function
  dispatch:
    SparseCPU, SparseCUDA: sparse_broadcast_to
    SparseCPU, SparseCUDA, SparseMPS: sparse_broadcast_to

- func: cat(Tensor[] tensors, int dim=0) -> Tensor
  structured_delegate: cat.out

@@ -6450,6 +6450,7 @@
  dispatch:
    CPU: unique_dim_cpu
    CUDA: unique_dim_cuda
    MPS: unique_dim_mps
  tags: dynamic_output_shape
  autogen: unique_dim.out
@ -158,12 +158,46 @@ c10::intrusive_ptr<EmbeddingPackedParamsBase> PackedEmbeddingBagWeight::prepack(
|
||||
return packed_ptr;
|
||||
}
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
namespace {
|
||||
/// Number of columns in the rowwise min/max buffer passed to the quantization function(s)
|
||||
constexpr int kRowwiseMinMaxNumCols = 2;
|
||||
|
||||
bool _validate_rowwise_min_max(
|
||||
const at::Tensor& weight,
|
||||
const std::optional<at::Tensor>& rowwise_min_max_opt) {
|
||||
const auto is_valid_rowwise_min_max = rowwise_min_max_opt.has_value();
|
||||
|
||||
if (is_valid_rowwise_min_max) {
|
||||
TORCH_CHECK(
|
||||
(rowwise_min_max_opt->dim() == 2 &&
|
||||
rowwise_min_max_opt->size(0) == weight.size(0) &&
|
||||
rowwise_min_max_opt->size(1) == kRowwiseMinMaxNumCols),
|
||||
"'rowwise_min_max' must be a 2D tensor with shape [num_rows(weight), 2].");
|
||||
}
|
||||
|
||||
return is_valid_rowwise_min_max;
|
||||
}
|
||||
|
||||
auto _get_rowwise_min_max_contig(
|
||||
const std::optional<at::Tensor>& rowwise_min_max_opt) {
|
||||
return rowwise_min_max_opt.has_value()
|
||||
? rowwise_min_max_opt->expect_contiguous(rowwise_min_max_opt->suggest_memory_format())
|
||||
: at::borrow_from_optional_tensor(rowwise_min_max_opt);
|
||||
}
|
||||
}
|
||||
#endif // USE_FBGEMM
|
||||
|
||||
namespace at::native {
|
||||
|
||||
// Note - This is a temporary pack function for embedding bag which quantizes
|
||||
// and packs the float weight tensor. In the next step it will be replaced by a
|
||||
// quantize and pack function once we support FP scale and FP zero_point
|
||||
//
|
||||
// The optional rowwise_min_max argument is to support callers to pass in the min/max
|
||||
// values of the weight tensor. If the rowwise_min_max is not provided, the min/max
|
||||
// values will be computed from the weight tensor.
|
||||
//
|
||||
// Python example examining a packed 8bit zero_point and scale:
|
||||
//
|
||||
// >> x = torch.from_numpy(np.array([[[10, 20], [30, 40]],[[50, 60], [70, 80]]],
|
||||
@ -221,7 +255,10 @@ namespace at::native {
|
||||
//
|
||||
// [[50. , 60.00000035],
|
||||
// [70. , 80.00000035]]])
|
||||
Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
|
||||
Tensor& qembeddingbag_byte_prepack_out(
|
||||
Tensor& output,
|
||||
const Tensor& weight,
|
||||
const std::optional<Tensor>& rowwise_min_max_opt) {
|
||||
// The "last" dimension of an N-Dimensioned batch of embedding bags is
|
||||
// quantization channel. E.g. for a 2D embedding bag, this has
|
||||
// [ row, col ] dimensions, for batched of embedding bags, dimensions might be
|
||||
@ -256,9 +293,16 @@ Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
|
||||
auto* output_data = output.data_ptr<uint8_t>();
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
// Move these outside of the ifdef when we support non-FBGEMM flow.
|
||||
const auto is_valid_rowwise_min_max = _validate_rowwise_min_max(weight, rowwise_min_max_opt);
|
||||
const auto rowwise_min_max_contig = _get_rowwise_min_max_contig(rowwise_min_max_opt);
|
||||
|
||||
if (weight_contig->scalar_type() == at::ScalarType::Half) {
|
||||
const auto weight_data =
|
||||
static_cast<fbgemm::float16*>(weight_contig->data_ptr());
|
||||
const auto rowwise_min_max_data = is_valid_rowwise_min_max
|
||||
? static_cast<fbgemm::float16*>(rowwise_min_max_contig->data_ptr())
|
||||
: nullptr;
|
||||
at::parallel_for(
|
||||
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
|
||||
fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<
|
||||
@ -266,17 +310,21 @@ Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
|
||||
weight_data + start_idx * embedding_cols,
|
||||
end_idx - start_idx,
|
||||
embedding_cols,
|
||||
output_data + start_idx * output_columns);
|
||||
output_data + start_idx * output_columns,
|
||||
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
|
||||
});
|
||||
} else {
|
||||
const auto weight_data = weight_contig->data_ptr<float>();
|
||||
const auto rowwise_min_max_data =
|
||||
is_valid_rowwise_min_max ? rowwise_min_max_contig->data_ptr<float>() : nullptr;
|
||||
at::parallel_for(
|
||||
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
|
||||
fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<float>(
|
||||
weight_data + start_idx * embedding_cols,
|
||||
end_idx - start_idx,
|
||||
embedding_cols,
|
||||
output_data + start_idx * output_columns);
|
||||
output_data + start_idx * output_columns,
|
||||
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
|
||||
});
|
||||
}
|
||||
|
||||
@ -326,6 +374,22 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
|
||||
return output;
|
||||
}
|
||||
|
||||
static Tensor qembeddingbag_byte_prepack_with_rowwise_min_max(
|
||||
const Tensor& weight,
|
||||
const Tensor& rowwise_min_max) {
|
||||
const auto weight_contig =
|
||||
weight.expect_contiguous(weight.suggest_memory_format());
|
||||
Tensor output = at::detail::empty_cpu(
|
||||
{0},
|
||||
at::kByte,
|
||||
weight_contig->layout(),
|
||||
weight_contig->device(),
|
||||
std::nullopt,
|
||||
std::nullopt);
|
||||
qembeddingbag_byte_prepack_out(output, weight, rowwise_min_max);
|
||||
return output;
|
||||
}
|
||||
|
||||
Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) {
|
||||
const auto weight_contig =
|
||||
weight.expect_contiguous(weight.suggest_memory_format());
|
||||
@ -335,7 +399,7 @@ Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) {
|
||||
"'embedding_bag_byte_prepack' only support float32 or float16.");
|
||||
const auto weight_sizes = weight.sym_sizes();
|
||||
const auto cols_dim = weight.ndimension() - 1;
|
||||
const auto embedding_cols = weight_sizes[cols_dim];
|
||||
const auto& embedding_cols = weight_sizes[cols_dim];
|
||||
// Add 8 bytes per column to store FP32 scale and zero_point per row.
|
||||
const auto output_columns = embedding_cols + 2 * sizeof(float);
|
||||
|
||||
@ -359,7 +423,8 @@ Tensor _qembeddingbag_nbit_prepack_helper(
|
||||
int bit_width,
|
||||
const bool optimized_qparams,
|
||||
const int64_t nbins,
|
||||
const double ratio) {
|
||||
const double ratio,
|
||||
const std::optional<Tensor>& rowwise_min_max_opt = std::nullopt) {
|
||||
TORCH_CHECK(
|
||||
weight.scalar_type() == at::ScalarType::Float ||
|
||||
weight.scalar_type() == at::ScalarType::Half,
|
||||
@ -401,10 +466,17 @@ Tensor _qembeddingbag_nbit_prepack_helper(
|
||||
auto* output_data = output.data_ptr<uint8_t>();
|
||||
|
||||
#ifdef USE_FBGEMM
|
||||
// Move these outside of the ifdef when we support non-FBGEMM flow.
|
||||
const auto is_valid_rowwise_min_max = _validate_rowwise_min_max(weight, rowwise_min_max_opt);
|
||||
const auto rowwise_min_max_contig = _get_rowwise_min_max_contig(rowwise_min_max_opt);
|
||||
|
||||
if (!optimized_qparams) {
|
||||
if (weight_contig.scalar_type() == at::ScalarType::Half) {
|
||||
const auto weight_data =
|
||||
static_cast<fbgemm::float16*>(weight_contig.data_ptr());
|
||||
const auto rowwise_min_max_data = is_valid_rowwise_min_max
|
||||
? static_cast<fbgemm::float16*>(rowwise_min_max_contig->data_ptr())
|
||||
: nullptr;
|
||||
at::parallel_for(
|
||||
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
|
||||
fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<
|
||||
@ -413,10 +485,13 @@ Tensor _qembeddingbag_nbit_prepack_helper(
|
||||
weight_data + start_idx * embedding_cols,
|
||||
end_idx - start_idx,
|
||||
static_cast<int>(embedding_cols),
|
||||
output_data + start_idx * output_shape[1]);
|
||||
output_data + start_idx * output_shape[1],
|
||||
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
|
||||
});
|
||||
} else {
|
||||
const auto weight_data = weight_contig.data_ptr<float>();
|
||||
const auto rowwise_min_max_data =
|
||||
is_valid_rowwise_min_max ? rowwise_min_max_contig->data_ptr<float>() : nullptr;
|
||||
at::parallel_for(
|
||||
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
|
||||
fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<float>(
|
||||
@ -424,7 +499,8 @@ Tensor _qembeddingbag_nbit_prepack_helper(
|
||||
weight_data + start_idx * embedding_cols,
|
||||
end_idx - start_idx,
|
||||
static_cast<int>(embedding_cols),
|
||||
output_data + start_idx * output_shape[1]);
|
||||
output_data + start_idx * output_shape[1],
|
||||
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
|
||||
});
|
||||
}
|
||||
} else {
|
||||
@ -514,6 +590,16 @@ Tensor qembeddingbag_4bit_prepack(
|
||||
weight, 4 /*bit_width*/, optimized_qparams, nbins, ratio);
|
||||
}
|
||||
|
||||
Tensor qembeddingbag_4bit_prepack_with_rowwise_min_max(
|
||||
const Tensor& weight,
|
||||
const Tensor& rowwise_min_max,
|
||||
const bool optimized_qparams,
|
||||
const int64_t nbins,
|
||||
const double ratio) {
|
||||
return _qembeddingbag_nbit_prepack_helper(
|
||||
weight, 4 /*bit_width*/, optimized_qparams, nbins, ratio, rowwise_min_max);
|
||||
}
|
||||
|
||||
// Applies 2-bit row-wise quantization by determining the range
|
||||
// (maximum - minimum) and bias (minimum value) of each row in the input
|
||||
// matrix, and then scaling each element to an 2-bit number between 0 and
|
||||
@ -531,6 +617,16 @@ Tensor qembeddingbag_2bit_prepack(
|
||||
weight, 2 /*bit_width*/, optimized_qparams, nbins, ratio);
|
||||
}
|
||||
|
||||
Tensor qembeddingbag_2bit_prepack_with_rowwise_min_max(
|
||||
const Tensor& weight,
|
||||
const Tensor& rowwise_min_max,
|
||||
const bool optimized_qparams,
|
||||
const int64_t nbins,
|
||||
const double ratio) {
|
||||
return _qembeddingbag_nbit_prepack_helper(
|
||||
weight, 2 /*bit_width*/, optimized_qparams, nbins, ratio, rowwise_min_max);
|
||||
}
|
||||
|
||||
class QEmbeddingPackWeights final {
|
||||
public:
|
||||
static c10::intrusive_ptr<EmbeddingPackedParamsBase> run(const at::Tensor& weight) {
|
||||
@ -542,12 +638,21 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) {
|
||||
m.impl(
|
||||
TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack"),
|
||||
TORCH_FN(qembeddingbag_byte_prepack));
|
||||
m.impl(
|
||||
TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack_with_rowwise_min_max"),
|
||||
TORCH_FN(qembeddingbag_byte_prepack_with_rowwise_min_max));
|
||||
m.impl(
|
||||
TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack"),
|
||||
TORCH_FN(qembeddingbag_4bit_prepack));
|
||||
m.impl(
|
||||
TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack_with_rowwise_min_max"),
|
||||
TORCH_FN(qembeddingbag_4bit_prepack_with_rowwise_min_max));
|
||||
m.impl(
|
||||
TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack"),
|
||||
TORCH_FN(qembeddingbag_2bit_prepack));
|
||||
m.impl(
|
||||
TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack_with_rowwise_min_max"),
|
||||
TORCH_FN(qembeddingbag_2bit_prepack_with_rowwise_min_max));
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
|
||||
|
||||
@ -3,7 +3,10 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight);
|
||||
Tensor& qembeddingbag_byte_prepack_out(
|
||||
Tensor& output,
|
||||
const Tensor& weight,
|
||||
const std::optional<Tensor>& rowwise_min_max_opt = std::nullopt);
|
||||
|
||||
Tensor qembeddingbag_byte_prepack(const Tensor& weight);
|
||||
|
||||
|
||||
@ -121,9 +121,12 @@ TORCH_LIBRARY(quantized, m) {
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max) -> Tensor"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag});
|
||||
|
||||
@ -120,7 +120,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
|
||||
// buffer (in bytes)
|
||||
size_t orig_m = sparse_input.size(0);
|
||||
size_t div = orig_m * sparse_input.itemsize();
|
||||
size_t new_n = (compressed_size + div - 1) / div; // floor
|
||||
size_t new_n = (compressed_size + div - 1) / div; // ceil(s,d) = (s+d-1)/d
|
||||
auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n});
|
||||
|
||||
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
|
||||
@ -155,7 +155,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
|
||||
TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle));
|
||||
handle_initialized = true;
|
||||
}
|
||||
// cupsarselt constructs
|
||||
// cuSPARSELt constructs
|
||||
cusparseLtMatmulDescriptor_t matmul;
|
||||
cusparseLtMatmulPlan_t plan;
|
||||
cusparseLtMatmulAlgSelection_t alg_sel;
|
||||
|
||||
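The corrected comment above spells out the ceiling-division identity used to turn the compressed byte count into a per-row column count; a one-line reference for completeness:

#include <cstddef>

// ceil(s / d) for positive integers: (s + d - 1) / d.
constexpr size_t ceil_div(size_t s, size_t d) { return (s + d - 1) / d; }

static_assert(ceil_div(10, 4) == 3, "ceil(10/4) == 3");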
@ -176,6 +176,28 @@ bool check_head_dim_size_flash(sdp_params const& params, bool debug) {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if constexpr(caller_is_meff) {
|
||||
bool is_half = (params.query.dtype() == at::kHalf) ||
|
||||
(params.query.dtype() == at::kBFloat16);
|
||||
const int64_t alignment = is_half ? 8 : 4;
|
||||
if (!(query_size_last % alignment == 0 && query_size_last > 0 &&
|
||||
value_size_last % alignment == 0 && value_size_last > 0)) {
|
||||
if (debug) {
|
||||
TORCH_WARN(
|
||||
"Mem efficient attention requires last dimension of inputs to be divisible by ",
|
||||
alignment,
|
||||
". ",
|
||||
"Got Query.size(-1): ",
|
||||
query_size_last,
|
||||
", Key.size(-1): ",
|
||||
params.key.sym_size(-1),
|
||||
", Value.size(-1): ",
|
||||
params.value.sym_size(-1),
|
||||
" instead.");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -666,6 +688,15 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
|
||||
TORCH_WARN(CUDNN_VERSION, " cuDNN version too old to use cuDNN Attention (< v9.0.0)");
|
||||
}
|
||||
return false;
|
||||
#endif
|
||||
#if defined(CUDNN_VERSION)
|
||||
static auto cudnn_version = cudnnGetVersion();
|
||||
if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) {
|
||||
if (debug) {
|
||||
TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support droppout in SDPA (9.11 - 9.13).");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
// Define gate functions that determine if a flash kernel can be ran
|
||||
// Replace with std::to_array when we migrate to c++20
|
||||
|
||||
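The new memory-efficient-attention gate above requires the last dimension of query and value to be positive and divisible by 8 for half/bfloat16 inputs (4 otherwise). A small predicate capturing that rule, as a sketch of the check rather than the SDPA dispatcher itself:

#include <cstdint>

// Head-dim alignment rule assumed from the diff: fp16/bf16 need a multiple
// of 8, fp32 a multiple of 4; zero or negative sizes are rejected.
inline bool meff_head_dim_ok(int64_t query_last, int64_t value_last, bool is_half) {
  const int64_t alignment = is_half ? 8 : 4;
  return query_last > 0 && value_last > 0 &&
         query_last % alignment == 0 && value_last % alignment == 0;
}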
@@ -462,10 +462,11 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot
using sdp::aotriton_adapter::mk_aotensor;
using sdp::aotriton_adapter::mk_aoscalartensor;
using sdp::aotriton_adapter::mk_philoxtensor;
using sdp::aotriton_adapter::mk_atomictensor;
using sdp::aotriton_adapter::cast_dtype;
at::Tensor atomic_counter;
if (is_causal) {
atomic_counter = at::zeros({1}, q.options());
atomic_counter = at::zeros({1}, q.options().dtype(at::kInt));
}
aotriton::TensorView<4> empty_bias(0, {0,0,0,0}, {0,0,0,0}, cast_dtype(q.dtype()));
auto seed = use_philox_state ? mk_philoxtensor(philox_state.seed_.ptr) : mk_aoscalartensor(seed_t);

@@ -474,7 +475,7 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot
auto nullscalar = mk_philoxtensor(nullptr);
auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr<int64_t>()) : nullscalar;
auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr<int64_t>()) : nullscalar;
auto persistent_counter = is_causal ? mk_philoxtensor(atomic_counter.data_ptr<int64_t>()) : nullscalar;
auto persistent_counter = mk_atomictensor(is_causal ? atomic_counter.data_ptr<int32_t>() : nullptr);
if (uses_swa || AOTRITON_ALWAYS_V3_API) {
#if AOTRITON_V3_API
using aotriton::v3::flash::CausalType;

@@ -2,22 +2,12 @@

// ${generated_comment}

#include <ATen/FunctionalStorageImpl.h>
#include <ATen/Tensor.h>

namespace at {
namespace functionalization {

enum class InverseReturnMode {
/// Specifies that functional inverses should always return a view.
AlwaysView,
/// Specifies that functional inverses should always return a non-view / copy.
NeverView,
/// Specifies that functional inverses should return a view unless a (copying) scatter
/// inverse exists, in which case that will be used instead.
/// This avoids as_strided() calls that can be difficult for subclasses to handle.
ViewOrScatterInverse,
};

struct FunctionalInverses {

${view_inverse_declarations}

@@ -4,7 +4,7 @@
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/EmptyTensor.h>
#include <ATen/FunctionalTensorWrapper.h>
#include <ATen/FunctionalInverses.h>
#include <ATen/ViewMetaClasses.h>
#include <ATen/MemoryOverlap.h>
#include <torch/library.h>

aten/src/ATen/templates/ViewMetaClasses.cpp (new file, 19 lines)
@@ -0,0 +1,19 @@
// ${generated_comment}

#include <ATen/FunctionalInverses.h>
#include <ATen/ViewMetaClasses.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#include <ATen/NativeFunctions.h>
#else
${op_headers}
#endif

namespace at {
namespace functionalization {

${view_meta_implementations}

} // namespace functionalization
} // namespace at

aten/src/ATen/templates/ViewMetaClasses.h (new file, 12 lines)
@@ -0,0 +1,12 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}

#include <ATen/FunctionalStorageImpl.h>

namespace at {
namespace functionalization {

${view_meta_declarations}

} // namespace functionalization
} // namespace at

aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp (new file, 11 lines)
@@ -0,0 +1,11 @@
#include <ATen/ViewMetaClasses.h>
#include <torch/csrc/functionalization/Module.h>

namespace torch::functionalization {

void initGenerated(PyObject* module) {
auto functionalization = py::handle(module).cast<py::module>();
$view_meta_bindings
}

} // namespace torch::functionalization

@@ -1561,6 +1561,38 @@ namespace {
<< "Failure Details:\nTest Seed to reproduce: " << seed;
}
}
#endif
#if defined(CPU_CAPABILITY_AVX512)
TYPED_TEST(Quantization8BitTests, TransposePackVNNI4) {
using VT = ValueType<TypeParam>;
constexpr auto K = 197;
constexpr auto N = 64;
constexpr auto L = K * N;
constexpr auto ld_src = N;
constexpr auto ld_dst = K * 4;
CACHE_ALIGN VT x[L];
CACHE_ALIGN VT y[L];
CACHE_ALIGN VT ref[L];
auto seed = TestSeed();
ValueGen<VT> generator(VT(-100), VT(100), seed);
for (const auto i : c10::irange(L)) {
x[i] = generator.get();
}
at::vec::transpose_pack_vnni4(x, y, ld_src, K, N);
int64_t _N = N / 4;
for (int64_t k = 0; k < K; k++) {
for(int64_t n = 0; n < _N; n++) {
for(int64_t l = 0; l < 4; l++) {
ref[n * ld_dst + k * 4 + l] =
c10::load(&(x[k * ld_src + n * 4 + l]));
}
}
}
for (const auto i : c10::irange(L)) {
ASSERT_EQ(y[i], ref[i])
<< "Failure Details:\nTest Seed to reproduce: " << seed;
}
}
#endif
TYPED_TEST(FunctionalTests, Map) {
using vec = TypeParam;

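The reference loops in the new test pin down transpose_pack_vnni4 as a pure permutation: the element at flat offset k * ld_src + n * 4 + l moves to n * ld_dst + k * 4 + l, i.e. groups of 4 adjacent columns stay together while the row index and the column-group index swap. A NumPy sketch of that reference layout (illustrative only, not the AVX-512 kernel):

import numpy as np

def transpose_pack_vnni4_ref(x, K, N):
    # x is a flat K*N buffer with row stride ld_src = N; N must be divisible by 4.
    src = x.reshape(K, N // 4, 4)         # [k][n][l]
    dst = np.transpose(src, (1, 0, 2))    # [n][k][l], row stride ld_dst = K * 4
    return np.ascontiguousarray(dst).reshape(-1)

x = np.arange(6 * 8, dtype=np.int8)       # small K=6, N=8 example
y = transpose_pack_vnni4_ref(x, 6, 8)
assert y[0 * 24 + 2 * 4 + 1] == x[2 * 8 + 0 * 4 + 1]   # same indexing as the test's ref loop
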
@@ -78,6 +78,8 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
"mistralai/Mistral-7B-Instruct-v0.3",
"openai/gpt-oss-20b",
}
)

@@ -61,6 +61,8 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
"mistralai/Mistral-7B-Instruct-v0.3",
"openai/gpt-oss-20b",
}
)

@@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0


Qwen/Qwen3-0.6B,pass,0



mistralai/Mistral-7B-Instruct-v0.3,pass,0



openai/gpt-oss-20b,pass,0


@@ -187,3 +187,11 @@ openai/whisper-tiny,fail_to_run,0


Qwen/Qwen3-0.6B,fail_to_run,0



mistralai/Mistral-7B-Instruct-v0.3,fail_to_run,0



openai/gpt-oss-20b,fail_to_run,0


@@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0


Qwen/Qwen3-0.6B,pass_due_to_skip,0



mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0



openai/gpt-oss-20b,pass_due_to_skip,0


@@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0


Qwen/Qwen3-0.6B,pass_due_to_skip,0



mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0



openai/gpt-oss-20b,pass_due_to_skip,0


@@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0


Qwen/Qwen3-0.6B,pass_due_to_skip,0



mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0



openai/gpt-oss-20b,pass_due_to_skip,0


@@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0


Qwen/Qwen3-0.6B,pass,0



mistralai/Mistral-7B-Instruct-v0.3,pass,0



openai/gpt-oss-20b,pass,0


@@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0


Qwen/Qwen3-0.6B,pass,0



mistralai/Mistral-7B-Instruct-v0.3,pass,0



openai/gpt-oss-20b,pass,0


@@ -318,7 +318,7 @@ timm_vovnet,pass,0



torch_multimodal_clip,pass,3
torch_multimodal_clip,pass,0


@@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0


Qwen/Qwen3-0.6B,pass,0



mistralai/Mistral-7B-Instruct-v0.3,pass,0



openai/gpt-oss-20b,pass,0


@@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0


Qwen/Qwen3-0.6B,pass,0



mistralai/Mistral-7B-Instruct-v0.3,pass,0



openai/gpt-oss-20b,pass,0


@@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0


Qwen/Qwen3-0.6B,pass,0



mistralai/Mistral-7B-Instruct-v0.3,pass,0



openai/gpt-oss-20b,pass,0

@@ -11,6 +11,8 @@ skip:
- GPTJForQuestionAnswering
# Model too big
- google/gemma-3-4b-it
- openai/gpt-oss-20b
- mistralai/Mistral-7B-Instruct-v0.3

device:
cpu:

@@ -19,6 +21,8 @@ skip:
- google/gemma-3-4b-it
- openai/whisper-tiny
- Qwen/Qwen3-0.6B
- mistralai/Mistral-7B-Instruct-v0.3
- openai/gpt-oss-20b

control_flow:
- AllenaiLongformerBase

@@ -79,6 +83,8 @@ batch_size:
google/gemma-3-4b-it: 8
openai/whisper-tiny: 8
Qwen/Qwen3-0.6B: 8
mistralai/Mistral-7B-Instruct-v0.3: 8
openai/gpt-oss-20b: 8

tolerance:

@@ -99,4 +99,6 @@ HF_LLM_MODELS: dict[str, Benchmark] = {
"google/gemma-3-4b-it": TextGenerationBenchmark,
"openai/whisper-tiny": WhisperBenchmark,
"Qwen/Qwen3-0.6B": TextGenerationBenchmark,
"mistralai/Mistral-7B-Instruct-v0.3": TextGenerationBenchmark,
"openai/gpt-oss-20b": TextGenerationBenchmark,
}

@@ -51,3 +51,5 @@ google/gemma-2-2b,8
google/gemma-3-4b-it,8
openai/whisper-tiny,8
Qwen/Qwen3-0.6B,8
mistralai/Mistral-7B-Instruct-v0.3, 8
openai/gpt-oss-20b, 8

@@ -6,4 +6,4 @@
4. (Optional) flip a flag that you know will change the benchmark and run again with b.txt `PYTHONPATH=./ python benchmarks/[YOUR_BENCHMARK].py a.txt`
5. Compare `a.txt` and `b.txt` located within the `benchmarks/dynamo/pr_time_benchmarks` folder to make sure things look as you expect
6. Check in your new benchmark file and submit a new PR
7. In a few days, if your benchmark is stable, bug Laith Sakka to enable running your benchmark on all PRs. If your a meta employee, you can find the dashboard here: internalfb.com/intern/unidash/dashboard/pt2_diff_time_metrics
7. In a few days, if your benchmark is stable, bug Laith Sakka to enable running your benchmark on all PRs. If you are a meta employee, you can find the dashboard here: https://internalfb.com/intern/unidash/dashboard/pt2_diff_time_metrics

@@ -0,0 +1,111 @@
import sys

from benchmark_base import BenchmarkBase

import torch
from torch.autograd.grad_mode import inference_mode


class Benchmark(BenchmarkBase):
    def __init__(self, requires_grad, inference_mode, backward, dynamic):
        assert not (inference_mode and backward), (
            "inference_mode and backward cannot be both True"
        )

        self._requires_grad = requires_grad
        self._inference_mode = inference_mode
        self._backward = backward

        super().__init__(
            category="runtime_overhead",
            backend="inductor",
            device="cuda",
            dynamic=dynamic,
        )

    def name(self):
        prefix = f"{self.category()}_{self.backend()}"
        if self._requires_grad:
            prefix += "_requires_grad"
        if self._inference_mode:
            prefix += "_inference_mode"
        if self._backward:
            prefix += "_backward"
        if self.is_dynamic():
            prefix += "_dynamic"
        return prefix

    def description(self):
        return "runtime of a compiled add1 op small input"

    def _prepare_once(self):
        torch._dynamo.reset()
        self.a = torch.ones(2, device=self.device(), requires_grad=self._requires_grad)

        @torch.compile(
            backend=self.backend(),
            fullgraph=True,
            dynamic=self.is_dynamic(),
        )
        def add1(a):
            return a + 1

        self._add1 = add1

        # warmup
        for _ in range(10):
            if self._backward:
                self.forward_val = self._add1(self.a).sum()
                self.forward_val.backward()
            else:
                self._work()

    def _prepare(self):
        if self._backward:
            self.forward_val = self._add1(self.a).sum()

    def _work(self):
        if self._inference_mode:
            with inference_mode():
                self._add1(self.a)
        elif self._backward:
            self.forward_val.backward()
        else:
            self._add1(self.a)


def main():
    result_path = sys.argv[1]
    all = [
        Benchmark(
            requires_grad=False, inference_mode=False, backward=False, dynamic=False
        ),
        Benchmark(
            requires_grad=False, inference_mode=True, backward=False, dynamic=False
        ),
        Benchmark(
            requires_grad=True, inference_mode=False, backward=False, dynamic=False
        ),
        Benchmark(
            requires_grad=True, inference_mode=False, backward=True, dynamic=False
        ),
        Benchmark(
            requires_grad=False, inference_mode=False, backward=False, dynamic=True
        ),
        Benchmark(
            requires_grad=False, inference_mode=True, backward=False, dynamic=True
        ),
        Benchmark(
            requires_grad=True, inference_mode=False, backward=False, dynamic=True
        ),
        Benchmark(
            requires_grad=True, inference_mode=False, backward=True, dynamic=True
        ),
    ]

    for benchmark in all:
        benchmark.enable_instruction_count().collect_all().append_results(result_path)


if __name__ == "__main__":
    main()

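For reference, the name() scheme in the new benchmark file yields one dashboard entry per configuration; a small sketch that re-derives the string without constructing the class (illustrative only):

def benchmark_name(requires_grad, inference_mode, backward, dynamic):
    # Same concatenation as Benchmark.name() above, with category/backend fixed.
    name = "runtime_overhead_inductor"
    if requires_grad:
        name += "_requires_grad"
    if inference_mode:
        name += "_inference_mode"
    if backward:
        name += "_backward"
    if dynamic:
        name += "_dynamic"
    return name

assert benchmark_name(True, False, True, False) == "runtime_overhead_inductor_requires_grad_backward"
assert benchmark_name(False, True, False, True) == "runtime_overhead_inductor_inference_mode_dynamic"
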
@@ -156,7 +156,7 @@ ROOT = "//" if IS_OSS else "//xplat/caffe2"
# for targets in subfolders
ROOT_PATH = "//" if IS_OSS else "//xplat/caffe2/"

C10 = "//c10:c10" if IS_OSS else ("//xplat/caffe2/c10:c10_ovrsource" if is_arvr_mode() else "//xplat/caffe2/c10:c10")
C10 = "//c10:c10" if IS_OSS else "//xplat/caffe2/c10:c10"

# a dictionary maps third party library name to fbsource and oss target
THIRD_PARTY_LIBS = {

@@ -391,6 +391,8 @@ def get_aten_generated_files(enabled_backends):
"CompositeExplicitAutogradFunctions_inl.h",
"CompositeExplicitAutogradNonFunctionalFunctions.h",
"CompositeExplicitAutogradNonFunctionalFunctions_inl.h",
"ViewMetaClasses.h",
"ViewMetaClasses.cpp",
"VmapGeneratedPlumbing.h",
"core/ATenOpList.cpp",
"core/TensorBody.h",

@@ -948,7 +950,6 @@ def define_buck_targets(
[
("torch/csrc/api/include", "torch/**/*.h"),
("", "torch/csrc/**/*.h"),
("", "torch/csrc/**/*.hpp"),
("", "torch/nativert/**/*.h"),
("", "torch/headeronly/**/*.h"),
("", "torch/script.h"),

@@ -1193,6 +1194,7 @@ def define_buck_targets(
"NativeMetaFunctions.h": ":gen_aten[NativeMetaFunctions.h]",
"Operators.h": ":gen_aten[Operators.h]",
"RedispatchFunctions.h": ":gen_aten[RedispatchFunctions.h]",
"ViewMetaClasses.h": ":gen_aten[ViewMetaClasses.h]",
"core/TensorBody.h": ":gen_aten[core/TensorBody.h]",
"core/aten_interned_strings.h": ":gen_aten[core/aten_interned_strings.h]",
"core/enum_tag.h": ":gen_aten[core/enum_tag.h]",

@@ -2048,7 +2050,6 @@ define_buck_targets(
("", "caffe2/utils/*.h"),
("", "caffe2/core/*.h"),
("", "torch/csrc/*.h"),
("", "torch/csrc/*.hpp"),
("", "torch/csrc/api/include/torch/*.h"),
("", "torch/csrc/autograd/*.h"),
("", "torch/csrc/autograd/*/*.h"),

Some files were not shown because too many files have changed in this diff.