Compare commits


1 Commit

Author SHA1 Message Date
c56b575e61 Add eager mode in inductor 2025-09-22 14:32:25 -07:00
653 changed files with 14286 additions and 20374 deletions
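The single commit in this range ("Add eager mode in inductor") threads a TorchDynamo "eager" backend option through the benchmark and CI test scripts diffed below. As a minimal sketch of what that backend selection amounts to, assuming nothing beyond the standard torch.compile API that the added validation step in the CI test script itself exercises:

    import torch

    def f(x):
        return x * 2

    # The "eager" backend runs the captured graph with plain PyTorch ops
    # (no Inductor code generation), so it is a cheap sanity check for Dynamo.
    assert "eager" in torch._dynamo.list_backends()
    compiled = torch.compile(f, backend="eager")
    print(compiled(torch.tensor([1.0, 2.0])))  # tensor([2., 4.])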

View File

@@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
try:
with socket.create_connection((addr, port), timeout=timeout):
return
-except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
+except (ConnectionRefusedError, socket.timeout): # noqa: PERF203
if i == attempt_cnt - 1:
raise
time.sleep(timeout)
@@ -1004,7 +1004,7 @@ if __name__ == "__main__":
install_condaforge_python(host, args.python_version)
sys.exit(0)
-python_version = args.python_version if args.python_version is not None else "3.10"
+python_version = args.python_version if args.python_version is not None else "3.9"
if args.use_torch_from_pypi:
configure_system(host, compiler=args.compiler, python_version=python_version)

View File

@@ -69,8 +69,7 @@ RUN bash ./install_cuda.sh 13.0
ENV DESIRED_CUDA=13.0
FROM ${ROCM_IMAGE} as rocm
-ARG PYTORCH_ROCM_ARCH
-ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
+ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh
ENV MKLROOT /opt/intel

View File

@@ -36,12 +36,6 @@ case ${DOCKER_TAG_PREFIX} in
;;
rocm*)
BASE_TARGET=rocm
-PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-# add gfx950 conditionally starting in ROCm 7.0
-if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
-PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
-fi
-EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
;;
*)
echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"

View File

@@ -262,10 +262,13 @@ case "$tag" in
TRITON_CPU=yes
;;
pytorch-linux-jammy-linter)
-PYTHON_VERSION=3.10
+# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
+# We will need to update mypy version eventually, but that's for another day. The task
+# would be to upgrade mypy to 1.0.0 with Python 3.11
+PYTHON_VERSION=3.9
;;
-pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
-PYTHON_VERSION=3.10
+pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
+PYTHON_VERSION=3.9
CUDA_VERSION=12.8.1
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)

View File

@@ -1 +1 @@
-v2.28.3-1
+v2.27.5-1

View File

@@ -1 +1 @@
-v2.28.3-1
+v2.27.7-1

View File

@@ -1 +1 @@
-bbb06c0334a6772b92d24bde54956e675c8c6604
+5ae38bdb0dc066c5823e34dc9797afb9de42c866

View File

@@ -12,8 +12,8 @@ function do_install() {
rocm_version_nodot=${rocm_version//./}
-# https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+# Version 2.7.2 + ROCm related updates
+MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
rocm_dir="/opt/rocm"

View File

@@ -40,16 +40,12 @@ case ${DOCKER_TAG_PREFIX} in
;;
rocm*)
# we want the patch version of 6.4 instead
-if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
+if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-# add gfx950 conditionally starting in ROCm 7.0
-if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
-fi
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
;;
*)

View File

@@ -82,7 +82,7 @@ case ${image} in
;;
manylinux2_28-builder:rocm*)
# we want the patch version of 6.4 instead
-if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
+if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
TARGET=rocm_final
@@ -90,10 +90,6 @@ case ${image} in
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-# add gfx950 conditionally starting in ROCm 7.0
-if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
-fi
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
;;
manylinux2_28-builder:xpu)

View File

@@ -112,6 +112,8 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
+numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
+numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#Description: Just-In-Time Compiler for Numerical Functions
@@ -132,7 +134,7 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
#test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
#test_binary_ufuncs.py
-numpy==1.22.4; python_version == "3.10"
+numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
numpy==2.1.2; python_version >= "3.13"
@@ -324,6 +326,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
lxml==5.3.0
#Description: This is a requirement of unittest-xml-reporting
+# Python-3.9 binaries
PyGithub==2.3.0
sympy==1.13.3

View File

@@ -1,15 +1,8 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
-standard-imghdr==3.13.0; python_version >= "3.13"
-#Description: This is needed by Sphinx, so it needs to be added here.
-# The reasons are as follows:
-# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
-# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
-# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.

View File

@@ -72,7 +72,7 @@ def sample_vllm_test_library():
]
),
"pytest -v -s entrypoints/llm/test_generate.py",
-"pytest -v -s entrypoints/offline_mode",
+"VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
],
},
"vllm_regression_test": {

View File

@@ -1,11 +1,11 @@
SHELL=/usr/bin/env bash
DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 7.0
+DESIRED_ROCM ?= 6.4
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
PACKAGE_NAME = magma-rocm
# inherit this from underlying docker image, do not pass this env var to docker
-#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
+#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
@@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
magma-rocm/build_magma.sh
.PHONY: all
-all: magma-rocm70
all: magma-rocm64
all: magma-rocm63
@@ -25,11 +24,6 @@ clean:
$(RM) -r magma-*
$(RM) -r output
-.PHONY: magma-rocm70
-magma-rocm70: DESIRED_ROCM := 7.0
-magma-rocm70:
-$(DOCKER_RUN)
.PHONY: magma-rocm64
magma-rocm64: DESIRED_ROCM := 6.4
magma-rocm64:

View File

@@ -6,8 +6,8 @@ set -eou pipefail
# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-# https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+# Version 2.7.2 + ROCm related updates
+MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
# Folders for the build
PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
# Fetch magma sources and verify checksum
pushd ${PACKAGE_DIR}
-git clone https://github.com/jeffdaily/magma
+git clone https://bitbucket.org/icl/magma.git
pushd magma
git checkout ${MAGMA_VERSION}
popd

View File

@@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \
# Build the docs
pushd docs/cpp
-time make VERBOSE=1 html
+time make VERBOSE=1 html -j
popd
popd

View File

@@ -55,7 +55,7 @@ test_python_shard() {
setup_test_python
-time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS"
+time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS"
assert_git_not_dirty
}

View File

@@ -322,29 +322,23 @@ test_python_shard() {
# modify LD_LIBRARY_PATH to ensure it has the conda env.
# This set of tests has been shown to be buggy without it for the split-build
-time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_python() {
# shellcheck disable=SC2086
-time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
+time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
assert_git_not_dirty
}
test_python_smoke() {
-# Smoke tests for H100/B200
+# Smoke tests for H100
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
-test_python_smoke_b200() {
-# Targeted smoke tests for B200 - staged approach to avoid too many failures
-time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
-assert_git_not_dirty
-}
test_h100_distributed() {
# Distributed tests at H100
time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
@@ -390,7 +384,6 @@ test_dynamo_wrapped_shard() {
--exclude-distributed-tests \
--exclude-torch-export-tests \
--exclude-aot-dispatch-tests \
---exclude-quantization-tests \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose \
--upload-artifacts-while-running
@@ -568,6 +561,43 @@ else
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
fi
+# Validate backend availability for dynamo_eager configs
+if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
+echo "Validating eager backend availability for TEST_CONFIG: ${TEST_CONFIG}"
+if ! python -c "import torch; backends = torch._dynamo.list_backends(); print('Available backends:', backends); assert 'eager' in backends, f'eager backend not available. Available: {backends}'"; then
+echo "ERROR: eager backend not available in this environment"
+echo "This might be due to missing dependencies or incorrect PyTorch installation"
+exit 1
+fi
+echo "eager backend validation successful"
+# Additional validation: test that torch.compile works with eager backend
+echo "Testing torch.compile with eager backend..."
+if ! python -c "
+import torch
+import torch._dynamo as dynamo
+def test_func(x):
+return x * 2
+# Test that eager backend works
+try:
+compiled_func = torch.compile(test_func, backend='eager')
+result = compiled_func(torch.tensor([1.0, 2.0]))
+print('torch.compile with eager backend test successful')
+except Exception as e:
+print(f'ERROR: torch.compile with eager backend failed: {e}')
+exit(1)
+"; then
+echo "ERROR: torch.compile with eager backend failed"
+exit 1
+fi
+fi
+# Debug logging for backend selection
+echo "TEST_CONFIG: ${TEST_CONFIG}"
+echo "DYNAMO_BENCHMARK_FLAGS: ${DYNAMO_BENCHMARK_FLAGS[*]}"
test_cachebench() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
@@ -629,6 +659,16 @@ test_perf_for_dashboard() {
shift
local backend=inductor
+# Allow surfacing eager metrics in CI by switching backend based on TEST_CONFIG
+if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
+backend=eager
+elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
+backend=aot_eager
+fi
+# Debug logging for backend selection in test_perf_for_dashboard
+echo "test_perf_for_dashboard: TEST_CONFIG=${TEST_CONFIG}, selected backend=${backend}"
+echo "DASHBOARD_TAG=${DASHBOARD_TAG}"
local modes=()
if [[ "$DASHBOARD_TAG" == *training-true* ]]; then
modes+=(training)
@@ -682,20 +722,37 @@ test_perf_for_dashboard() {
fi
if [[ "$DASHBOARD_TAG" == *default-true* ]]; then
-$TASKSET python "benchmarks/dynamo/$suite.py" \
+echo "Running benchmark: ${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
+echo "Command: $TASKSET python benchmarks/dynamo/$suite.py ${target_flag[*]} --$mode --$dtype --backend $backend --disable-cudagraphs $*"
+if ! $TASKSET python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
---output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
+--output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"; then
+echo "ERROR: Benchmark failed for ${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
+echo "This might indicate an issue with the eager backend or benchmark configuration"
+exit 1
+fi
+echo "Benchmark completed successfully: ${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
fi
if [[ "$DASHBOARD_TAG" == *cudagraphs-true* ]]; then
-$TASKSET python "benchmarks/dynamo/$suite.py" \
+echo "Running benchmark: ${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
+if ! $TASKSET python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
---output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
+--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"; then
+echo "ERROR: Benchmark failed for ${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
+exit 1
+fi
+echo "Benchmark completed successfully: ${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}"
fi
if [[ "$DASHBOARD_TAG" == *dynamic-true* ]]; then
-$TASKSET python "benchmarks/dynamo/$suite.py" \
+echo "Running benchmark: ${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}"
+if ! $TASKSET python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
--dynamic-batch-only "$@" \
---output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"
+--output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"; then
+echo "ERROR: Benchmark failed for ${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}"
+exit 1
+fi
+echo "Benchmark completed successfully: ${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}"
fi
if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]]; then
TORCHINDUCTOR_CPP_WRAPPER=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
@@ -1163,12 +1220,6 @@ test_distributed() {
fi
}
-test_quantization() {
-echo "Testing quantization"
-python test/test_quantization.py
-}
test_rpc() {
echo "Testing RPC C++ tests"
# NB: the ending test_rpc must match the current function name for the current
@@ -1586,7 +1637,7 @@ test_executorch() {
test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
-test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \
+test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
@@ -1630,25 +1681,6 @@ test_operator_benchmark() {
--expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
}
-test_operator_microbenchmark() {
-TEST_REPORTS_DIR=$(pwd)/test/test-reports
-mkdir -p "$TEST_REPORTS_DIR"
-TEST_DIR=$(pwd)
-cd benchmarks/operator_benchmark/pt_extension
-python -m pip install .
-cd "${TEST_DIR}"/benchmarks/operator_benchmark
-for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
-$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
---output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
---benchmark-name "PyTorch operator microbenchmark" --use-compile
-$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
---output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \
---benchmark-name "PyTorch operator microbenchmark"
-done
-}
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
@@ -1681,8 +1713,6 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
test_executorch
elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
test_python_legacy_jit
-elif [[ "$TEST_CONFIG" == 'quantization' ]]; then
-test_quantization
elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
# TODO: run some C++ tests
echo "no-op at the moment"
@@ -1705,8 +1735,6 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
test_operator_benchmark cpu ${TEST_MODE}
fi
-elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
-test_operator_microbenchmark
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
@@ -1809,14 +1837,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
test_xpu_bin
elif [[ "${TEST_CONFIG}" == smoke ]]; then
test_python_smoke
-elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
-test_python_smoke_b200
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
test_h100_symm_mem
-elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then
-test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
test_h100_cutlass_backend
else

View File

@@ -25,7 +25,7 @@ echo Copying over test times file
robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"
echo Run nn tests
-python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
+python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
if ERRORLEVEL 1 goto fail
popd

View File

@@ -63,7 +63,7 @@ if errorlevel 1 exit /b 1
call %CONDA_HOME%\condabin\activate.bat testenv
if errorlevel 1 exit /b 1
-call conda install -y -q -c conda-forge libuv=1.51
+call conda install -y -q -c conda-forge libuv=1.39
call conda install -y -q intel-openmp
echo "install and test libtorch"

View File

@@ -69,8 +69,6 @@ readability-string-compare,
'
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
WarningsAsErrors: '*'
-LineFilter:
-- name: '/usr/include/.*'
CheckOptions:
cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true
cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true

View File

@@ -22,9 +22,6 @@ self-hosted-runner:
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
-- linux.aws.h100
-- linux.aws.h100.4
-- linux.aws.h100.8
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu

View File

@@ -59,7 +59,7 @@ runs:
set -x
# Create new py_tmp env with python-version
-${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv
+${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp
PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
EXIT_CODE=$?

View File

@@ -1 +1 @@
-da63274d9f3d06ba5815b5c8786a7194923a0234
+367a480bd3534edf27a8dac3c6f7ea8af9d1ed45

View File

@@ -525,21 +525,6 @@
- Lint
- pull
-- name: typechecking
-patterns:
-- 'pyrefly.toml'
-- 'mypy.ini'
-- 'mypy-strict.ini'
-approved_by:
-- lolpack
-- maggiemoss
-- ndmitchell
-- kinto0
-mandatory_checks_name:
-- EasyCLA
-- Lint
-- pull
- name: superuser
patterns:
- '*'

View File

@@ -1,44 +1,41 @@
tracking_issue: 24422
ciflow_tracking_issue: 64124
ciflow_push_tags:
-- ciflow/b200
-- ciflow/b200-symm-mem
- ciflow/binaries
- ciflow/binaries_libtorch
- ciflow/binaries_wheel
-- ciflow/h100
-- ciflow/h100-cutlass-backend
-- ciflow/h100-distributed
-- ciflow/h100-symm-mem
+- ciflow/triton_binaries
- ciflow/inductor
-- ciflow/inductor-cu126
-- ciflow/inductor-micro-benchmark
-- ciflow/inductor-micro-benchmark-cpu-x86
-- ciflow/inductor-perf-compare
-- ciflow/inductor-perf-test-nightly-rocm
-- ciflow/inductor-perf-test-nightly-x86-zen
- ciflow/inductor-periodic
- ciflow/inductor-rocm
+- ciflow/inductor-perf-test-nightly-rocm
+- ciflow/inductor-perf-compare
+- ciflow/inductor-micro-benchmark
+- ciflow/inductor-micro-benchmark-cpu-x86
+- ciflow/inductor-perf-test-nightly-x86-zen
+- ciflow/inductor-cu126
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
-- ciflow/op-benchmark
- ciflow/periodic
- ciflow/periodic-rocm-mi300
-- ciflow/pull
-- ciflow/quantization-periodic
-- ciflow/riscv64
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/s390
+- ciflow/riscv64
- ciflow/slow
-- ciflow/torchbench
-- ciflow/triton_binaries
- ciflow/trunk
- ciflow/unstable
-- ciflow/vllm
-- ciflow/win-arm64
- ciflow/xpu
+- ciflow/vllm
+- ciflow/torchbench
+- ciflow/op-benchmark
+- ciflow/pull
+- ciflow/h100
+- ciflow/h100-distributed
+- ciflow/win-arm64
+- ciflow/h100-symm-mem
+- ciflow/h100-cutlass-backend
retryable_workflows:
- pull
- trunk
@@ -47,4 +44,4 @@ retryable_workflows:
- inductor-A100-perf-nightly
labeler_config: labeler.yml
label_to_label_config: label_to_label.yml
-mergebot: true
+mergebot: True

View File

@@ -30,7 +30,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
}
# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
-ROCM_ARCHES = ["6.4", "7.0"]
+ROCM_ARCHES = ["6.3", "6.4"]
XPU_ARCHES = ["xpu"]
@@ -53,7 +53,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-"nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
+"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
@@ -70,7 +70,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
-"nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
+"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
@@ -87,7 +87,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
-"nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | "
+"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "

View File

@@ -155,7 +155,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
-arches=["13.0"],
+arches=["12.8"],
python_versions=["3.12"],
),
branches="main",

View File

@@ -71,15 +71,12 @@ jobs:
with:!{{ upload.binary_env_as_input(config) }}
{%- if "aarch64" in build_environment %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
{%- elif "s390x" in build_environment %}
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
timeout-minutes: 420
-{%- elif config["gpu_arch_type"] == "rocm" %}
-runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-timeout-minutes: 300
{%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.24xlarge.ephemeral

View File

@@ -67,7 +67,7 @@ jobs:
# an OOM issue when running the job, so this upgrades the runner from 4xlarge
# to the next available tier of 12xlarge. So much memory just to generate cpp
# doc
-runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory
+runner: ${{ inputs.runner_prefix }}linux.12xlarge
# TODO: Nightly cpp docs take longer and longer to finish (more than 3h now)
# Let's try to figure out how this can be improved
timeout-minutes: 360

View File

@@ -2,12 +2,6 @@ name: Get Changed Files
on:
workflow_call:
-inputs:
-all_files:
-description: "Whether to return all files instead of just changed files"
-required: false
-type: boolean
-default: false
outputs:
changed-files:
description: "List of changed files (space-separated) or '*' if not in a PR"
@@ -32,23 +26,17 @@ jobs:
# Get the PR number from the github context
PR_NUMBER="${{ github.event.number }}"
-# Check if all_files is requested
-if [ "${{ inputs.all_files }}" = "true" ]; then
-echo "all_files input is true, returning all files"
-echo "changed-files=*" >> "$GITHUB_OUTPUT"
-else
-# Use gh CLI to get changed files in the PR with explicit repo
-CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
+# Use gh CLI to get changed files in the PR with explicit repo
+CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
-fi
-echo "Changed files: $CHANGED_FILES"
-echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
fi
+echo "Changed files: $CHANGED_FILES"
+echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
else
echo "Not in PR context, setting changed files to '*'"
echo "changed-files=*" >> "$GITHUB_OUTPUT"

View File

@@ -273,8 +273,6 @@ jobs:
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
-EXTRA_FLAGS: ${{ matrix.extra_flags || '' }}
-OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}

View File

@@ -1,60 +0,0 @@
-name: Limited CI for symmetric memory tests on B200
-on:
-pull_request:
-paths:
-- .github/workflows/b200-symm-mem.yml
-workflow_dispatch:
-push:
-tags:
-- ciflow/b200-symm-mem/*
-schedule:
-- cron: 22 8 * * * # about 1:22am PDT
-concurrency:
-group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-cancel-in-progress: true
-permissions:
-id-token: write
-contents: read
-jobs:
-get-label-type:
-if: github.repository_owner == 'pytorch'
-name: get-label-type
-uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-with:
-triggering_actor: ${{ github.triggering_actor }}
-issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-curr_branch: ${{ github.head_ref || github.ref_name }}
-curr_ref_type: ${{ github.ref_type }}
-linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm:
-name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-uses: ./.github/workflows/_linux-build.yml
-needs: get-label-type
-with:
-runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runner: linux.12xlarge.memory
-build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-cuda-arch-list: '10.0'
-test-matrix: |
-{ include: [
-{ config: "b200-symm-mem", shard: 1, num_shards: 1, runner: "linux.dgx.b200.8" },
-]}
-secrets: inherit
-linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
-name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-uses: ./.github/workflows/_linux-test.yml
-needs:
-- linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm
-with:
-build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
-docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.docker-image }}
-test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.test-matrix }}
-aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-secrets: inherit

View File

@@ -36,7 +36,7 @@ jobs:
runs-on: linux.9xlarge.ephemeral
strategy:
matrix:
-tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "rocm7.0", "cpu"]
+tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"]
steps:
- name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main

View File

@@ -52,8 +52,8 @@ jobs:
{ tag: "cuda12.9" },
{ tag: "cuda12.8" },
{ tag: "cuda12.6" },
+{ tag: "rocm6.3" },
{ tag: "rocm6.4" },
-{ tag: "rocm7.0" },
{ tag: "cpu" },
]
steps:

View File

@@ -34,7 +34,7 @@ jobs:
id-token: write
strategy:
matrix:
-rocm_version: ["70", "64"]
+rocm_version: ["64", "63"]
steps:
- name: Checkout PyTorch
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@@ -52,8 +52,8 @@ jobs:
{ name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
+{ name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
-{ name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", runner: "linux.9xlarge.ephemeral" },

View File

@@ -50,12 +50,12 @@ jobs:
strategy:
fail-fast: false
matrix:
-py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
+py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
device: ["cuda", "rocm", "xpu", "aarch64"]
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
- device: "rocm"
-rocm_version: "7.0"
+rocm_version: "6.4"
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
- device: "cuda"
rocm_version: ""
@@ -108,6 +108,9 @@ jobs:
# Determine python executable for given version
case $PY_VERS in
+3.9)
+PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python
+;;
3.10)
PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python
;;
@@ -191,7 +194,7 @@ jobs:
strategy:
fail-fast: false
matrix:
-py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
+py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
device: ["xpu"]
timeout-minutes: 40
env:

View File

@@ -35,7 +35,6 @@ jobs:
contents: write
outputs:
pt_release_name: ${{ steps.release_name.outputs.pt_release_name }}
-pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
@@ -54,12 +53,8 @@ jobs:
tag_or_branch="${tag_or_branch#refs/heads/}"
# replace directory separators with _ in branch name
tag_or_branch="${tag_or_branch//\//_}"
-torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')"
-{
-echo "PT_RELEASE_NAME=pytorch-$tag_or_branch";
-echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz";
-echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz";
-} >> "$GITHUB_ENV"
+echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV"
+echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
- name: Checkout optional submodules
run: python3 tools/optional_submodules.py
- name: Copy docs requirements for inclusion
@@ -69,47 +64,30 @@ jobs:
cp .ci/docker/requirements-docs.txt docs/requirements.txt
- name: Create source distribution
run: |
# Create new folder with specified name so extracting the archive yields that
rm -rf "/tmp/$PT_RELEASE_NAME"
cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
mv "/tmp/$PT_RELEASE_NAME" .
# Cleanup
rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
# Create archive
tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
-- name: Create PEP 517 compatible source distribution
-run: |
-pip install build==1.2.2.post1 || exit 1
-python -m build --sdist || exit 1
-cd dist || exit 1
- name: Upload source distribution for release
if: ${{ github.event_name == 'release' }}
uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2
with:
-files: |
-${{ env.PT_RELEASE_FILE }}
-${{ env.PT_PEP517_RELEASE_FILE }}
-- name: Upload source distribution to GHA artifacts # for release tags
+files: ${{env.PT_RELEASE_FILE}}
+- name: Upload source distribution to GHA artifacts for release tags
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: ${{ env.PT_RELEASE_FILE }}
path: ${{ env.PT_RELEASE_FILE }}
-- name: Upload PEP 517 source distribution to GHA artifacts # for release tags
-if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
-uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
-with:
-name: ${{ env.PT_PEP517_RELEASE_FILE }}
-path: dist/${{ env.PT_PEP517_RELEASE_FILE }}
- name: Set output
id: release_name
-run: |
-{
-echo "pt_release_name=${{ env.PT_RELEASE_FILE }}";
-echo "pt_pep517_release_name=${{ env.PT_PEP517_RELEASE_FILE }}";
-} >> "${GITHUB_OUTPUT}"
+run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"
upload_source_code_to_s3:
if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
@@ -125,9 +103,6 @@
- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
with:
name: ${{ needs.release.outputs.pt_release_name }}
-- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
-with:
-name: ${{ needs.release.outputs.pt_pep517_release_name }}
- name: Configure AWS credentials(PyTorch account)
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
@@ -138,9 +113,7 @@
s3-bucket: pytorch
s3-prefix: source_code/test
if-no-files-found: warn
-path: |
-${{ needs.release.outputs.pt_release_name }}
-${{ needs.release.outputs.pt_pep517_release_name }}
+path: ${{ needs.release.outputs.pt_release_name }}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}

View File

@@ -70,7 +70,7 @@ jobs:
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
-pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
+pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu,
pytorch-linux-noble-riscv64-py3.12-gcc14

View File

@@ -62,7 +62,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@@ -128,11 +128,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -174,11 +174,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -220,11 +220,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -265,7 +265,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@@ -331,11 +331,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-runs_on: linux.arm64.r7g.12xlarge.memory
+runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_6 build_name: manywheel-py3_11-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -377,11 +377,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.11" DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_8 build_name: manywheel-py3_11-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -423,11 +423,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.11" DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0 build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -468,7 +468,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.12" DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cpu-aarch64 build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
@ -534,11 +534,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.12" DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_6 build_name: manywheel-py3_12-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -580,11 +580,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12" DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_8 build_name: manywheel-py3_12-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -626,11 +626,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.12" DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0 build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -671,7 +671,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.13" DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cpu-aarch64 build_name: manywheel-py3_13-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
@ -737,11 +737,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.13" DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_6 build_name: manywheel-py3_13-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -783,11 +783,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.13" DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_8 build_name: manywheel-py3_13-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -829,11 +829,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13" DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0 build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -874,7 +874,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.13t" DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cpu-aarch64 build_name: manywheel-py3_13t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
@ -940,11 +940,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.13t" DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_6 build_name: manywheel-py3_13t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -986,11 +986,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.13t" DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_8 build_name: manywheel-py3_13t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1032,11 +1032,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.13t" DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0 build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1077,7 +1077,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14" DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cpu-aarch64 build_name: manywheel-py3_14-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
@ -1143,11 +1143,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.14" DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_6 build_name: manywheel-py3_14-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1189,11 +1189,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.14" DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_8 build_name: manywheel-py3_14-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1235,11 +1235,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14" DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0 build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1280,7 +1280,7 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t" DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cpu-aarch64 build_name: manywheel-py3_14t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
@ -1346,11 +1346,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DOCKER_IMAGE_TAG_PREFIX: cuda12.6
DESIRED_PYTHON: "3.14t" DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_6 build_name: manywheel-py3_14t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1392,11 +1392,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.14t" DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_8 build_name: manywheel-py3_14t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1438,11 +1438,11 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DOCKER_IMAGE_TAG_PREFIX: cuda13.0
DESIRED_PYTHON: "3.14t" DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.r7g.12xlarge.memory runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine" ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0 build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420 timeout-minutes: 420
secrets: secrets:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
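The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values above are a '|'-separated list of PEP 508 requirement strings, each gated by an environment marker (platform_system == 'Linux') and pinned to an exact CUDA-stack version; the nccl pin moving between 2.28.3 and 2.27.5/2.27.7 is the substantive change in this diff. A minimal sketch, assuming the value is simply split on '|' downstream (that splitting convention is an assumption, not taken from the workflow itself), of how such a string can be parsed and filtered with the packaging library:

```python
# Minimal sketch (not PyTorch's build tooling): parse a '|'-separated
# PYTORCH_EXTRA_INSTALL_REQUIREMENTS value and keep only the requirements
# whose PEP 508 marker matches the current platform.
from packaging.requirements import Requirement

raw = (
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
    "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux'"
)

for spec in (s.strip() for s in raw.split("|")):
    req = Requirement(spec)
    # marker.evaluate() checks the marker against the running interpreter/OS.
    if req.marker is None or req.marker.evaluate():
        print(f"pip install {req.name}{req.specifier}")
```

This only illustrates the marker semantics; the variable is actually consumed by the binary-build tooling invoked through the reusable workflows.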
View File
@ -316,6 +316,120 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }} github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm6_3-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: libtorch-rocm6_3-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm6_3-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm6_3-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm6_3-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm6.3
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm6_3-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm6_3-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm6_3-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
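The new rocm6.3 test job above exports GPU_FLAG with the ROCm device and group flags and then runs the test-pytorch-binary action inside the builder image. A rough, hypothetical illustration (not the actual action; the command and image tag below are placeholders) of how such a flag string is typically folded into a container invocation:

```python
# Hypothetical sketch only: compose a `docker run` command from the GPU_FLAG
# value exported by the "ROCm set GPU_FLAG" step. The real test-pytorch-binary
# action handles this differently.
import os
import shlex

gpu_flag = os.environ.get(
    "GPU_FLAG",
    "--device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon",
)
image = os.environ.get("DOCKER_IMAGE", "pytorch/libtorch-cxx11-builder:rocm6.3")

cmd = [
    "docker", "run", "--rm",
    *shlex.split(gpu_flag),  # expose /dev/kfd, /dev/dri and the video/daemon groups
    image,
    "bash", "-c", "echo smoke-test placeholder",
]
print(shlex.join(cmd))
```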
libtorch-rocm6_4-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -333,7 +447,6 @@ jobs:
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-timeout-minutes: 300
build_name: libtorch-rocm6_4-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
@ -430,118 +543,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm7_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm7_0-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm7_0-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm7_0-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm7_0-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm7.0
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm7_0-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm7_0-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm7_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -42,7 +42,7 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
-manywheel-py3_12-cuda13_0-build:
+manywheel-py3_12-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@ -51,22 +51,22 @@ jobs:
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
-DESIRED_CUDA: cu130
+DESIRED_CUDA: cu128
-GPU_ARCH_VERSION: "13.0"
+GPU_ARCH_VERSION: "12.8"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
-DOCKER_IMAGE_TAG_PREFIX: cuda13.0
+DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-build_name: manywheel-py3_12-cuda13_0
+build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
-manywheel-py3_12-cuda13_0-test: # Testing
+manywheel-py3_12-cuda12_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
-- manywheel-py3_12-cuda13_0-build
+- manywheel-py3_12-cuda12_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
@ -74,13 +74,13 @@ jobs:
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
-DESIRED_CUDA: cu130
+DESIRED_CUDA: cu128
-GPU_ARCH_VERSION: "13.0"
+GPU_ARCH_VERSION: "12.8"
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
-DOCKER_IMAGE_TAG_PREFIX: cuda13.0
+DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.12"
-build_name: manywheel-py3_12-cuda13_0
+build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
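The PYTORCH_EXTRA_INSTALL_REQUIREMENTS value above is a "|"-separated list of PEP 508 requirement strings, each gated by an environment marker so the CUDA userspace wheels are only pulled in on Linux. A minimal sketch of parsing such a string and evaluating its marker with the `packaging` library (illustrative only; the builder scripts that actually consume this variable are not shown in this diff):

```python
from packaging.requirements import Requirement

raw = (
    "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux'"
)
for spec in raw.split(" | "):
    req = Requirement(spec)
    # marker.evaluate() checks the marker against the current interpreter/platform
    print(req.name, str(req.specifier), req.marker.evaluate())
```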

File diff suppressed because it is too large

View File

@ -60,7 +60,6 @@ jobs:
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-timeout-minutes: 300
build_name: manywheel-py3_10-rocm6_4
build_environment: linux-binary-manywheel-rocm
secrets:

View File

@ -57,7 +57,7 @@ on:
description: The list of configs used the benchmark
required: false
type: string
-default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
+default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,dynamo_eager_huggingface_perf,dynamo_eager_timm_perf,dynamo_eager_torchbench_perf,cachebench
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@ -97,18 +97,35 @@ jobs:
{ config: "inductor_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_huggingface_perf", shard: 1, num_shards: 5, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_huggingface_perf", shard: 2, num_shards: 5, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
+{ config: "dynamo_eager_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
]}
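Each matrix entry above names a benchmark config plus a (shard, num_shards) pair; the runner assigned to a shard only executes its slice of the work. A generic sketch of how a 1-based shard index could pick its slice of a test list (illustrative only; not the actual sharding logic used by the CI test scripts):

```python
def select_shard(tests, shard, num_shards):
    # shard is 1-based, matching the workflow matrix entries above
    return [t for i, t in enumerate(sorted(tests)) if i % num_shards == shard - 1]

# e.g. shard 2 of 6 over ten dummy test files
print(select_shard([f"test_{i:02d}" for i in range(10)], shard=2, num_shards=6))
```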

View File

@ -31,8 +31,6 @@ jobs:
if: github.repository_owner == 'pytorch'
name: Get changed files
uses: ./.github/workflows/_get-changed-files.yml
-with:
-all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }}
lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -55,7 +53,7 @@ jobs:
with:
timeout: 120
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
-docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter
+docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
@ -266,10 +264,10 @@ jobs:
with:
submodules: false
fetch-depth: 1
-- name: Setup Python 3.10
+- name: Setup Python 3.9
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
-python-version: '3.10'
+python-version: '3.9'
architecture: x64
cache: pip
- name: Install dependencies

View File

@ -1,46 +0,0 @@
name: operator_microbenchmark
on:
push:
tags:
- ciflow/op-benchmark/*
workflow_dispatch:
schedule:
# Run at 06:00 UTC everyday
- cron: 0 6 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
opmicrobenchmark-build:
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.0 9.0'
test-matrix: |
{ include: [
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" },
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
secrets: inherit
opmicrobenchmark-test:
name: opmicrobenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: opmicrobenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
secrets: inherit

View File

@ -127,8 +127,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
-# More memory is needed to build with asan
-runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

View File

@ -1,54 +0,0 @@
name: quantization-periodic
on:
push:
tags:
- ciflow/quantization-periodic/*
workflow_dispatch:
schedule:
# run weekly
- cron: "45 0 * * 0"
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-default-label-prefix:
name: get-default-label-prefix
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
periodic-quantization-build:
name: periodic-quantization-build
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.9'
test-matrix: |
{ include: [
{ config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
periodic-test-quantization:
name: periodic-test-quantization
uses: ./.github/workflows/_linux-test.yml
needs: periodic-quantization-build
with:
build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }}
secrets: inherit

View File

@ -140,8 +140,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
-# More memory is needed to build with asan
-runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

View File

@ -1,76 +0,0 @@
# B200 Smoke Tests CI Workflow
#
# This workflow runs smoke tests on B200 hardware
#
# Flow:
# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
# 2. Runs smoke tests on linux.dgx.b200 runner
# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function
#
# Triggered by:
# - Pull requests modifying this workflow file
# - Manual dispatch
# - Schedule (every 6 hours)
# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag)
name: B200 Smoke Tests
on:
pull_request:
paths:
- .github/workflows/test-b200.yml
workflow_dispatch:
schedule:
- cron: 0 4,10,16,22 * * * # every 6 hours
push:
tags:
- ciflow/b200/*
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-sm100-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
# config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm100-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit

View File

@ -53,3 +53,27 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit

.gitignore
View File

@ -82,7 +82,6 @@ torch/return_types.pyi
torch/nn/functional.pyi
torch/utils/data/datapipes/datapipe.pyi
torch/csrc/autograd/generated/*
-torch/csrc/functionalization/generated/*
torch/csrc/lazy/generated/*.[!m]*
torch_compile_debug/
# Listed manually because some files in this directory are not generated

View File

@ -49,7 +49,7 @@ init_command = [
'mccabe==0.7.0',
'pycodestyle==2.14.0',
'pyflakes==3.4.0',
-'torchfix==0.4.0 ; python_version >= "3.10" and python_version < "3.13"',
+'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
]
@ -153,7 +153,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
-'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
+'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"',
'numpy==2.1.0 ; python_version >= "3.12"',
'expecttest==0.3.0',
'mypy==1.16.0',
@ -196,7 +196,6 @@ exclude_patterns = [
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
-'tools/experimental/dynamic_shapes/torchfuzz/**',
]
command = [
'python3',
@ -1453,7 +1452,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'usort==1.0.8.post1',
'isort==6.0.1',
-'ruff==0.13.1', # sync with RUFF
+'ruff==0.12.9', # sync with RUFF
]
is_formatter = true
@ -1587,7 +1586,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
-'ruff==0.13.1', # sync with PYFMT
+'ruff==0.12.9', # sync with PYFMT
]
is_formatter = true

View File

@ -91,8 +91,6 @@ generated_cpu_cpp = [
"aten/src/ATen/NativeMetaFunctions.h", "aten/src/ATen/NativeMetaFunctions.h",
"aten/src/ATen/RegistrationDeclarations.h", "aten/src/ATen/RegistrationDeclarations.h",
"aten/src/ATen/VmapGeneratedPlumbing.h", "aten/src/ATen/VmapGeneratedPlumbing.h",
"aten/src/ATen/ViewMetaClasses.h",
"aten/src/ATen/ViewMetaClasses.cpp",
"aten/src/ATen/core/aten_interned_strings.h", "aten/src/ATen/core/aten_interned_strings.h",
"aten/src/ATen/core/enum_tag.h", "aten/src/ATen/core/enum_tag.h",
"aten/src/ATen/core/TensorBody.h", "aten/src/ATen/core/TensorBody.h",
@ -835,6 +833,36 @@ pybind_extension(
], ],
) )
cc_library(
name = "functorch",
hdrs = glob([
"functorch/csrc/dim/*.h",
]),
srcs = glob([
"functorch/csrc/dim/*.cpp",
]),
deps = [
":aten_nvrtc",
":torch_python",
"@pybind11",
],
)
pybind_extension(
name = "functorch/_C",
copts=[
"-DTORCH_EXTENSION_NAME=_C"
],
srcs = [
"functorch/csrc/init_dim_only.cpp",
],
deps = [
":functorch",
":torch_python",
":aten_nvrtc",
],
)
cc_binary( cc_binary(
name = "torch/bin/torch_shm_manager", name = "torch/bin/torch_shm_manager",
srcs = [ srcs = [
@ -875,6 +903,7 @@ py_library(
], ],
data = [ data = [
":torch/_C.so", ":torch/_C.so",
":functorch/_C.so",
":torch/bin/torch_shm_manager", ":torch/bin/torch_shm_manager",
], ],
) )
@ -1077,7 +1106,6 @@ test_suite(
"aten/src/ATen/templates/LazyNonNativeIr.h", "aten/src/ATen/templates/LazyNonNativeIr.h",
"aten/src/ATen/templates/RegisterDispatchKey.cpp", "aten/src/ATen/templates/RegisterDispatchKey.cpp",
"aten/src/ATen/templates/RegisterDispatchDefinitions.ini", "aten/src/ATen/templates/RegisterDispatchDefinitions.ini",
"aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp",
"aten/src/ATen/native/native_functions.yaml", "aten/src/ATen/native/native_functions.yaml",
"aten/src/ATen/native/tags.yaml", "aten/src/ATen/native/tags.yaml",
"aten/src/ATen/native/ts_native_functions.yaml", "aten/src/ATen/native/ts_native_functions.yaml",

View File

@ -1,4 +1,5 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR) cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)
# Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this
# sometimes makes XCode C compiler gets detected as "Clang", even when the C++ # sometimes makes XCode C compiler gets detected as "Clang", even when the C++
@ -442,7 +443,7 @@ if(WIN32)
message( message(
WARNING WARNING
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. " "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.51' to install libuv." "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
) )
else() else()
set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../) set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../)
@ -1390,6 +1391,10 @@ endif()
include(cmake/Summary.cmake) include(cmake/Summary.cmake)
caffe2_print_configuration_summary() caffe2_print_configuration_summary()
if(BUILD_FUNCTORCH)
add_subdirectory(functorch)
endif()
# Parse custom debug info # Parse custom debug info
if(DEFINED USE_CUSTOM_DEBINFO) if(DEFINED USE_CUSTOM_DEBINFO)
string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
@ -1481,4 +1486,4 @@ else()
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]]) ]])
endif() endif()
endif() endif()

View File

@ -1,61 +1,20 @@
# Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html # Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html
# Include individual top-level files # Include source files in SDist
include CITATION.cff include CMakeLists.txt
include CODEOWNERS include *.bzl *.bazel .bazel* BUILD *.BUILD BUILD.* WORKSPACE
include Dockerfile include BUCK BUCK.*
include LICENSE include requirements*.txt
include MANIFEST.in include version.txt
include Makefile include [Mm]akefile *.[Mm]akefile [Mm]akefile.*
include NOTICE include [Dd]ockerfile *.[Dd]ockerfile [Dd]ockerfile.* .dockerignore
include .bc-linter.yml
include .clang-format .clang-tidy
include .cmakelintrc
include .coveragerc
include .dockerignore
include .editorconfig
include .flake8
include .gdbinit
include .lintrunner.toml
include .lldbinit
include codex_setup.sh
include docker.Makefile
include pyrefly.toml
include ubsan.supp
# Include bazel and BUCK related files
include BUILD.bazel BUCK.oss
include WORKSPACE
include *.bzl
include .bazelignore .bazelrc .bazelversion
# Include general configuration files
include *.ini
# Include important top-level information
include *.md
# Include technical text files at the moment, comprises
# version.txt, CMakeLists.txt, requirements.txt
include *.txt
# Include ctags configuration
include .ctags.d/*.ctags
# Include subfolders completely
graft .devcontainer
graft .vscode
graft android graft android
graft aten graft aten
graft benchmarks
graft binaries graft binaries
graft c10 graft c10
graft caffe2 graft caffe2
graft cmake graft cmake
graft docs
graft functorch graft functorch
graft ios
graft mypy_plugins
graft scripts
graft test
graft third_party graft third_party
graft tools graft tools
graft torch graft torch
@ -63,37 +22,29 @@ graft torchgen
# FIXME: torch-xla build during codegen will fail if include this file in wheel # FIXME: torch-xla build during codegen will fail if include this file in wheel
exclude torchgen/BUILD.bazel exclude torchgen/BUILD.bazel
# The following exclusions omit parts from third-party dependencies that # Misc files and directories in SDist
# contain invalid symlinks[1] and that are not needed for pytorch, such as include *.md
# bindings for unused languages include CITATION.cff
prune third_party/flatbuffers/java include LICENSE NOTICE
prune third_party/flatbuffers/kotlin include mypy*.ini
prune third_party/ittapi/rust graft benchmarks
prune third_party/nccl/pkg/debian graft docs
prune third_party/opentelemetry-cpp/third_party/prometheus-cpp/cmake/project-import-* graft mypy_plugins
graft scripts
# The following document is also an invalid symlink[1] and superfluous
exclude third_party/flatbuffers/docs/source/CONTRIBUTING.md
# Omit autogenerated code
prune torchgen/packaged
# Omit caches, compiled, and scm related content
prune */__pycache__
prune **/.github
prune **/.gitlab
global-exclude *.o *.obj *.so *.dylib *.a *.pxd *.dll *.lib
global-exclude *.py[cod] *.swp *~
global-exclude .git .git-blame-ignore-revs .gitattributes .gitignore .gitmodules
global-exclude .gitlab-ci.yml
# Misc files needed for custom setuptools command # Misc files needed for custom setuptools command
include .gitignore include .gitignore
include .gitmodules include .gitmodules
# [1] Invalid symlinks for the purposes of Python source distributions are, # Include test suites in SDist
# according to the source distribution format[2] links pointing outside the graft test
# destination directory or links with a `..` component, which is those of include pytest.ini
# concern here. include .coveragerc
# [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features # Prune generated/compiled files
prune torchgen/packaged
prune */__pycache__
global-exclude *.o *.obj *.so *.a *.dylib *.pxd *.dll *.lib *.py[cod]
prune */.git
global-exclude .git *~ *.swp

View File

@ -161,7 +161,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)
#### Prerequisites
If you are installing from source, you will need:
-- Python 3.10 or later
+- Python 3.9 or later
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
- Visual Studio or Visual Studio Build Tool (Windows only)
@ -275,7 +275,7 @@ conda install pkg-config libuv
pip install mkl-static mkl-include
# Add these packages if torch.distributed is needed.
# Distributed package support on Windows is a prototype feature and is subject to changes.
-conda install -c conda-forge libuv=1.51
+conda install -c conda-forge libuv
```
#### Install PyTorch

View File

@ -317,20 +317,10 @@ IF(USE_FBGEMM_GENAI)
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
endif()
set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
-set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

View File

@ -401,13 +401,30 @@ T* toDLPackImpl(const Tensor& src) {
// The following code detects whether the src follows
// a continuous pattern. If the src follows such pattern (common-case)
// then we do not need to normalize the strides.
-bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1;
+bool need_normalize_strides = false;
+int64_t expected_stride = 1;
+for (int i = src.dim() - 1; i >= 0; i--) {
+// detect if we do not meet continuous pattern
+// and the size is 1, so there is opportunity to normalize
+if (src.stride(i) != expected_stride && src.size(i) == 1) {
+need_normalize_strides = true;
+break;
+}
+expected_stride *= src.size(i);
+}
// less common case, try normalizing the strides
if (need_normalize_strides) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
-view = src.as_strided(shape, {1}, src.storage_offset());
+auto strides = src.strides().vec();
+for (int i = 0; i < src.dim(); i++) {
+if (shape[i] < 2) {
+strides[i] = 1;
+}
+}
+view = src.as_strided(shape, strides, src.storage_offset());
}
ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
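As the comments in this hunk explain, the stride of a size-1 dimension carries no information, so it can be rewritten before exporting to DLPack whenever it breaks the expected contiguous pattern. A small Python sketch of the same idea, mirroring the loop-based variant shown on one side of the hunk (illustrative only, not the actual export path):

```python
import torch

def normalize_strides_for_dlpack(t: torch.Tensor) -> torch.Tensor:
    expected = 1
    need_normalize = False
    for i in range(t.dim() - 1, -1, -1):
        # a size-1 dim with an "off-pattern" stride is safe to rewrite
        if t.stride(i) != expected and t.size(i) == 1:
            need_normalize = True
            break
        expected *= t.size(i)
    if not need_normalize:
        return t
    strides = [1 if size < 2 else stride for size, stride in zip(t.shape, t.stride())]
    return t.as_strided(t.shape, strides, t.storage_offset())

v = torch.arange(6.).as_strided((1,), (3,))                   # size-1 dim, stride 3
print(v.stride(), normalize_strides_for_dlpack(v).stride())   # (3,) (1,)
```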

View File

@ -468,7 +468,7 @@ inline Tensor _sum_to(
// if we assume no reduction due to unbacked we ensure that at runtime.
TORCH_MAYBE_SYM_CHECK(
sym_eq(shape[i - leading_dims], sizes[i]),
-"non-reduction path was assumed due to unbacked symbols expected those two sizes to be the same:",
+"non-reduction path was assumed due to unabcked symbols expected those two sizes to be the same:",
shape[i - leading_dims],
", ",
sizes[i])

View File

@ -9,6 +9,11 @@
namespace at::functionalization {
+ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
+if (out_idx == this->out_index) return *this;
+return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx);
+}
// Note [Functionalization: Alias Removal Part 2]
// See Note [Functionalization: Alias Removal] for more details.
// This function applies a single update from one of the views to the StorageImpl.
@ -37,12 +42,12 @@ namespace at::functionalization {
static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) {
at::Tensor t = update.new_val;
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
-if (update.view_metas.empty()) { return t; }
+if (update.view_metas.empty()) return t;
std::vector<at::Tensor> tmp_values({base});
tmp_values.reserve(update.view_metas.size());
for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
-at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back());
+at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
// NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided
// All of these ops require additional information to recover the sizes of the original tensor.
// If need to, we could probably apply this optimization and only bother computing tmp_values
@ -50,8 +55,9 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co
tmp_values.push_back(std::move(next_view));
}
for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) {
+int64_t out_idx = update.view_metas[i].out_index;
// Each view inverse is implemented in ViewInverses.cpp.
-t = update.view_metas[i]->reverse(tmp_values[i], t);
+t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
}
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
return t;
@ -105,13 +111,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
}
-void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<std::shared_ptr<ViewMeta>>& metas) {
+void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) {
TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage");
if (metas.size() > 1) {
for (size_t i = 1; i < metas.size(); ++i) {
// Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI
-TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided,
+TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided,
"During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i,
" was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today,"
"so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you "

View File

@ -8,89 +8,44 @@ namespace at::functionalization {
// See Note [Functionalization Pass In Core] // See Note [Functionalization Pass In Core]
enum class InverseReturnMode {
/// Specifies that functional inverses should always return a view.
AlwaysView,
/// Specifies that functional inverses should always return a non-view / copy.
NeverView,
/// Specifies that functional inverses should return a view unless a (copying)
/// scatter
/// inverse exists, in which case that will be used instead.
/// This avoids as_strided() calls that can be difficult for subclasses to
/// handle.
ViewOrScatterInverse,
};
#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \
static const char* name() { \
return #TYPE; \
}
#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \
using SerializableTuple = std::tuple<__VA_ARGS__>
// ViewMeta is a class used by the functionalization pass to navigate between // ViewMeta is a class used by the functionalization pass to navigate between
// a base tensor and a view tensor. // a base tensor and a view tensor.
// For example, if I call `b = a.view1(...)` // For example, if I call `b = a.view1(...)`
// the functionalization pass will generate and store a ViewMeta specialization // the functionalization pass will generate and store a ViewMeta on b that looks
// for `view1` operation on b that looks like: // like:
// //
// struct TORCH_API view1_ViewMeta : public ViewMeta { // ViewMeta(
// FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta); // [<captures>](const Tensor& base, int64_t mutated_view_idx) {
// FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( // return base.view1(...);
// bool /* reapply_views */, // },
// const std::vector<int64_t>&); // [<captures>](const at::Tensor& base, const at::Tensor& mutated_view,
// // int64_t mutated_view_idx) -> at::Tensor {
// view1_ViewMeta(const SerializableTuple& tpl) // return at::functionalization::impl::view1_inverse(base, mutated_view,
// : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} // ...);
//
// view1_ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
// : ViewMeta(/*has_symbolic_inputs=*/false),
// reapply_views(reapply_views),
// size(size) {}
//
// Tensor forward(const Tensor& base) override {
// return base.view1(...);
// } // }
// //
// Tensor reverse(const Tensor& base, const Tensor& mutated_view) override { // The forward_fn lambda describes how to replay view1 on a tensor.
// return at::functionalization::impl::view1_inverse(base, mutated_view,
// ...);
// }
// //
// SerializableTuple to_serializable_tuple() { // The reverse_fn lambda describes how, given a tensor that is already a view,
// return std::make_tuple(reapply_views, size);
// }
//
// bool reapply_views;
// std::vector<int64_t> size;
// };
//
// The forward function describes how to replay view1 on a tensor.
//
// The reverse function describes how, given a tensor that is already a view,
// how to get the corresponding base tensor. See Note [Functionalization Pass: // how to get the corresponding base tensor. See Note [Functionalization Pass:
// View Inverses] for details. // View Inverses] for details.
//
// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type
// representing the `ViewMeta` instance state. Methods that take in/return such
// a type are used for supporting pickle serialization.
struct ViewMeta { struct ViewMeta {
ViewMeta( ViewMeta(
std::function<Tensor(const Tensor&, int64_t)> forward,
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
bool has_symbolic_inputs, bool has_symbolic_inputs,
bool is_multi_output = false, bool is_multi_output = false,
bool is_as_strided = false, bool is_as_strided = false,
int64_t out_idx = 0) int64_t out_idx = 0)
: out_index(out_idx), : forward_fn(std::move(forward)),
reverse_fn(std::move(reverse)),
out_index(out_idx),
is_multi_output(is_multi_output), is_multi_output(is_multi_output),
is_as_strided(is_as_strided), is_as_strided(is_as_strided),
has_symbolic_inputs(has_symbolic_inputs) {} has_symbolic_inputs(has_symbolic_inputs) {}
virtual ~ViewMeta() = default; std::function<Tensor(const Tensor&, int64_t)> forward_fn;
std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
virtual Tensor forward(const Tensor& base) = 0;
virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0;
// See Note [out_idx in ViewMeta] // See Note [out_idx in ViewMeta]
int64_t out_index; int64_t out_index;
@ -102,17 +57,10 @@ struct ViewMeta {
// Tells us if this view operation has any symbolic inputs // Tells us if this view operation has any symbolic inputs
bool has_symbolic_inputs; bool has_symbolic_inputs;
// Returns a new ViewMeta with the same forward/reverse // Returns a copy of the current ViewMeta, if out_idx matches the current
// out_index. Otherwise, returns a new ViewMeta with the same forward/reverse
// functions, but a new out index. // functions, but a new out index.
// ViewMeta to_out_idx(int64_t out_idx);
// This method should be implemented by those `ViewMeta` that have more than
// one output.
virtual std::shared_ptr<ViewMeta> to_out_index(int64_t out_index) {
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"ViewMeta::to_out_index not implemented. ",
"Likely because there's only one output.");
}
}; };
// FunctionalStorageImpl is a subclass of StorageImpl used by the // FunctionalStorageImpl is a subclass of StorageImpl used by the
@ -145,14 +93,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
// NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
const at::Tensor new_val; const at::Tensor new_val;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
const std::vector<std::shared_ptr<ViewMeta>> view_metas; const std::vector<ViewMeta> view_metas;
}; };
explicit FunctionalStorageImpl(const Tensor& value); explicit FunctionalStorageImpl(const Tensor& value);
void add_update( void add_update(
const Tensor& updated_val, const Tensor& updated_val,
const std::vector<std::shared_ptr<ViewMeta>>& view_metas); const std::vector<ViewMeta>& view_metas);
bool apply_updates(); bool apply_updates();
const Tensor& base() { const Tensor& base() {
return base_; return base_;

View File

@ -129,19 +129,17 @@ void FunctionalTensorWrapper::freeze_storage() const {
// - view_value: The output tensor that we need to wrap. // - view_value: The output tensor that we need to wrap.
// - base: The "base" of the view that `view_value` was generated from. // - base: The "base" of the view that `view_value` was generated from.
// See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
FunctionalTensorWrapper::FunctionalTensorWrapper( FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta)
const Tensor& view_value, : c10::TensorImpl(
const FunctionalTensorWrapper* base, c10::DispatchKeySet(DispatchKey::Functionalize),
const std::shared_ptr<functionalization::ViewMeta>& meta) view_value.dtype(),
: c10::TensorImpl( base->storage().data_ptr().device()
c10::DispatchKeySet(DispatchKey::Functionalize), ),
view_value.dtype(), value_(view_value),
base->storage().data_ptr().device()), is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
value_(view_value), was_storage_changed_(base->was_storage_changed_),
is_multi_output_view_( is_symbolic_(base->is_symbolic_)
base->is_multi_output_view_ || meta->is_multi_output), {
was_storage_changed_(base->was_storage_changed_),
is_symbolic_(base->is_symbolic_) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_));
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
set_constructor_metadata(); set_constructor_metadata();
@ -150,10 +148,11 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(
view_metas_ = base->view_metas_; // copy view_metas_ = base->view_metas_; // copy
} }
view_metas_.push_back(meta); view_metas_.push_back(meta);
maybe_mark_symbolic(meta.get()); maybe_mark_symbolic(meta);
storage_ = base->storage_; // alias this tensor's storage with the base tensor's storage_ = base->storage_; // alias this tensor's storage with the base tensor's
} }
functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const { functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl()); return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
} }
@ -177,18 +176,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
} }
// See Note [Functionalization Pass - Inplace View Ops] // See Note [Functionalization Pass - Inplace View Ops]
void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr<at::functionalization::ViewMeta>& meta) { void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
view_metas_.push_back(meta); view_metas_.push_back(meta);
// Manually track the fact that this tensor received a metadata mutation! // Manually track the fact that this tensor received a metadata mutation!
has_metadata_mutation_ = true; has_metadata_mutation_ = true;
// Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
maybe_mark_symbolic(meta.get()); maybe_mark_symbolic(meta);
// Note [Functionalization Pass - Inplace View Ops] // Note [Functionalization Pass - Inplace View Ops]
// So, these ops are special - they're mutation AND view ops. They get special codegen. // So, these ops are special - they're mutation AND view ops. They get special codegen.
// An example is transpose_, e.g. `a.transpose_()` // An example is transpose_, e.g. `a.transpose_()`
// Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas. // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
at::AutoDispatchSkipFunctionalize guard; at::AutoDispatchSkipFunctionalize guard;
value_ = meta->forward(value_); value_ = meta.forward_fn(value_, meta.out_index);
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
} }
@ -369,8 +368,15 @@ void FunctionalTensorWrapper::sync_() {
regenerate_from_base(); regenerate_from_base();
} }
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& FunctionalTensorWrapper::view_metas() const { Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) {
return view_metas_; auto t = base;
// Reapply views to get the viewed tensor from the base in alias_
for (auto& view_meta: view_metas_) {
t = view_meta.forward_fn(t, view_meta.out_index);
}
return t;
} }
void FunctionalTensorWrapper::regenerate_from_base() { void FunctionalTensorWrapper::regenerate_from_base() {
@ -379,7 +385,7 @@ void FunctionalTensorWrapper::regenerate_from_base() {
auto t = storage_impl->base(); auto t = storage_impl->base();
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_); t = apply_view_metas(t);
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
replace_(t, /*from_lazy_regenerate=*/true); replace_(t, /*from_lazy_regenerate=*/true);
@ -721,11 +727,11 @@ bool isFunctionalTensor(const std::optional<Tensor>& t) {
} }
bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) { bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
if (t_list.empty()) { return false; } if (t_list.empty()) return false;
auto functional_count = 0; auto functional_count = 0;
for (const auto i : c10::irange(t_list.size())) { for (const auto i : c10::irange(t_list.size())) {
auto const & e= t_list[i]; auto const & e= t_list[i];
if (!e.has_value() || !e->defined()) { continue; } if (!e.has_value() || !e->defined()) continue;
if (isFunctionalTensor(e)) { if (isFunctionalTensor(e)) {
++functional_count; ++functional_count;
} }
@ -735,10 +741,10 @@ bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
template <typename T> template <typename T>
static bool isFunctionalTensorIListRef(c10::IListRef<T> list) { static bool isFunctionalTensorIListRef(c10::IListRef<T> list) {
if (list.size() == 0) { return false; } if (list.size() == 0) return false;
auto functional_count = 0; auto functional_count = 0;
for (const auto& tensor : list) { for (const auto& tensor : list) {
if (!tensor.defined()) { continue; } if (!tensor.defined()) continue;
if (isFunctionalTensor(tensor)) { if (isFunctionalTensor(tensor)) {
++functional_count; ++functional_count;
} }
@ -756,28 +762,20 @@ void freeze_functional_tensor(const Tensor& tensor) {
functional_base_impl->freeze_storage(); functional_base_impl->freeze_storage();
} }
Tensor create_functional_tensor_with_view_meta( Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
const at::Tensor& view_to_wrap,
const at::Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta,
int64_t out_idx) {
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
auto meta_ = meta;
if (out_idx != 0) { if (out_idx != 0) {
// Note [out_idx in ViewMeta] // Note [out_idx in ViewMeta]
// When a view op outputs multiple tensors, each output needs its own separate ViewMeta. // When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
// Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function. // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
meta_ = meta->to_out_index(out_idx); meta = meta.to_out_idx(out_idx);
} }
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta_); return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
} }
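
Note [out_idx in ViewMeta] above says that a multi-output view op (split, chunk, unbind, ...) replays the same op description once per output, with the out index selecting which output the forward function reproduces. A hedged sketch of that idea on plain vectors; `SplitMeta` is a hypothetical stand-in, not the generated ViewMeta subclass.

// Sketch: the same op description, replayed with a different out_index,
// reproduces a different output of the multi-output view op.
#include <cassert>
#include <cstdint>
#include <vector>

struct SplitMeta {
  int64_t split_size;
  int64_t out_index;  // which chunk of the split this meta reproduces

  std::vector<int> forward(const std::vector<int>& base) const {
    auto begin = base.begin() + out_index * split_size;
    return std::vector<int>(begin, begin + split_size);
  }
};

int main() {
  std::vector<int> base{0, 1, 2, 3, 4, 5};
  SplitMeta chunk0{/*split_size=*/2, /*out_index=*/0};
  SplitMeta chunk2 = chunk0;
  chunk2.out_index = 2;  // in spirit, meta->to_out_index(2)
  assert(chunk0.forward(base) == (std::vector<int>{0, 1}));
  assert(chunk2.forward(base) == (std::vector<int>{4, 5}));
}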
std::vector<Tensor> create_functional_tensor_with_view_meta( std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) {
ITensorListRef view_to_wrap,
const at::Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta) {
std::vector<Tensor> outputs(view_to_wrap.size()); std::vector<Tensor> outputs(view_to_wrap.size());
int64_t i = 0; int64_t i = 0;
for (const auto& tensor : view_to_wrap) { for (const auto& tensor : view_to_wrap) {
@ -787,22 +785,12 @@ std::vector<Tensor> create_functional_tensor_with_view_meta(
return outputs; return outputs;
} }
void mutate_view_meta(const at::Tensor& self, const std::shared_ptr<functionalization::ViewMeta>& meta) { void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) {
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
self_impl->mutate_view_meta(meta); self_impl->mutate_view_meta(meta);
} }
Tensor apply_view_meta_sequence(
const Tensor& base,
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence) {
Tensor r = base;
for (auto& vm : sequence) {
r = vm->forward(r);
}
return r;
}
// Note [Propagating strides in the functionalization pass] // Note [Propagating strides in the functionalization pass]
// In order to properly compute stride information, the functionalization pass // In order to properly compute stride information, the functionalization pass
// calls each {view} reference implementation with meta tensors. // calls each {view} reference implementation with meta tensors.
@ -896,7 +884,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s
const auto& ivalue = returns[idx]; const auto& ivalue = returns[idx];
if (ivalue.isTensor()) { if (ivalue.isTensor()) {
const auto& t = ivalue.toTensor(); const auto& t = ivalue.toTensor();
if (!t.defined()) { continue; } if (!t.defined()) continue;
at::functionalization::impl::sync(t); at::functionalization::impl::sync(t);
auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new; (*stack)[returns_begin + idx] = t_new;

View File

@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
explicit FunctionalTensorWrapper( explicit FunctionalTensorWrapper(
const Tensor& view_value, const Tensor& view_value,
const FunctionalTensorWrapper* base, const FunctionalTensorWrapper* base,
const std::shared_ptr<functionalization::ViewMeta>& meta); const functionalization::ViewMeta& meta);
// Get the underlying, actual tensor, that doesn't know anything about // Get the underlying, actual tensor, that doesn't know anything about
// functionalization. // functionalization.
@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
->are_all_mutations_under_no_grad_or_inference_mode(); ->are_all_mutations_under_no_grad_or_inference_mode();
} }
void maybe_mark_symbolic(functionalization::ViewMeta* meta) { void maybe_mark_symbolic(const functionalization::ViewMeta& meta) {
is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs; is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs;
} }
bool is_symbolic() const { bool is_symbolic() const {
return is_symbolic_; return is_symbolic_;
} }
// Retrieves the ViewMeta sequence of this tensor. // Runs the forward_fn of every ViewMeta collected in the current instance
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& view_metas() // to some other base.
const; Tensor apply_view_metas(const Tensor& base);
// Sync's the underlying tensor with its alias, if it's out of date. This // Sync's the underlying tensor with its alias, if it's out of date. This
// involves two steps: 1) Apply any pending updates/mutations to the alias 2) // involves two steps: 1) Apply any pending updates/mutations to the alias 2)
@ -146,8 +146,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
// from the base tensor. This method is used by inplace-view ops like // from the base tensor. This method is used by inplace-view ops like
// transpose_. It appends a ViewMeta to the existing stack, and refreshes the // transpose_. It appends a ViewMeta to the existing stack, and refreshes the
// tensor by replaying the views off of the alias. // tensor by replaying the views off of the alias.
void mutate_view_meta( void mutate_view_meta(const at::functionalization::ViewMeta& meta);
const std::shared_ptr<at::functionalization::ViewMeta>& meta);
// Custom implementation of self.set_(src) // Custom implementation of self.set_(src)
void set__impl(const FunctionalTensorWrapper* other); void set__impl(const FunctionalTensorWrapper* other);
@ -286,7 +285,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
bool is_symbolic_ = false; bool is_symbolic_ = false;
size_t generation_ = 0; size_t generation_ = 0;
std::vector<std::shared_ptr<at::functionalization::ViewMeta>> view_metas_; std::vector<at::functionalization::ViewMeta> view_metas_;
protected: protected:
static void copy_tensor_metadata( static void copy_tensor_metadata(
@ -378,20 +377,16 @@ TORCH_API void propagate_xla_data_direct(
Tensor create_functional_tensor_with_view_meta( Tensor create_functional_tensor_with_view_meta(
const Tensor& view_to_wrap, const Tensor& view_to_wrap,
const Tensor& base, const Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta, functionalization::ViewMeta meta,
int64_t out_idx = 0); int64_t out_idx = 0);
std::vector<Tensor> create_functional_tensor_with_view_meta( std::vector<Tensor> create_functional_tensor_with_view_meta(
ITensorListRef view_to_wrap, ITensorListRef view_to_wrap,
const Tensor& base, const Tensor& base,
const std::shared_ptr<functionalization::ViewMeta>& meta); const functionalization::ViewMeta& meta);
void mutate_view_meta( void mutate_view_meta(
const Tensor& self, const Tensor& self,
const std::shared_ptr<functionalization::ViewMeta>& meta); const functionalization::ViewMeta& meta);
TORCH_API Tensor apply_view_meta_sequence(
const Tensor& base,
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence);
void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
void set_sizes_strides_offset( void set_sizes_strides_offset(

View File

@ -1,5 +1,3 @@
#include <ATen/FunctionalizeFallbackKernel.h>
#include <ATen/core/dispatch/Dispatcher.h> #include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/LegacyTypeDispatch.h> #include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/EmptyTensor.h> #include <ATen/EmptyTensor.h>
@ -9,6 +7,7 @@
#include <torch/library.h> #include <torch/library.h>
#include <c10/util/irange.h> #include <c10/util/irange.h>
#include <c10/util/strides.h> #include <c10/util/strides.h>
#include <ATen/EmptyTensor.h>
#ifndef AT_PER_OPERATOR_HEADERS #ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/ATen.h> #include <ATen/ATen.h>
@ -29,31 +28,6 @@
#include <utility> #include <utility>
#endif #endif
namespace at::functionalization {
Tensor resize__ViewMeta::forward(const Tensor& base) {
if (reapply_views) {
return base.as_strided(size, c10::contiguous_strides(size));
} else {
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
}
}
Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
return base.as_strided_scatter(
mutated_view, size, c10::contiguous_strides(size));
}
Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) {
return at::_unsafe_view_symint(base, size);
}
Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
}
} // namespace at::functionalization
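
The removed `resize__ViewMeta::forward`/`reverse` pair (reintroduced below as lambdas) illustrates the invariant every ViewMeta provides: `forward` produces the view from the base, and `reverse` scatters a possibly mutated view back into a full-sized base. A small round-trip sketch using the public libtorch ops, assuming libtorch is available; it is illustrative, not the functionalization kernel itself.

// forward: the "resized" tensor is an as_strided slice of the base storage.
// reverse: as_strided_scatter writes the mutated slice back into the base.
#include <torch/torch.h>
#include <iostream>
#include <vector>

int main() {
  auto base = torch::arange(12.0);                       // 12-element storage
  std::vector<int64_t> size{2, 3};                       // the "resized" shape
  std::vector<int64_t> stride{3, 1};                     // contiguous strides for {2, 3}
  auto view = base.as_strided(size, stride);             // forward: resize as a slice
  auto mutated = view + 100;                             // pretend the view was written to
  auto new_base = base.as_strided_scatter(mutated, size, stride);  // reverse
  std::cout << new_base << std::endl;                    // first 6 values +100, rest unchanged
}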
namespace { namespace {
void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) {
const auto& schema = op.schema(); const auto& schema = op.schema();
@ -132,9 +106,7 @@ namespace {
const auto& ivalue = returns[idx]; const auto& ivalue = returns[idx];
if (ivalue.isTensor() && should_wrap_outputs) { if (ivalue.isTensor() && should_wrap_outputs) {
const auto& t = ivalue.toTensor(); const auto& t = ivalue.toTensor();
if (!t.defined()) { if (!t.defined()) continue;
continue;
}
auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t)); auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
(*stack)[returns_begin + idx] = t_new; (*stack)[returns_begin + idx] = t_new;
} else if (ivalue.isTensorList() && should_wrap_outputs) { } else if (ivalue.isTensorList() && should_wrap_outputs) {
@ -197,8 +169,19 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch
// The output of resizing is equivalent to taking a slice of a larger tensor. // The output of resizing is equivalent to taking a slice of a larger tensor.
// We have to emulate this "slicing" with an as_strided call. // We have to emulate this "slicing" with an as_strided call.
auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS();
auto view_meta = std::make_shared<at::functionalization::resize__ViewMeta>( at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
reapply_views, size.vec()); [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
if (reapply_views) {
return base.as_strided(size, c10::contiguous_strides(size));
} else {
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
}
},
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size));
},
/*has_symbolic_inputs=*/false
);
at::functionalization::impl::mutate_view_meta(self, view_meta); at::functionalization::impl::mutate_view_meta(self, view_meta);
return self; return self;
} }
@ -317,11 +300,17 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt
tmp_output = at::_unsafe_view_symint(self_, size); tmp_output = at::_unsafe_view_symint(self_, size);
} }
bool has_symbolic_inputs = std::any_of( bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
auto view_meta = at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
std::make_shared<at::functionalization::_unsafe_view_ViewMeta>( [size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
has_symbolic_inputs, size.vec()); return at::_unsafe_view_symint(base, size);
},
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
},
/*has_symbolic_inputs=*/has_symbolic_inputs
);
auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta)); auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta));
// See Note [Propagating strides in the functionalization pass] // See Note [Propagating strides in the functionalization pass]

View File

@ -1,58 +0,0 @@
#pragma once
#include <ATen/FunctionalStorageImpl.h>
namespace at::functionalization {
// `ViewMeta` implementation for `resize_` operation.
struct TORCH_API resize__ViewMeta : public ViewMeta {
FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta)
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
bool /* reapply_views */,
const std::vector<int64_t>&);
resize__ViewMeta(const SerializableTuple& tpl)
: resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
resize__ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
: ViewMeta(/*has_symbolic_inputs=*/false),
reapply_views(reapply_views),
size(size) {}
Tensor forward(const Tensor& base) override;
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
SerializableTuple to_serializable_tuple() {
return std::make_tuple(reapply_views, size);
}
bool reapply_views;
std::vector<int64_t> size;
};
// `ViewMeta` implementation for `_unsafe_view` operation.
struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta {
FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta)
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
bool /* has_symbolic_inputs */,
const std::vector<c10::SymInt>&);
_unsafe_view_ViewMeta(const SerializableTuple& tpl)
: _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
_unsafe_view_ViewMeta(
bool has_symbolic_inputs,
const std::vector<c10::SymInt>& size)
: ViewMeta(has_symbolic_inputs), size(size) {}
Tensor forward(const Tensor& base) override;
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
SerializableTuple to_serializable_tuple() {
return std::make_tuple(has_symbolic_inputs, size);
}
std::vector<c10::SymInt> size;
};
} // namespace at::functionalization

View File

@ -45,39 +45,7 @@ inline void infer_size_impl(
} }
} }
if (infer_dim) { auto set_infer_dim = [&]() {
// numel is the product of the known sizes; it has to be divisible by newsize,
// and newsize should be positive unless newsize == numel (we throw a
// different error message in that case).
if constexpr (std::is_same_v<NumelType, c10::SymInt>) {
auto v = newsize.maybe_as_int();
if (v and *v == 0) {
// Avoid div by 0 when sym_eq(numel % newsize, 0) is constructed!
// which may happen when newsize is not a symbol! if its a symbol
// division won't happen anyway during compile.
TORCH_MAYBE_SYM_CHECK(
numel == newsize,
"shape '",
shape,
"' is invalid for input of size ",
numel);
} else {
auto cond = sym_gt(newsize, 0)
.sym_and(sym_eq(numel % newsize, 0))
.sym_or(sym_eq(numel, newsize));
TORCH_MAYBE_SYM_CHECK(
cond, "shape '", shape, "' is invalid for input of size ", numel);
}
} else {
TORCH_CHECK(
(newsize > 0 && (numel % newsize == 0)) || numel == newsize,
"shape '",
shape,
"' is invalid for input of size ",
numel);
}
// We have a degree of freedom here to select the dimension size; follow // We have a degree of freedom here to select the dimension size; follow
// NumPy semantics and just bail. However, a nice error message is needed // NumPy semantics and just bail. However, a nice error message is needed
// because users often use `view` as a way to flatten & unflatten // because users often use `view` as a way to flatten & unflatten
@ -86,15 +54,19 @@ inline void infer_size_impl(
// works yet // works yet
// empty_tensor.view(-1, 0) // empty_tensor.view(-1, 0)
// doesn't. // doesn't.
TORCH_MAYBE_SYM_CHECK( TORCH_CHECK(
newsize != 0, newsize != 0,
"cannot reshape tensor of 0 elements into shape ", "cannot reshape tensor of 0 elements into shape ",
shape, shape,
" because the unspecified dimension size -1 can be any " " because the unspecified dimension size -1 can be any "
"value and is ambiguous"); "value and is ambiguous");
res[*infer_dim] = numel / newsize; res[*infer_dim] = numel / newsize;
return; return;
};
if (infer_dim && newsize > 0 && numel % newsize == 0) {
set_infer_dim();
return;
} }
TORCH_MAYBE_SYM_CHECK( TORCH_MAYBE_SYM_CHECK(
@ -103,6 +75,9 @@ inline void infer_size_impl(
shape, shape,
"' is invalid for input of size ", "' is invalid for input of size ",
numel); numel);
if (infer_dim) {
set_infer_dim();
}
} }
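
The reordering above does not change the inference rule itself: with a single -1 in the requested shape, the product of the known sizes must divide numel (and be nonzero), and the inferred dimension becomes numel / newsize. A standalone, integer-only sketch of that rule (no SymInt handling and no error-message parity with the real check):

#include <cstdint>
#include <iostream>
#include <optional>
#include <stdexcept>
#include <vector>

std::vector<int64_t> infer_size_sketch(std::vector<int64_t> shape, int64_t numel) {
  std::optional<size_t> infer_dim;
  int64_t newsize = 1;  // product of the explicitly given sizes
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      if (infer_dim) throw std::runtime_error("only one dimension can be inferred");
      infer_dim = i;
    } else {
      newsize *= shape[i];
    }
  }
  if (infer_dim) {
    if (newsize == 0 || numel % newsize != 0)
      throw std::runtime_error("shape is invalid for input size");
    shape[*infer_dim] = numel / newsize;  // the inferred dimension
  } else if (newsize != numel) {
    throw std::runtime_error("shape is invalid for input size");
  }
  return shape;
}

int main() {
  auto s = infer_size_sketch({2, -1, 3}, 24);
  std::cout << s[1] << std::endl;  // 4
}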
inline std::vector<int64_t> infer_size(IntArrayRef shape, int64_t numel) { inline std::vector<int64_t> infer_size(IntArrayRef shape, int64_t numel) {

View File

@ -1,22 +1,32 @@
#include <ATen/core/PythonOpRegistrationTrampoline.h> #include <ATen/core/PythonOpRegistrationTrampoline.h>
#include <c10/core/impl/PyInterpreterHooks.h>
// TODO: delete this
namespace at::impl { namespace at::impl {
c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::interpreter_ = nullptr; // The strategy is that all python interpreters attempt to register themselves
// as the main interpreter, but only one wins. Only that interpreter is
// allowed to interact with the C++ dispatcher. Furthermore, when we execute
// logic on that interpreter, we do so hermetically, never setting pyobj field
// on Tensor.
std::atomic<c10::impl::PyInterpreter*>
PythonOpRegistrationTrampoline::interpreter_{nullptr};
c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() { c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() {
return c10::impl::getGlobalPyInterpreter(); return PythonOpRegistrationTrampoline::interpreter_.load();
} }
bool PythonOpRegistrationTrampoline::registerInterpreter( bool PythonOpRegistrationTrampoline::registerInterpreter(
c10::impl::PyInterpreter* interp) { c10::impl::PyInterpreter* interp) {
if (interpreter_ != nullptr) { c10::impl::PyInterpreter* expected = nullptr;
interpreter_.compare_exchange_strong(expected, interp);
if (expected != nullptr) {
// This is the second (or later) Python interpreter, which means we need
// non-trivial hermetic PyObject TLS
c10::impl::HermeticPyObjectTLS::init_state();
return false; return false;
} else {
return true;
} }
interpreter_ = interp;
return true;
} }
} // namespace at::impl } // namespace at::impl
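
The new implementation swaps the plain pointer for an atomic compare-and-swap, so that when several Python interpreters race to register, exactly one becomes the main interpreter. A minimal sketch of that pattern with a dummy `Interp` type (not the real `c10::impl::PyInterpreter`):

#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

struct Interp { int id; };

std::atomic<Interp*> g_main{nullptr};

// Returns true only for the first caller; everyone else loses the race.
bool register_interpreter(Interp* me) {
  Interp* expected = nullptr;
  return g_main.compare_exchange_strong(expected, me);
}

int main() {
  std::vector<Interp> interps{{0}, {1}, {2}, {3}};
  std::atomic<int> winners{0};
  std::vector<std::thread> threads;
  for (auto& interp : interps) {
    threads.emplace_back([&winners, p = &interp] {
      if (register_interpreter(p)) ++winners;
    });
  }
  for (auto& t : threads) t.join();
  std::cout << "winners=" << winners << " main id=" << g_main.load()->id << "\n";  // winners=1
}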

View File

@ -2,21 +2,19 @@
#include <ATen/core/dispatch/Dispatcher.h> #include <ATen/core/dispatch/Dispatcher.h>
// TODO: We can get rid of this // TODO: this can probably live in c10
namespace at::impl { namespace at::impl {
// Manages the single Python interpreter instance for PyTorch.
class TORCH_API PythonOpRegistrationTrampoline final { class TORCH_API PythonOpRegistrationTrampoline final {
static c10::impl::PyInterpreter* interpreter_; static std::atomic<c10::impl::PyInterpreter*> interpreter_;
public: public:
// Register the Python interpreter. Returns true on first registration, // Returns true if you successfully registered yourself (that means
// false if an interpreter was already registered. // you are in the hot seat for doing the operator registrations!)
static bool registerInterpreter(c10::impl::PyInterpreter*); static bool registerInterpreter(c10::impl::PyInterpreter*);
// Returns the registered interpreter via the global PyInterpreter hooks.
// Returns nullptr if no interpreter has been registered yet. // Returns nullptr if no interpreter has been registered yet.
static c10::impl::PyInterpreter* getInterpreter(); static c10::impl::PyInterpreter* getInterpreter();
}; };

View File

@ -149,105 +149,5 @@ static inline void pack_vnni4(
#endif #endif
} }
// This is a helper function for transpose_pack_vnni4
// Transform a [4, 16] block (with incontiguous output)
// Src:
// a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16
// b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 b16
// c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16
// d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 d16
// Dst:
// a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4
// a5 a6 a7 a8 b5 b6 b7 b8 c5 c6 c7 c8 d5 d6 d7 d8
// a9 a10 a11 a12 b9 b10 b11 b12 c9 c10 c11 c12 d9 d10 d11 d12
// a13 a14 a15 a16 b13 b14 b15 b16 c13 c14 c15 c16 d13 d14 d15 d16
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
static inline void transpose_vnni4_pad_4x16_block(
const scalar_t* src,
scalar_t* dst,
int64_t ld_src,
int64_t ld_dst,
int krem = 4) {
#if defined(CPU_CAPABILITY_AVX512)
__m128i r[4];
for (int i = 0; i < krem; ++i) {
r[i] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * ld_src));
}
for (int i = krem; i < 4; ++i) {
r[i] = _mm_setzero_si128();
}
// Transpose 4x16 bytes using unpack and shuffle
__m128i t0 = _mm_unpacklo_epi32(r[0], r[1]);
__m128i t1 = _mm_unpackhi_epi32(r[0], r[1]);
__m128i t2 = _mm_unpacklo_epi32(r[2], r[3]);
__m128i t3 = _mm_unpackhi_epi32(r[2], r[3]);
__m128i r0 = _mm_unpacklo_epi64(t0, t2);
__m128i r1 = _mm_unpackhi_epi64(t0, t2);
__m128i r2 = _mm_unpacklo_epi64(t1, t3);
__m128i r3 = _mm_unpackhi_epi64(t1, t3);
// Store output
if (krem == 4) {
// normal case
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), r0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst), r1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 2), r2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 3), r3);
} else {
// masked case
__mmask16 mask = (1ULL << (krem * 4)) - 1;
_mm_mask_storeu_epi8(dst, mask, r0);
_mm_mask_storeu_epi8(reinterpret_cast<__m128i*>(dst + ld_dst), mask, r1);
_mm_mask_storeu_epi8(
reinterpret_cast<__m128i*>(dst + ld_dst * 2), mask, r2);
_mm_mask_storeu_epi8(
reinterpret_cast<__m128i*>(dst + ld_dst * 3), mask, r3);
}
#else
TORCH_CHECK(
false,
"transpose_vnni4_pad_4x16_block is only supported when AVX-512 is supported")
#endif
}
// Do the transpose packing fusion with VNNI4
// Reorder [K, N] → [N/4, K, 4] (VNNI4-style layout for bit8)
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
static inline void transpose_pack_vnni4(
const scalar_t* src,
scalar_t* dst,
int64_t ld_src,
int64_t K,
int64_t N) {
#if defined(CPU_CAPABILITY_AVX512)
TORCH_CHECK(
N % 16 == 0, "N needs to be multiple of 16 for transpose_pack_vnni4");
int64_t bk = 0;
int64_t _K = K / 4 * 4;
for (; bk < _K; bk += 4) {
int64_t bn = 0;
for (; bn < N; bn += 16) {
transpose_vnni4_pad_4x16_block(
src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4);
}
}
// Handle leftover K rows (< 4)
if (K % 4 != 0) {
int krem = K - bk;
int64_t bn = 0;
for (; bn < N; bn += 16) {
transpose_vnni4_pad_4x16_block(
src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4, krem);
}
}
#else
TORCH_CHECK(
false, "transpose_pack_vnni4 is only supported when AVX-512 is supported")
#endif
}
} // namespace CPU_CAPABILITY } // namespace CPU_CAPABILITY
} // namespace at::vec } // namespace at::vec
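
The removed kernel implements the [K, N] → [N/4, K, 4] reorder drawn in the comment with AVX-512, 4x16 bytes at a time, zero-padding the K remainder. Below is a scalar reference of the same indexing (no intrinsics, no padding), useful only to pin down where each element lands; `transpose_pack_vnni4_ref` is a hypothetical name for this sketch.

// Scalar reference: dst viewed as [N/4, K, 4] satisfies dst[n/4][k][n%4] = src[k][n].
#include <cassert>
#include <cstdint>
#include <vector>

void transpose_pack_vnni4_ref(const int8_t* src, int8_t* dst,
                              int64_t ld_src, int64_t K, int64_t N) {
  assert(N % 4 == 0);
  for (int64_t n = 0; n < N; ++n)
    for (int64_t k = 0; k < K; ++k)
      dst[(n / 4) * K * 4 + k * 4 + (n % 4)] = src[k * ld_src + n];
}

int main() {
  const int64_t K = 3, N = 8;
  std::vector<int8_t> src(K * N), dst(K * N, 0);
  for (int64_t i = 0; i < K * N; ++i) src[i] = static_cast<int8_t>(i);
  transpose_pack_vnni4_ref(src.data(), dst.data(), /*ld_src=*/N, K, N);
  // src[k=1][n=5] (= 13) must land at dst[n/4=1][k=1][n%4=1].
  assert(dst[1 * K * 4 + 1 * 4 + 1] == 13);
  return 0;
}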

View File

@ -1637,7 +1637,9 @@ bool gemm_and_bias(
if (activation == GEMMAndBiasActivationEpilogue::RELU) { if (activation == GEMMAndBiasActivationEpilogue::RELU) {
epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
} else if (activation == GEMMAndBiasActivationEpilogue::GELU) { } else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
#if CUDA_VERSION >= 11040 || defined(USE_ROCM)
epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
#endif
} }
if (bias != nullptr) { if (bias != nullptr) {
@ -1929,6 +1931,7 @@ void scaled_gemm(
bool use_fast_accum) { bool use_fast_accum) {
// Note: see `cublasCommonArgs` for various non-intuitive manipulations // Note: see `cublasCommonArgs` for various non-intuitive manipulations
// of input arguments to this function. // of input arguments to this function.
#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
const auto computeType = CUBLAS_COMPUTE_32F; const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F; const auto scaleType = CUDA_R_32F;
const float alpha_val = 1.0; const float alpha_val = 1.0;
@ -2130,6 +2133,8 @@ void scaled_gemm(
" scaleType ", " scaleType ",
scaleType); scaleType);
return; return;
#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM)
TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above");
} }
void int8_gemm( void int8_gemm(

View File

@ -281,9 +281,6 @@ bool CUDAHooks::compiledWithMIOpen() const {
bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
#if AT_CUDNN_ENABLED() #if AT_CUDNN_ENABLED()
if (!hasCUDA()) {
return false;
}
// NOTE: extra parenthesis around numbers disable clang warnings about // NOTE: extra parenthesis around numbers disable clang warnings about
// dead code // dead code
return true; return true;
@ -294,9 +291,6 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const {
bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const {
#if AT_CUDNN_ENABLED() #if AT_CUDNN_ENABLED()
if (!hasCUDA()) {
return false;
}
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
// Check for Volta cores // Check for Volta cores
if (prop->major >= 7) { if (prop->major >= 7) {
@ -311,9 +305,6 @@ bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const {
bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const { bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
#if AT_CUDNN_ENABLED() #if AT_CUDNN_ENABLED()
if (!hasCUDA()) {
return false;
}
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
// Check for Volta cores // Check for Volta cores
if (prop->major >= 8) { if (prop->major >= 8) {

View File

@ -465,11 +465,8 @@ inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor
return false; return false;
} }
auto is_channel_last = [](const at::Tensor& t) { auto fmt = input.suggest_memory_format();
auto fmt = t.suggest_memory_format(); return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d;
return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d;
};
return is_channel_last(input) || is_channel_last(weight);
} }
} // namespace at::native } // namespace at::native
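
The rewritten helper now probes the suggested memory format of both input and weight instead of input alone. A tiny libtorch sketch (assuming libtorch is available) of what that probe reports for contiguous versus channels-last tensors:

#include <torch/torch.h>
#include <iostream>

int main() {
  auto nchw = torch::randn({8, 3, 32, 32});                          // default contiguous layout
  auto nhwc = nchw.contiguous(at::MemoryFormat::ChannelsLast);       // NHWC strides
  std::cout << (nchw.suggest_memory_format() == at::MemoryFormat::ChannelsLast) << "\n";  // 0
  std::cout << (nhwc.suggest_memory_format() == at::MemoryFormat::ChannelsLast) << "\n";  // 1
}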

View File

@ -32,6 +32,10 @@
#include <ATen/native/mkldnn/Utils.h> #include <ATen/native/mkldnn/Utils.h>
#endif #endif
#ifdef USE_MPS
#include <ATen/mps/MPSDevice.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS #ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h> #include <ATen/Functions.h>
#include <ATen/NativeFunctions.h> #include <ATen/NativeFunctions.h>
@ -406,23 +410,11 @@ struct ConvParams {
// cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
// that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how) // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
#if !defined(C10_MOBILE) #if !defined(C10_MOBILE)
if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) { if (!detail::getCUDAHooks().compiledWithCuDNN()) {
return false; return false;
} }
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
// broken on cuDNN 9.8
if (cudnn_version >= 90800) {
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
(input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
weight.dim() == 5) {
for (int i = 2; i < weight.dim(); i++) {
if (weight.size(i) != 1) {
return false;
}
}
}
}
if (needs_64bit_indexing_no_split(input, weight)) { if (needs_64bit_indexing_no_split(input, weight)) {
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
" if the V8 API is not enabled or before cuDNN version 9.3+." " if the V8 API is not enabled or before cuDNN version 9.3+."
@ -430,6 +422,9 @@ struct ConvParams {
return false; return false;
} }
} }
if (!input.is_cuda() || !cudnn_enabled) {
return false;
}
if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) {
if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) { if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) {
return false; return false;
@ -448,19 +443,16 @@ struct ConvParams {
// Use cudnn for FP16 depthwise convolutions // Use cudnn for FP16 depthwise convolutions
bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const { bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const {
if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) { if (!detail::getCUDAHooks().compiledWithCuDNN()) {
return false; return false;
} }
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) {
// always use cudnn_depthwise for channels_last format
return true;
}
// native kernel doesn't support 64-bit non-splittable case // native kernel doesn't support 64-bit non-splittable case
if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
// TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
if (cudnn_version < 0 || cudnn_version > 91000) {
return false;
}
}
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
" if the V8 API is not enabled or before cuDNN version 9.3+." " if the V8 API is not enabled or before cuDNN version 9.3+."
@ -470,10 +462,6 @@ struct ConvParams {
return true; return true;
} }
} }
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
// always use cudnn_depthwise for channels_last format
return true;
}
if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) {
bool kernel_cond = (use_cudnn(input, weight) && bool kernel_cond = (use_cudnn(input, weight) &&
input.scalar_type() == kHalf && // only for FP16 input.scalar_type() == kHalf && // only for FP16
@ -1441,8 +1429,12 @@ static inline at::MemoryFormat determine_backend_memory_format(
} }
break; break;
case ConvBackend::Mps: case ConvBackend::Mps:
case ConvBackend::MpsTranspose:
if (mps_conv_use_channels_last(input, weight)) { if (mps_conv_use_channels_last(input, weight)) {
#ifdef USE_MPS
if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) {
break;
}
#endif
backend_memory_format = (k == 5) ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast; backend_memory_format = (k == 5) ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast;
} }
break; break;

View File

@ -9,7 +9,6 @@
#include <ATen/native/TransposeType.h> #include <ATen/native/TransposeType.h>
#include <ATen/native/Unfold3d.h> #include <ATen/native/Unfold3d.h>
#include <c10/util/irange.h> #include <c10/util/irange.h>
#include <c10/util/safe_numerics.h>
#ifndef AT_PER_OPERATOR_HEADERS #ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h> #include <ATen/Functions.h>
@ -175,23 +174,6 @@ static inline void slow_conv3d_shape_check(
const int64_t input_height = input.size(dim_height); const int64_t input_height = input.size(dim_height);
const int64_t input_width = input.size(dim_width); const int64_t input_width = input.size(dim_width);
constexpr int64_t MAX_SAFE_PAD = (1LL << 61);
TORCH_CHECK_VALUE(
pad_height <= MAX_SAFE_PAD,
"Padding height too large: pad_height=",
pad_height);
TORCH_CHECK_VALUE(
pad_width <= MAX_SAFE_PAD,
"Padding width too large: pad_width=",
pad_width);
TORCH_CHECK_VALUE(
pad_depth <= MAX_SAFE_PAD,
"Padding depth too large: pad_depth=",
pad_depth);
const int64_t exact_input_depth = input_depth + 2 * pad_depth; const int64_t exact_input_depth = input_depth + 2 * pad_depth;
const int64_t exact_input_height = input_height + 2 * pad_height; const int64_t exact_input_height = input_height + 2 * pad_height;
const int64_t exact_input_width = input_width + 2 * pad_width; const int64_t exact_input_width = input_width + 2 * pad_width;
@ -239,14 +221,6 @@ static inline void slow_conv3d_shape_check(
output_width, output_width,
"). Output size is too small"); "). Output size is too small");
uint64_t kernel_product;
TORCH_CHECK(
!c10::mul_overflows(kernel_height, kernel_width, &kernel_product),
"Kernel height x width product is too large: kernel_height=",
kernel_height,
", kernel_width=",
kernel_width);
if (weight.defined()) { if (weight.defined()) {
int64_t n_input_plane = weight.size(1); int64_t n_input_plane = weight.size(1);
if (weight.dim() == 2) { if (weight.dim() == 2) {

View File

@ -97,38 +97,43 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
int64_t nDims = self.dim(); int64_t nDims = self.dim();
TORCH_CHECK(nDims >= 2, "dimensions must larger than 1"); TORCH_CHECK(nDims >= 2, "dimensions must larger than 1");
auto height = self.sym_size(0); int64_t height = self.size(0);
auto width = self.sym_size(1); int64_t width = self.size(1);
if (nDims > 2) { if (nDims > 2) {
int64_t dim1 = height;
for (const auto i : c10::irange(1, nDims)) { for (const auto i : c10::irange(1, nDims)) {
if (self.sym_size(i) != height) { if (self.size(i) != dim1) {
TORCH_CHECK(false, "all dimensions of input must be of equal length"); TORCH_CHECK(false, "all dimensions of input must be of equal length");
} }
} }
} }
auto storage_offset = self.sym_storage_offset(); int64_t storage_offset = self.storage_offset();
auto size = std::min(height, width); std::vector<int64_t> sizes;
std::vector<int64_t> strides;
int64_t size = std::min(height, width);
int64_t stride = 0; int64_t stride = 0;
for (const auto i : c10::irange(nDims)) { for (const auto i : c10::irange(nDims)) {
stride += self.stride(i); stride += self.stride(i);
} }
std::vector<SymInt> strides{stride}; strides.push_back(stride);
std::vector<SymInt> sizes{size}; sizes.push_back(size);
auto main_diag = self.as_strided_symint(sizes, strides, storage_offset); auto main_diag = self.as_strided(sizes, strides, storage_offset);
main_diag.fill_(fill_value); main_diag.fill_(fill_value);
if (wrap && nDims == 2 && height > width + 1) { if (wrap && nDims == 2 && height > width + 1) {
auto step = width + 1; std::vector<int64_t> wrap_sizes;
auto wrap_size = ((self.numel() + step - 1) / step) - size;
std::vector<SymInt> wrap_sizes{wrap_size};
auto offset = self.stride(0) * (width + 1); int64_t step = width + 1;
int64_t wrap_size = ((self.numel() + step - 1) / step) - size;
wrap_sizes.push_back(wrap_size);
auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset); int64_t offset = self.stride(0) * (width + 1);
auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset);
wrap_diag.fill_(fill_value); wrap_diag.fill_(fill_value);
} }
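
The rewrite keeps the underlying trick: summing the strides of all dimensions yields a single step that walks the main diagonal, so a 1-D as_strided view of length min(height, width) covers exactly the diagonal and can be filled in place. A small libtorch sketch of that view (public as_strided, not the SymInt-aware internals):

#include <torch/torch.h>
#include <algorithm>
#include <iostream>

int main() {
  auto t = torch::zeros({4, 4});
  int64_t stride = 0;
  for (int64_t i = 0; i < t.dim(); ++i) stride += t.stride(i);       // 4 + 1 = 5
  int64_t size = std::min(t.size(0), t.size(1));
  // A length-4 view stepping by 5 elements lands on t[0][0], t[1][1], ...
  t.as_strided({size}, {stride}, t.storage_offset()).fill_(7);
  std::cout << t << std::endl;  // 7s on the main diagonal, zeros elsewhere
}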

View File

@ -23,7 +23,6 @@
#include <ATen/ops/linspace.h> #include <ATen/ops/linspace.h>
#endif #endif
#include <cmath>
#include <numeric> #include <numeric>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
@ -203,46 +202,6 @@ select_outer_bin_edges(const Tensor& input, std::optional<c10::ArrayRef<double>>
return std::make_pair(leftmost_edges, rightmost_edges); return std::make_pair(leftmost_edges, rightmost_edges);
} }
/* Bin edge correction based on the precision of the representation.
* To maintain backward compatibility we take max(std::nextafter<>, +1)
* and min(std::nextafter<>, -1) for floating-point scalar types; for other types, +/- 1 as usual.
*/
void bins_edges_correction(const ScalarType& t, double &leftmost_edge, double &rightmost_edge)
{
#define UPDATE_WITH_LIMIT(real_type, scalartype) \
case ScalarType::scalartype: \
leftmost_edge = std::min( \
static_cast<double>( \
std::nexttoward( \
static_cast<real_type>(leftmost_edge), \
std::numeric_limits<real_type>::lowest() \
) \
), \
leftmost_edge - 1. \
); \
rightmost_edge = std::max( \
static_cast<double>( \
std::nexttoward( \
static_cast<real_type>(rightmost_edge), \
std::numeric_limits<real_type>::max() \
) \
), \
rightmost_edge + 1. \
); \
break;
switch (t) {
UPDATE_WITH_LIMIT(double, Double)
UPDATE_WITH_LIMIT(float, Float)
default:
// Fallback to the default behavior for other types
leftmost_edge -= 1;
rightmost_edge += 1;
}
#undef UPDATE_WITH_LIMIT
}
/* histc's version of the logic for outermost bin edges. /* histc's version of the logic for outermost bin edges.
*/ */
std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input, std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input,
@ -257,7 +216,8 @@ std::pair<double, double> histc_select_outer_bin_edges(const Tensor& input,
} }
if (leftmost_edge == rightmost_edge) { if (leftmost_edge == rightmost_edge) {
bins_edges_correction(input.dtype().toScalarType(), leftmost_edge, rightmost_edge); leftmost_edge -= 1;
rightmost_edge += 1;
} }
TORCH_CHECK(!(std::isinf(leftmost_edge) || std::isinf(rightmost_edge) || TORCH_CHECK(!(std::isinf(leftmost_edge) || std::isinf(rightmost_edge) ||

View File

@ -23,6 +23,8 @@ Tensor& max_unpooling2d_forward_out_cpu(
// Nondeterministic with duplicate indices // Nondeterministic with duplicate indices
at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out");
auto oheight = output_size[0];
auto owidth = output_size[1];
TORCH_CHECK( TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long, indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices_.scalar_type()); "elements in indices should be type int64 but got: ", indices_.scalar_type());
@ -43,9 +45,6 @@ Tensor& max_unpooling2d_forward_out_cpu(
self_.sizes(), " with dimension ", i , " being empty."); self_.sizes(), " with dimension ", i , " being empty.");
} }
auto oheight = output_size[0];
auto owidth = output_size[1];
auto memory_format = self_.suggest_memory_format(); auto memory_format = self_.suggest_memory_format();
auto self = self_.contiguous(memory_format); auto self = self_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format);

View File

@ -1,5 +1,3 @@
#include <ATen/core/ATen_fwd.h>
#include <c10/core/ScalarType.h>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/AccumulateType.h> #include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h> #include <ATen/Dispatch.h>
@ -1880,18 +1878,19 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {
Tensor xtensor = self.expand(padded_size); Tensor xtensor = self.expand(padded_size);
Tensor urtensor; Tensor result;
if (self.is_quantized()) { if (self.is_quantized()) {
urtensor = at::empty_quantized(target_size, self); result = at::empty_quantized(target_size, self);
} else { } else {
urtensor = at::empty(target_size, self.options()); result = at::empty(target_size, self.options());
} }
// return an empty tensor if one of the repeat dimensions is zero // return an empty tensor if one of the repeat dimensions is zero
if (zero_tensor) { if (zero_tensor) {
return urtensor; return result;
} }
Tensor urtensor = at::alias(result);
for (const auto i : c10::irange(xtensor.dim())) { for (const auto i : c10::irange(xtensor.dim())) {
// can't unfold with step 0, so make sure step is at least 1 // can't unfold with step 0, so make sure step is at least 1
// (it doesn't matter what it is in that case, because the size is 0). // (it doesn't matter what it is in that case, because the size is 0).
@ -1901,22 +1900,7 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {
urtensor.copy_(xtensor.expand_as(urtensor)); urtensor.copy_(xtensor.expand_as(urtensor));
// Combine the dimensions to produce the target_size. return result;
// xtensor dims: [a0, ..., ad-1]
// urtensor dims: [a0, ..., ad-1, b0, ..., bd-1]
// b dims are produced by unfold.
// Transform urtensor to [a0 * b0, ..., ad-1 * bd-1]
const int64_t n_dims = xtensor.dim();
auto range_a = at::arange(xtensor.dim(), at::TensorOptions(at::kLong));
auto range_b = range_a + n_dims;
auto stacked = stack({std::move(range_a), std::move(range_b)}, 1).flatten();
auto permutation = IntArrayRef(stacked.data_ptr<int64_t>(), n_dims * 2);
// Permute from [a0, ..., ad-1, b0, ..., bd-1] to [a0, b0, ..., ad-1, bd-1]
urtensor = urtensor.permute(permutation);
// Reshape from [a0, b0, ..., ad-1, bd-1] to [a0 * b0, ..., ad-1 * bd-1]
urtensor = urtensor.reshape(target_size);
return urtensor;
} }
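
The removed comment describes how repeat's output dimensions are combined: repeat counts and original sizes live in separate dimensions that get interleaved and then merged pairwise. The sketch below reproduces that bookkeeping on a concrete 2-D case with expand/permute/reshape rather than the unfold-based buffer view used here, and checks it against Tensor::repeat (assuming libtorch is available):

// Interleave repeat dims with original dims, then merge pairs to get the tiled shape.
#include <torch/torch.h>
#include <iostream>

int main() {
  auto x = torch::arange(6).reshape({2, 3});       // sizes a0=2, a1=3
  const int64_t b0 = 2, b1 = 2;                    // repeat counts
  auto u = x.unsqueeze(-1).unsqueeze(-1)           // [a0, a1, 1, 1]
            .expand({2, 3, b0, b1});               // [a0, a1, b0, b1]
  auto repeated = u.permute({2, 0, 3, 1})          // [b0, a0, b1, a1]
                   .reshape({2 * b0, 3 * b1});     // [b0*a0, b1*a1]
  std::cout << repeated << std::endl;
  std::cout << x.repeat({b0, b1}) << std::endl;    // same values
}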
Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) { Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) {

View File

@ -999,41 +999,12 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
dtypes[i] = iter.dtype(i); dtypes[i] = iter.dtype(i);
} }
auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter); auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
#ifdef USE_ROCM
constexpr int grp_sz = 128;
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
if (unrl) {
auto offsets0 = offset_calc.get(idx);
auto offsets1 = offset_calc.get(idx + grp_sz);
auto offsets2 = offset_calc.get(idx + grp_sz * 2);
auto offsets3 = offset_calc.get(idx + grp_sz * 3);
void* out0 = data[0] + offsets0[0];
void* out1 = data[0] + offsets1[0];
void* out2 = data[0] + offsets2[0];
void* out3 = data[0] + offsets3[0];
arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1);
arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1);
arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1);
arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out0, result0);
c10::cast_and_store<arg0_t>(dtypes[0], out1, result1);
c10::cast_and_store<arg0_t>(dtypes[0], out2, result2);
c10::cast_and_store<arg0_t>(dtypes[0], out3, result3);
} else {
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out, result);
}
});
#else
launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
auto offsets = offset_calc.get(idx); auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0]; void* out = data[0] + offsets[0];
arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out, result); c10::cast_and_store<arg0_t>(dtypes[0], out, result);
}); });
#endif
} }
} }
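
The removed ROCm branch manually unrolls the elementwise loop by four (group size 128), computing four offset sets and four results per step before falling back to the one-element path. A plain-CPU sketch of the same unroll-by-4 shape with a scalar tail, illustrative only:

#include <cassert>
#include <cstddef>
#include <numeric>
#include <vector>

template <typename F>
void apply_unrolled4(std::vector<float>& out, const std::vector<float>& in, F f) {
  const size_t n = in.size();
  size_t i = 0;
  for (; i + 4 <= n; i += 4) {   // unrolled body: four independent evaluations
    out[i + 0] = f(in[i + 0]);
    out[i + 1] = f(in[i + 1]);
    out[i + 2] = f(in[i + 2]);
    out[i + 3] = f(in[i + 3]);
  }
  for (; i < n; ++i) {           // scalar tail for the remainder
    out[i] = f(in[i]);
  }
}

int main() {
  std::vector<float> in(10), out(10);
  std::iota(in.begin(), in.end(), 0.f);
  apply_unrolled4(out, in, [](float x) { return 2.f * x; });
  assert(out[9] == 18.f);
}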

View File

@ -42,19 +42,6 @@ void bfloat16_copy_kernel_cuda(TensorIteratorBase &iter) {
}); });
} }
#ifdef USE_ROCM
void bfloat16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) {
gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::BFloat16 value) {
return static_cast<float>(value);
});
}
void float16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) {
gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::Half value) {
return static_cast<float>(value);
});
}
#endif
void float8_copy_kernel_cuda(TensorIteratorBase &iter) { void float8_copy_kernel_cuda(TensorIteratorBase &iter) {
ScalarType dtype = iter.dtype(0); ScalarType dtype = iter.dtype(0);
ScalarType other_dtype = iter.dtype(1); ScalarType other_dtype = iter.dtype(1);
@ -200,17 +187,7 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) {
} else { } else {
float16_copy_kernel_cuda(iter); float16_copy_kernel_cuda(iter);
} }
} } else if (isBitsType(dtype)) {
#ifdef USE_ROCM
else if ((iter.dtype(1) == kBFloat16 || iter.dtype(1) == kHalf) && dtype == kFloat) {
if (iter.dtype(1) == kBFloat16) {
bfloat16tofloat32_copy_kernel_cuda(iter);
} else {
float16tofloat32_copy_kernel_cuda(iter);
}
}
#endif
else if (isBitsType(dtype)) {
TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting " TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting "
"bits types to different bits types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype); "bits types to different bits types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype);
AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] { AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] {

View File

@ -125,6 +125,8 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
TORCH_CHECK( TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long, indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices_.scalar_type()); "elements in indices should be type int64 but got: ", indices_.scalar_type());
auto oheight = output_size[0];
auto owidth = output_size[1];
TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2}, TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2},
indices_arg{indices_, "indices_", 3}; indices_arg{indices_, "indices_", 3};
@ -147,9 +149,6 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
output_size.size() == 2, output_size.size() == 2,
"There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements."); "There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements.");
auto oheight = output_size[0];
auto owidth = output_size[1];
int64_t dimw = 2; int64_t dimw = 2;
int64_t dimh = 1; int64_t dimh = 1;
int64_t numBatch = 1; int64_t numBatch = 1;
@ -218,6 +217,9 @@ static void max_unpooling3d_shape_check(
IntArrayRef stride, IntArrayRef stride,
IntArrayRef padding, IntArrayRef padding,
const char *fn_name) { const char *fn_name) {
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
TORCH_CHECK( TORCH_CHECK(
indices.scalar_type() == at::ScalarType::Long, indices.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices.scalar_type()); "elements in indices should be type int64 but got: ", indices.scalar_type());
@ -248,10 +250,6 @@ static void max_unpooling3d_shape_check(
"strides should be greater than zero, but got stride: ", "strides should be greater than zero, but got stride: ",
stride); stride);
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
int dimw = 3; int dimw = 3;
int dimh = 2; int dimh = 2;
int dimt = 1; int dimt = 1;
@ -404,6 +402,8 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
const Tensor& indices_, const Tensor& indices_,
IntArrayRef output_size, IntArrayRef output_size,
Tensor& grad_input) { Tensor& grad_input) {
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
TORCH_CHECK( TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long, indices_.scalar_type() == at::ScalarType::Long,
@ -426,9 +426,6 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size()); TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size());
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
int64_t nInputCols, nInputRows, nInputPlane; int64_t nInputCols, nInputRows, nInputPlane;
int dimw = 2; int dimw = 2;
@ -508,14 +505,13 @@ at::Tensor& max_unpooling3d_backward_out_cuda(const Tensor& grad_output_,
IntArrayRef padding, IntArrayRef padding,
Tensor& grad_input) { Tensor& grad_input) {
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
max_unpooling3d_shape_check(
self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");
int64_t oT = output_size[0]; int64_t oT = output_size[0];
int64_t oH = output_size[1]; int64_t oH = output_size[1];
int64_t oW = output_size[2]; int64_t oW = output_size[2];
max_unpooling3d_shape_check(
self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");
int batchSize = 0; int batchSize = 0;
int inputSlices = 0; int inputSlices = 0;
int inputTime = 0; int inputTime = 0;

View File

@ -300,6 +300,8 @@ void nonzero_static_cuda_out_impl(
int64_t size, int64_t size,
int64_t fill_value, int64_t fill_value,
Tensor& out) { Tensor& out) {
#if defined(CUDA_VERSION) || defined(USE_ROCM)
Tensor self_contiguous_ = self.contiguous(); Tensor self_contiguous_ = self.contiguous();
// see comment in nonzero_cuda_out_impl on reqs for out // see comment in nonzero_cuda_out_impl on reqs for out
bool out_correct_size = bool out_correct_size =
@ -375,6 +377,9 @@ void nonzero_static_cuda_out_impl(
if (need_to_copy) { if (need_to_copy) {
out.copy_(out_temp); out.copy_(out_temp);
} }
#else
TORCH_CHECK(false, "Nonzero_static is not supported for cuda <= 11.4");
#endif
} }
Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) { Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) {

View File

@ -221,9 +221,22 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_
std::optional<CuFFTConfig> uncached_plan; std::optional<CuFFTConfig> uncached_plan;
const CuFFTConfig * config = nullptr; const CuFFTConfig * config = nullptr;
// Workaround for gh-63152, gh-58724
// Bluestein plans in CUDA 11.1 (cufft 10.3) cannot be re-used
// Bluestein's algorithm is only used when a size has large prime factors, so // Bluestein's algorithm is only used when a size has large prime factors, so
// sizes with only small prime factors can still be cached. // sizes with only small prime factors can still be cached.
if (plan_cache.max_size() > 0) { bool use_caching = true;
#ifdef CUFFT_VERSION
if constexpr (10300 <= CUFFT_VERSION && CUFFT_VERSION < 10400) {
// Only cache plans for transforms with small prime factors
use_caching = std::none_of(
signal_size.begin() + 1, signal_size.end(), [](int64_t dim_size) {
return has_large_prime_factor(dim_size);
});
}
#endif
if (use_caching && plan_cache.max_size() > 0) {
guard.lock(); guard.lock();
if (plan_cache.max_size() > 0) { // check again after acquiring the lock if (plan_cache.max_size() > 0) { // check again after acquiring the lock
config = &plan_cache.lookup(Params); config = &plan_cache.lookup(Params);
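
The restored workaround only caches a plan when every transformed size is free of large prime factors, since cuFFT 10.3 Bluestein plans cannot be reused. Below is a sketch of the kind of test `has_large_prime_factor` performs (the real helper is defined elsewhere in this file and may differ): cuFFT has fast radix kernels for 2, 3, 5 and 7, so any other surviving factor forces Bluestein.

#include <cstdint>
#include <iostream>

bool has_large_prime_factor_sketch(int64_t n) {
  for (int64_t radix : {2, 3, 5, 7}) {
    while (n % radix == 0) n /= radix;  // strip the fast radices
  }
  return n != 1;  // something other than 2/3/5/7 survived
}

int main() {
  std::cout << has_large_prime_factor_sketch(1024) << "\n";    // 0: pure radix-2
  std::cout << has_large_prime_factor_sketch(3 * 11) << "\n";  // 1: factor 11 needs Bluestein
}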

View File

@ -1238,7 +1238,7 @@ Tensor _cholesky_solve_helper_cuda_magma(const Tensor& self, const Tensor& A, bo
// Todo: cusolverDn<T>potrsBatched only supports nrhs == 1 and does not have good performance. // Todo: cusolverDn<T>potrsBatched only supports nrhs == 1 and does not have good performance.
// Batched cholesky_solve is dispatched to magma. // Batched cholesky_solve is dispatched to magma.
Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) { Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) {
#if defined(USE_LINALG_SOLVER) #if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
auto preferred_backend = at::globalContext().linalgPreferredBackend(); auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) { switch (preferred_backend) {
case at::LinalgBackend::Cusolver: case at::LinalgBackend::Cusolver:
@ -1352,7 +1352,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
} }
static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) { static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) {
#if defined(USE_LINALG_SOLVER) #if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
auto preferred_backend = at::globalContext().linalgPreferredBackend(); auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) { switch (preferred_backend) {
case at::LinalgBackend::Cusolver: case at::LinalgBackend::Cusolver:
@ -2709,7 +2709,7 @@ void linalg_lstsq_gels(const Tensor& A, const Tensor& B, const Tensor& /*infos*/
} }
void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) { void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) {
#if defined(USE_LINALG_SOLVER) #if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
auto preferred_backend = at::globalContext().linalgPreferredBackend(); auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) { switch (preferred_backend) {
case at::LinalgBackend::Magma: case at::LinalgBackend::Magma:
@ -2733,7 +2733,7 @@ void lstsq_kernel(const Tensor& a, Tensor& b, Tensor& /*rank*/, Tensor& /*singul
// first handle the underdetermined case (m < n) // first handle the underdetermined case (m < n)
// this case is not supported by MAGMA or cuBLAS // this case is not supported by MAGMA or cuBLAS
if (m < n) { if (m < n) {
#if defined(USE_LINALG_SOLVER) #if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
linalg_lstsq_gels(a, b, infos); linalg_lstsq_gels(a, b, infos);
#else #else
TORCH_CHECK( TORCH_CHECK(

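The hunks above share one pattern: a preprocessor guard decides whether the cuSOLVER path is compiled at all (the two sides of the diff differ only on whether ROCm builds are excluded), and a runtime switch on the preferred linear-algebra backend picks between cuSOLVER and MAGMA. A stripped-down sketch of that layered dispatch, with hypothetical solve_cusolver/solve_magma stand-ins for the real kernels:

#include <cstdio>

enum class LinalgBackend { Default, Cusolver, Magma };

static void solve_magma()    { std::puts("magma path"); }
#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
static void solve_cusolver() { std::puts("cusolver path"); }
#endif

// Compile-time guard: without USE_LINALG_SOLVER (or, on one side of the diff,
// on ROCm builds) only the MAGMA path exists. Runtime switch: the preferred
// backend wins, otherwise a default is chosen.
void solve(LinalgBackend preferred) {
#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
  switch (preferred) {
    case LinalgBackend::Magma:
      solve_magma();
      break;
    case LinalgBackend::Cusolver:
    default:
      solve_cusolver();
  }
#else
  (void)preferred;
  solve_magma();
#endif
}
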
View File

@ -14,7 +14,7 @@ struct EmbeddingBagParams {
::c10::metal::array<idx_type_t, 2> output_strides; ::c10::metal::array<idx_type_t, 2> output_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides; ::c10::metal::array<idx_type_t, 2> max_indices_strides;
idx_type_t per_sample_weights_stride; idx_type_t per_sample_weights_strides;
idx_type_t num_indices; idx_type_t num_indices;
idx_type_t num_bags; idx_type_t num_bags;

View File

@ -23,72 +23,54 @@ struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
template <EmbeddingBagMode M, typename T> template <EmbeddingBagMode M, typename T>
struct ReductionOp { struct ReductionOp {
inline opmath_t<T> operator()( inline opmath_t<T> operator()(
opmath_t<T> weight_val, T weight_val,
opmath_t<T> out_val, opmath_t<T> out_val,
bool is_first) { uint32_t per_sample_weights_index,
return weight_val + out_val; constant T* per_sample_weights,
uint32_t per_sample_weights_strides);
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_strides) {
if (per_sample_weights_strides) {
T per_sample_weight = per_sample_weights
[per_sample_weights_strides * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) *
static_cast<opmath_t<T>>(weight_val) +
out_val;
} else {
return static_cast<opmath_t<T>>(weight_val) + out_val;
}
}
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::MEAN, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t,
constant T*,
uint32_t) {
return static_cast<opmath_t<T>>(weight_val) + out_val;
} }
}; };
template <typename T> template <typename T>
struct ReductionOp<EmbeddingBagMode::MAX, T> { struct ReductionOp<EmbeddingBagMode::MAX, T> {
inline opmath_t<T> operator()( inline opmath_t<T> operator()(
opmath_t<T> weight_val, T weight_val,
opmath_t<T> out_val, opmath_t<T> out_val,
bool is_first) { uint32_t,
return (is_first || weight_val > out_val) ? weight_val : out_val; constant T*,
} uint32_t) {
}; return max(static_cast<opmath_t<T>>(weight_val), out_val);
template <EmbeddingBagMode M, typename T>
struct MaybeApplyPerSampleWeight {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_stride) {
return weight_val;
}
};
template <typename T>
struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
opmath_t<T> weight_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_stride) {
if (per_sample_weights_stride) {
T per_sample_weight = per_sample_weights
[per_sample_weights_stride * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) * weight_val;
} else {
return weight_val;
}
}
};
template <EmbeddingBagMode M, typename T, typename I>
struct MaybeCalcMaxIndex {
inline void operator()(
opmath_t<T> weight_val,
opmath_t<T> out_val,
bool is_first,
thread I& max_idx,
I weight_idx,
bool pad) {}
};
template <typename T, typename I>
struct MaybeCalcMaxIndex<EmbeddingBagMode::MAX, T, I> {
inline void operator()(
opmath_t<T> weight_val,
opmath_t<T> out_val,
bool is_first,
thread I& max_idx,
I weight_idx,
bool pad) {
max_idx = !pad && (is_first || weight_val > out_val) ? weight_idx : max_idx;
} }
}; };
@ -114,30 +96,6 @@ struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
} }
}; };
template <EmbeddingBagMode M, typename I>
struct MaybeWriteMaxIndex {
inline void operator()(
device I*,
const constant ::c10::metal::array<uint32_t, 2>&,
uint32_t,
uint32_t,
I) {}
};
template <typename I>
struct MaybeWriteMaxIndex<EmbeddingBagMode::MAX, I> {
inline void operator()(
device I* max_indices,
const constant ::c10::metal::array<uint32_t, 2>& max_indices_strides,
uint32_t bag_idx,
uint32_t feature_idx,
I max_idx) {
max_indices
[bag_idx * max_indices_strides[0] +
feature_idx * max_indices_strides[1]] = max_idx;
}
};
template <EmbeddingBagMode M, typename T, typename I> template <EmbeddingBagMode M, typename T, typename I>
void embedding_bag_impl( void embedding_bag_impl(
constant T* weight, constant T* weight,
@ -154,7 +112,7 @@ void embedding_bag_impl(
auto num_bags = params.num_bags; auto num_bags = params.num_bags;
auto feature_size = params.feature_size; auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx; auto padding_idx = params.padding_idx;
auto per_sample_weights_stride = params.per_sample_weights_stride; auto per_sample_weights_strides = params.per_sample_weights_strides;
constant auto& output_strides = params.output_strides; constant auto& output_strides = params.output_strides;
constant auto& weight_strides = params.weight_strides; constant auto& weight_strides = params.weight_strides;
constant auto& max_indices_strides = params.max_indices_strides; constant auto& max_indices_strides = params.max_indices_strides;
@ -162,6 +120,8 @@ void embedding_bag_impl(
auto bag_idx = tid / feature_size; auto bag_idx = tid / feature_size;
auto feature_idx = tid % feature_size; auto feature_idx = tid % feature_size;
output += bag_idx * output_strides[0] + feature_idx * output_strides[1];
uint32_t offsets_end = min(bag_idx + 1, num_bags - 1); uint32_t offsets_end = min(bag_idx + 1, num_bags - 1);
bool is_last_bag = bag_idx + 1 == num_bags; bool is_last_bag = bag_idx + 1 == num_bags;
uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]); uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
@ -171,37 +131,28 @@ void embedding_bag_impl(
auto out_val = ReductionOpInit<M, T>()(); auto out_val = ReductionOpInit<M, T>()();
uint32_t bag_size_ = 0; uint32_t bag_size_ = 0;
I max_idx = 0;
for (uint32_t indices_idx = indices_start; indices_idx < indices_end; for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
indices_idx++) { indices_idx++) {
I weight_idx = indices[indices_idx]; I weight_idx = indices[indices_idx];
bool pad = (weight_idx == padding_idx); bool pad = (weight_idx == padding_idx);
auto weight_val = static_cast<opmath_t<T>>( T weight_val = weight
weight [static_cast<uint32_t>(weight_idx) * weight_strides[0] +
[static_cast<uint32_t>(weight_idx) * weight_strides[0] + feature_idx * weight_strides[1]];
feature_idx * weight_strides[1]]);
weight_val = MaybeApplyPerSampleWeight<M, T>()(
weight_val, indices_idx, per_sample_weights, per_sample_weights_stride);
auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0);
MaybeCalcMaxIndex<M, T, I>()(
weight_val, out_val, bag_size_ == 0, max_idx, weight_idx, pad);
out_val = pad ? out_val : new_out_val;
offset2bag[indices_idx] = bag_idx;
bag_size_ += static_cast<uint32_t>(!pad); bag_size_ += static_cast<uint32_t>(!pad);
auto tmp_val = ReductionOp<M, T>()(
weight_val,
out_val,
indices_idx,
per_sample_weights,
per_sample_weights_strides);
out_val = pad ? out_val : tmp_val;
} }
output[bag_idx * output_strides[0] + feature_idx * output_strides[1]] = *output = ReductionOpFinal<M, T>()(out_val, bag_size_);
ReductionOpFinal<M, T>()(out_val, bag_size_);
bag_size[bag_idx] = bag_size_;
MaybeWriteMaxIndex<M, I>()(
max_indices, max_indices_strides, bag_idx, feature_idx, max_idx);
} }
#define DISPATCH_IMPL(MODE) \ #define DISPATCH_IMPL(MODE) \

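The refactor above folds three concerns into the per-bag loop: the reduction itself (sum / mean / max), per-sample weights that only apply in SUM mode, and max-index bookkeeping that only applies in MAX mode. A standalone CPU sketch of that loop for a single (bag, feature) pair, assuming row-major weight storage and float data; the names here are illustrative, not the kernel's:

#include <cstdint>
#include <limits>
#include <vector>

enum class Mode { Sum, Mean, Max };

struct BagResult {
  float value = 0.0f;
  int64_t max_index = 0;   // only meaningful in Max mode
  uint32_t bag_size = 0;   // number of non-padding indices in the bag
};

BagResult reduce_bag(const std::vector<float>& weight,   // [num_embeddings, feature_size], row-major
                     int64_t feature_size,
                     const std::vector<int64_t>& indices, // indices belonging to this bag
                     const std::vector<float>& per_sample_weights, // empty unless Mode::Sum
                     int64_t feature_idx,
                     int64_t padding_idx,
                     Mode mode) {
  BagResult r;
  float acc = (mode == Mode::Max) ? -std::numeric_limits<float>::infinity() : 0.0f;
  for (size_t i = 0; i < indices.size(); ++i) {
    const int64_t weight_idx = indices[i];
    if (weight_idx == padding_idx) {
      continue;                                   // padding rows contribute nothing
    }
    float val = weight[weight_idx * feature_size + feature_idx];
    if (mode == Mode::Sum && !per_sample_weights.empty()) {
      val *= per_sample_weights[i];               // per-sample weights only exist in Sum mode
    }
    if (mode == Mode::Max) {
      if (r.bag_size == 0 || val > acc) {         // first non-padding element or new maximum
        acc = val;
        r.max_index = weight_idx;
      }
    } else {
      acc += val;
    }
    ++r.bag_size;
  }
  if (mode == Mode::Mean && r.bag_size > 0) {
    acc /= static_cast<float>(r.bag_size);
  }
  r.value = (mode == Mode::Max && r.bag_size == 0) ? 0.0f : acc;
  return r;
}
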
View File

@ -223,6 +223,9 @@ void grid_sampler_single_element(
auto input_size = input_sizes[input_dim]; auto input_size = input_sizes[input_dim];
auto coord = static_cast<opmath_t<T>>(coords[coord_dim]); auto coord = static_cast<opmath_t<T>>(coords[coord_dim]);
// Interpret nan as -1
coord = isnan(coord) ? -1 : coord;
if (!align_corners) { if (!align_corners) {
// Map unaligned grid space to aligned grid space // Map unaligned grid space to aligned grid space
auto corner_alignment_factor = static_cast<opmath_t<T>>(input_size) / auto corner_alignment_factor = static_cast<opmath_t<T>>(input_size) /

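The single-sided lines in this hunk fold NaN grid coordinates to -1 before the usual unnormalization, so they land outside the input and are handled like any other out-of-range sample. A small sketch of that step together with one common formulation of the align_corners mapping (the Metal kernel expresses the same mapping through a corner_alignment_factor):

#include <cmath>
#include <cstdint>

// Map a normalized grid coordinate in [-1, 1] to an (unclamped) input index.
// NaN coordinates are folded to -1 first, matching the hunk above.
float unnormalize_coord(float coord, int64_t input_size, bool align_corners) {
  if (std::isnan(coord)) {
    coord = -1.0f;
  }
  if (align_corners) {
    // -1 and +1 refer to the centers of the first and last input elements.
    return (coord + 1.0f) / 2.0f * static_cast<float>(input_size - 1);
  }
  // -1 and +1 refer to the outer edges of the first and last elements.
  return ((coord + 1.0f) * static_cast<float>(input_size) - 1.0f) / 2.0f;
}
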
View File

@ -52,7 +52,9 @@ static void fill_depthwise_conv_desc(MPSGraphDepthwiseConvolution3DOpDescriptor*
NSUInteger dilationRateInX, NSUInteger dilationRateInX,
NSUInteger dilationRateInY, NSUInteger dilationRateInY,
NSUInteger paddingHorizontal, NSUInteger paddingHorizontal,
NSUInteger paddingVertical) { NSUInteger paddingVertical,
c10::MemoryFormat memory_format,
NSUInteger groups) {
descriptor_.strides = descriptor_.strides =
@[ @1, [[NSNumber alloc] initWithInteger:strideInY], [[NSNumber alloc] initWithInteger:strideInX] ]; @[ @1, [[NSNumber alloc] initWithInteger:strideInY], [[NSNumber alloc] initWithInteger:strideInX] ];
descriptor_.dilationRates = descriptor_.dilationRates =
@ -101,7 +103,7 @@ static void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_,
descriptor_.groups = groups; descriptor_.groups = groups;
} }
static Tensor _mps_convolution_impl(const Tensor& input_t, static Tensor _mps_convolution_impl(const Tensor& input_t_,
const Tensor& weight_t, const Tensor& weight_t,
const std::optional<Tensor>& bias_opt, const std::optional<Tensor>& bias_opt,
IntArrayRef padding, IntArrayRef padding,
@ -109,15 +111,12 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
IntArrayRef dilation, IntArrayRef dilation,
int64_t groups, int64_t groups,
std::optional<IntArrayRef> input_shape) { std::optional<IntArrayRef> input_shape) {
constexpr auto kChannelsLast = MemoryFormat::ChannelsLast; const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
constexpr auto kContiguous = MemoryFormat::Contiguous; Tensor input_t = input_t_;
const bool is_macos_15_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS); bool is3DConv = input_t.dim() == 5;
if (!is_macOS_15_0_or_newer || is3DConv) {
const bool is3DConv = input_t.dim() == 5; input_t = input_t.contiguous();
const auto memory_format = input_t.suggest_memory_format(); }
const auto input_suggested_layout = memory_format == kChannelsLast && is_macos_15_plus ? kChannelsLast : kContiguous;
const bool is_channels_last = mps_conv_use_channels_last(input_t, weight_t) && !is3DConv;
const bool bias_defined = bias_opt ? bias_opt->defined() : false;
TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types"); TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types");
@ -127,6 +126,15 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
checkAllSameType(c, {input, weight}); checkAllSameType(c, {input, weight});
checkAllSameGPU(c, {input, weight}); checkAllSameGPU(c, {input, weight});
bool bias_defined;
if (bias_opt == std::nullopt)
bias_defined = false;
else
bias_defined = bias_opt->defined();
auto memory_format = input_t.suggest_memory_format();
bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast) && !is3DConv;
auto output_t = auto output_t =
at::empty(input_shape.has_value() ? input_shape.value() at::empty(input_shape.has_value() ? input_shape.value()
: conv_output_size(input->sizes(), weight->sizes(), padding, stride, dilation), : conv_output_size(input->sizes(), weight->sizes(), padding, stride, dilation),
@ -134,18 +142,12 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
std::nullopt, std::nullopt,
kMPS, kMPS,
std::nullopt, std::nullopt,
is_channels_last ? kChannelsLast : kContiguous); is_macOS_15_0_or_newer ? memory_format : MemoryFormat::Contiguous);
if (output_t.numel() == 0) { if (output_t.numel() == 0) {
return output_t; return output_t;
} }
TensorArg output{output_t, "result", 0}; TensorArg output{output_t, "result", 0};
// TODO: Remove me when MacOS-14 is no longer supported
std::optional<Tensor> output_c;
if (!is_macos_15_plus && is_channels_last) {
output_c = at::empty_like(output_t, output_t.options().memory_format(kContiguous));
}
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_1_PLUS)) { if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_1_PLUS)) {
// On macOS < 15.1, MPS convolution kernel does not support output channels > 2^16 // On macOS < 15.1, MPS convolution kernel does not support output channels > 2^16
for (auto elem : output_t.sizes()) { for (auto elem : output_t.sizes()) {
@ -184,22 +186,32 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
getArrayRefString(dilation), getArrayRefString(dilation),
getArrayRefString(padding), getArrayRefString(padding),
groups, groups,
input_suggested_layout == kChannelsLast, is_channels_last,
mps::getTensorsStringKey({input_t, weight_t}), mps::getTensorsStringKey({input_t, weight_t}),
bias_defined, bias_defined,
bias_shape_key); bias_shape_key);
auto inputShape = mps::getMPSShape(input_t, input_suggested_layout); MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
auto outputShape = mps::getMPSShape(output_t, input_suggested_layout); MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { MPSNDArray* inputNDArray = nil;
bool isDepthwiseConv = MPSNDArray* outputNDArray = nil;
(groups > 1 && weight_t.size(1) == 1) && input_t.dim() >= 4 && weight_t.dim() >= 4 && !is_channels_last;
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(input_t), inputShape); if (input_t.is_contiguous(memory_format) && output_t.is_contiguous(memory_format) && is_macOS_15_0_or_newer) {
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t); inputNDArray = getMPSNDArray(input_t, inputShape);
MPSGraphTensor* outputTensor = nil; outputNDArray = getMPSNDArray(*output, outputShape);
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* weightShape = mps::getMPSShape(weight_t);
bool isDepthwiseConv = ((groups > 1 && (weightShape[1].intValue == 1)) && inputShape.count >= 4 &&
weightShape.count >= 4 && !is_channels_last);
MPSGraphTensor* inputTensor =
mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(input_t.scalar_type()), inputShape);
MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
MPSGraphTensor* outputTensor;
if (is3DConv) { if (is3DConv) {
auto conv3dDescriptor_ = [[MPSGraphConvolution3DOpDescriptor new] autorelease]; MPSGraphConvolution3DOpDescriptor* conv3dDescriptor_ = [[MPSGraphConvolution3DOpDescriptor new] autorelease];
fill_conv3d_desc(conv3dDescriptor_, fill_conv3d_desc(conv3dDescriptor_,
stride[2], stride[2],
stride[1], stride[1],
@ -217,9 +229,17 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
descriptor:conv3dDescriptor_ descriptor:conv3dDescriptor_
name:nil]; name:nil];
} else if (isDepthwiseConv) { } else if (isDepthwiseConv) {
auto depthWiseConv3dDescriptor_ = [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease]; MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ =
fill_depthwise_conv_desc( [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]); fill_depthwise_conv_desc(depthWiseConv3dDescriptor_,
stride[1],
stride[0],
dilation[1],
dilation[0],
padding[1],
padding[0],
memory_format,
groups);
MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor
dimension:-3 dimension:-3
@ -238,7 +258,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
dilation[0], dilation[0],
padding[1], padding[1],
padding[0], padding[0],
input_suggested_layout, memory_format,
groups); groups);
outputTensor = [mpsGraph convolution2DWithSourceTensor:inputTensor outputTensor = [mpsGraph convolution2DWithSourceTensor:inputTensor
@ -250,6 +270,13 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
MPSGraphTensor* biasTensor = nil; MPSGraphTensor* biasTensor = nil;
if (bias_defined) { if (bias_defined) {
biasTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(bias_opt.value())); biasTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(bias_opt.value()));
}
if (is_channels_last && !is_macOS_15_0_or_newer) {
outputTensor = mps::convertNHWCtoNCHW(mpsGraph, outputTensor);
}
if (bias_defined) {
outputTensor = [mpsGraph additionWithPrimaryTensor:outputTensor secondaryTensor:biasTensor name:nil]; outputTensor = [mpsGraph additionWithPrimaryTensor:outputTensor secondaryTensor:biasTensor name:nil];
} }
newCachedGraph->inputTensor_ = inputTensor; newCachedGraph->inputTensor_ = inputTensor;
@ -258,26 +285,27 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
newCachedGraph->outputTensor_ = outputTensor; newCachedGraph->outputTensor_ = outputTensor;
}); });
auto inputPlaceholder = input_suggested_layout == kContiguous auto inputPlaceholder = inputNDArray ? Placeholder(cachedGraph->inputTensor_, inputNDArray)
? Placeholder(cachedGraph->inputTensor_, output_c || is3DConv ? input_t.contiguous() : input_t) : Placeholder(cachedGraph->inputTensor_, input_t, inputShape);
: Placeholder(cachedGraph->inputTensor_, getMPSNDArray(input_t, inputShape)); auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
auto outputPlaceholder = input_suggested_layout == kContiguous
? Placeholder(cachedGraph->outputTensor_, output_c ? *output_c : output_t)
: Placeholder(cachedGraph->outputTensor_, getMPSNDArray(output_t, outputShape));
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, output_c ? weight_t.contiguous() : weight_t);
auto biasPlaceholder = Placeholder(); auto biasPlaceholder = Placeholder();
// Reshape the bias to be broadcastable with output of conv2d or conv3d // Reshape the bias to be broadcastable with output of conv2d or conv3d
if (bias_defined) { if (bias_defined) {
if (is3DConv) { if (is3DConv) {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, bias_shape[0], 1, 1, 1})); biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, bias_shape[0], 1, 1, 1}));
} else if (input_suggested_layout == kChannelsLast) {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, 1, 1, bias_shape[0]}));
} else { } else {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias_opt->view({1, bias_shape[0], 1, 1})); if (is_channels_last && is_macOS_15_0_or_newer) {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, 1, 1, bias_shape[0]}));
} else {
biasPlaceholder = Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, bias_shape[0], 1, 1}));
}
} }
} }
auto outputPlaceholder = outputNDArray ? Placeholder(cachedGraph->outputTensor_, outputNDArray)
: Placeholder(cachedGraph->outputTensor_, *output);
auto feeds = [[[NSMutableDictionary alloc] initWithCapacity:3] autorelease]; NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =
[[[NSMutableDictionary alloc] initWithCapacity:3] autorelease];
feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData();
feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData(); feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData();
if (bias_defined) { if (bias_defined) {
@ -287,11 +315,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t,
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
} }
if (output_c) { return *output;
output_t.copy_(*output_c);
}
return output_t;
} }
Tensor _mps_convolution(const Tensor& input_t, Tensor _mps_convolution(const Tensor& input_t,
@ -327,21 +351,14 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
TensorArg grad_output{grad_output_t, "grad_output", 1}, weight{weight_t, "weight", 2}; TensorArg grad_output{grad_output_t, "grad_output", 1}, weight{weight_t, "weight", 2};
checkAllSameType(c, {grad_output, weight}); checkAllSameType(c, {grad_output, weight});
checkAllSameGPU(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight});
constexpr auto kChannelsLast = at::MemoryFormat::ChannelsLast; auto memory_format = grad_output_t.suggest_memory_format();
bool is_channels_last = mps_conv_use_channels_last(grad_output_t, weight_t) && !is3DConv; bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast) && !is3DConv;
auto grad_input_t = auto grad_input_t = at::empty(input_size, grad_output_t.options(), std::nullopt);
at::empty(input_size, grad_output_t.options(), is_channels_last ? std::optional(kChannelsLast) : std::nullopt);
// Avoid "grad_input" when this is being used as transposed convolution // Avoid "grad_input" when this is being used as transposed convolution
TensorArg grad_input{grad_input_t, "result", 0}; TensorArg grad_input{grad_input_t, "result", 0};
convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups);
// TODO: Remove me when MacOS-14 is no longer supported
std::optional<Tensor> grad_input_c;
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_channels_last) {
grad_input_c = at::empty_like(grad_input_t, grad_input_t.options().memory_format(MemoryFormat::Contiguous));
}
// Derive from MPSCachedGraph // Derive from MPSCachedGraph
struct CachedGraph : public MPSCachedGraph { struct CachedGraph : public MPSCachedGraph {
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
@ -353,6 +370,7 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
// Add backward with input // Add backward with input
@autoreleasepool { @autoreleasepool {
MPSStream* stream = getCurrentMPSStream(); MPSStream* stream = getCurrentMPSStream();
MPSShape* mps_input_shape = getMPSShape(input_size); MPSShape* mps_input_shape = getMPSShape(input_size);
std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}", std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "", is3DConv ? "3d_" : "",
@ -393,8 +411,15 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
} else if (isDepthwiseConv) { } else if (isDepthwiseConv) {
MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ = MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ =
[[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease]; [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
fill_depthwise_conv_desc( fill_depthwise_conv_desc(depthWiseConv3dDescriptor_,
depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]); stride[1],
stride[0],
dilation[1],
dilation[0],
padding[1],
padding[0],
at::MemoryFormat::Contiguous,
groups);
MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor
dimension:-3 dimension:-3
withDimension:-4 withDimension:-4
@ -429,18 +454,14 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
newCachedGraph->gradInputTensor_ = gradInputTensor; newCachedGraph->gradInputTensor_ = gradInputTensor;
}); });
auto gradOutputPlaceholder = auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t);
Placeholder(cachedGraph->gradOutputTensor_, grad_input_c ? grad_output_t.contiguous() : grad_output_t); auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, grad_input_c ? weight_t.contiguous() : weight_t); auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input);
auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input_c ? *grad_input_c : grad_input_t);
auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, weightsPlaceholder); auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, weightsPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
} }
if (grad_input_c) { return *grad_input;
grad_input_t.copy_(*grad_input_c);
}
return grad_input_t;
} }
static Tensor mps_convolution_backward_weights(IntArrayRef weight_size, static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
@ -453,11 +474,9 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
bool bias_defined) { bool bias_defined) {
using namespace at::native::mps; using namespace at::native::mps;
using namespace mps; using namespace mps;
const bool is3DConv = input_t.dim() == 5; bool is3DConv = input_t.dim() == 5;
TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types"); TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types");
CheckedFrom c = "mps_convolution_backward_weights"; CheckedFrom c = "mps_convolution_backward_weights";
constexpr auto kChannelsLast = at::MemoryFormat::ChannelsLast;
bool is_channels_last = mps_conv_use_channels_last(input_t, grad_output_t) && !is3DConv;
// For uniformity with everything else, although it seems grad_weight // For uniformity with everything else, although it seems grad_weight
// would be unambiguous too. // would be unambiguous too.
@ -468,8 +487,7 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
checkAllSameGPU(c, {grad_output, input}); checkAllSameGPU(c, {grad_output, input});
auto grad_weight_t = auto grad_weight_t =
at::empty(weight_size, grad_output_t.options(), is_channels_last ? std::optional(kChannelsLast) : std::nullopt); at::empty(weight_size, grad_output_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
TensorArg grad_weight{grad_weight_t, "result", 0}; TensorArg grad_weight{grad_weight_t, "result", 0};
convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups); convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
@ -482,23 +500,16 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
MPSGraphTensor* gradWeightTensor_ = nil; MPSGraphTensor* gradWeightTensor_ = nil;
}; };
// TODO: Remove me when MacOS-14 is no longer supported
std::optional<Tensor> grad_weight_c;
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_channels_last) {
grad_weight_c = at::empty_like(grad_weight_t, grad_weight_t.options().memory_format(MemoryFormat::Contiguous));
}
@autoreleasepool { @autoreleasepool {
MPSStream* stream = getCurrentMPSStream(); MPSStream* stream = getCurrentMPSStream();
MPSShape* mps_weight_shape = getMPSShape(weight_size); MPSShape* mps_weight_shape = getMPSShape(weight_size);
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}:{}", std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "", is3DConv ? "3d_" : "",
getArrayRefString(stride), getArrayRefString(stride),
getArrayRefString(dilation), getArrayRefString(dilation),
getArrayRefString(padding), getArrayRefString(padding),
groups, groups,
is_channels_last,
getTensorsStringKey({grad_output_t, input_t, grad_weight_t})); getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) { auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* inputShape = getMPSShape(input_t); MPSShape* inputShape = getMPSShape(input_t);
@ -530,8 +541,15 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
} else if (isDepthwiseConv) { } else if (isDepthwiseConv) {
MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ = MPSGraphDepthwiseConvolution3DOpDescriptor* depthWiseConv3dDescriptor_ =
[[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease]; [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
fill_depthwise_conv_desc( fill_depthwise_conv_desc(depthWiseConv3dDescriptor_,
depthWiseConv3dDescriptor_, stride[1], stride[0], dilation[1], dilation[0], padding[1], padding[0]); stride[1],
stride[0],
dilation[1],
dilation[0],
padding[1],
padding[0],
at::MemoryFormat::Contiguous,
groups);
NSNumber* outputFeatChannelDim = mps_weight_shape[0]; NSNumber* outputFeatChannelDim = mps_weight_shape[0];
MPSShape* weightShapeTranspose = @[ @1, outputFeatChannelDim, mps_weight_shape[2], mps_weight_shape[3] ]; MPSShape* weightShapeTranspose = @[ @1, outputFeatChannelDim, mps_weight_shape[2], mps_weight_shape[3] ];
MPSGraphTensor* gradWeightTensorTranspose = MPSGraphTensor* gradWeightTensorTranspose =
@ -565,19 +583,14 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
newCachedGraph->gradWeightTensor_ = gradWeightTensor; newCachedGraph->gradWeightTensor_ = gradWeightTensor;
}); });
auto gradOutputPlaceholder = auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t);
Placeholder(cachedGraph->gradOutputTensor_, grad_weight_c ? grad_output_t.contiguous() : grad_output_t); auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t);
auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, grad_weight_c ? input_t.contiguous() : input_t); auto outputPlaceholder = Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t);
auto outputPlaceholder =
Placeholder(cachedGraph->gradWeightTensor_, grad_weight_c ? *grad_weight_c : grad_weight_t);
auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder); auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
} }
if (grad_weight_c) {
grad_weight_t.copy_(*grad_weight_c);
}
return grad_weight_t; return grad_weight_t;
} }
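
Most of the churn in this file is about when the convolution can consume a channels-last (NHWC) tensor directly and when it must fall back to a contiguous (NCHW) scratch buffer that is copied into the destination afterwards (the output_c / grad_input_c / grad_weight_c temporaries on one side of the diff). To make the two layouts concrete, a standalone sketch of such a copy on plain buffers, with hypothetical dimension names:

#include <cstdint>
#include <vector>

// Copy an NCHW-contiguous buffer into an NHWC-contiguous buffer of the same
// logical shape. This is the "compute contiguous, then copy_" fallback in
// miniature: the math runs on src, the caller hands out dst.
void nchw_to_nhwc(const std::vector<float>& src, std::vector<float>& dst,
                  int64_t N, int64_t C, int64_t H, int64_t W) {
  dst.resize(static_cast<size_t>(N * C * H * W));
  for (int64_t n = 0; n < N; ++n)
    for (int64_t c = 0; c < C; ++c)
      for (int64_t h = 0; h < H; ++h)
        for (int64_t w = 0; w < W; ++w)
          dst[((n * H + h) * W + w) * C + c] = src[((n * C + c) * H + h) * W + w];
}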

View File

@ -66,12 +66,11 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
int64_t num_indices = indices.size(0); int64_t num_indices = indices.size(0);
int64_t num_bags = offsets.size(0); int64_t num_bags = offsets.size(0);
if (include_last_offset) { if (include_last_offset) {
TORCH_CHECK(num_bags >= 1, "include_last_offset: number of offsets should be at least 1");
num_bags -= 1; num_bags -= 1;
} }
int64_t feature_size = weight.size(1); int64_t feature_size = weight.size(1);
auto bag_size = at::empty({num_bags}, indices.options()); auto bag_size = at::empty(offsets.sizes(), indices.options());
auto offset2bag = at::empty({indices.size(0)}, indices.options()); auto offset2bag = at::empty({indices.size(0)}, indices.options());
auto output = at::empty({num_bags, feature_size}, weight.options()); auto output = at::empty({num_bags, feature_size}, weight.options());
@ -95,7 +94,7 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
} }
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined(); bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0; params.per_sample_weights_strides = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.num_indices = num_indices; params.num_indices = num_indices;
params.num_bags = num_bags; params.num_bags = num_bags;
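
For context on num_bags and bag_size above: the offsets tensor stores where each bag's indices start, and with include_last_offset the final entry is the total index count rather than the start of another bag, which is why num_bags is decremented (and why one side adds a check that at least one offset exists). A small sketch of turning offsets into per-bag [start, end) ranges under that convention:

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

std::vector<std::pair<int64_t, int64_t>> bag_ranges(const std::vector<int64_t>& offsets,
                                                    int64_t num_indices,
                                                    bool include_last_offset) {
  int64_t num_bags = static_cast<int64_t>(offsets.size());
  if (include_last_offset) {
    assert(num_bags >= 1 && "include_last_offset: number of offsets should be at least 1");
    num_bags -= 1;  // the last entry closes the final bag instead of opening a new one
  }
  std::vector<std::pair<int64_t, int64_t>> ranges;
  ranges.reserve(static_cast<size_t>(num_bags));
  for (int64_t b = 0; b < num_bags; ++b) {
    const int64_t start = offsets[b];
    const int64_t end =
        (b + 1 < static_cast<int64_t>(offsets.size())) ? offsets[b + 1] : num_indices;
    ranges.emplace_back(start, end);
  }
  return ranges;
}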

View File

@ -20,7 +20,6 @@
#include <ATen/ops/baddbmm_native.h> #include <ATen/ops/baddbmm_native.h>
#include <ATen/ops/bmm_native.h> #include <ATen/ops/bmm_native.h>
#include <ATen/ops/cholesky_native.h> #include <ATen/ops/cholesky_native.h>
#include <ATen/ops/eye_native.h>
#include <ATen/ops/linalg_cholesky_ex_native.h> #include <ATen/ops/linalg_cholesky_ex_native.h>
#include <ATen/ops/linalg_inv_ex_native.h> #include <ATen/ops/linalg_inv_ex_native.h>
#include <ATen/ops/linalg_lu_factor_ex_native.h> #include <ATen/ops/linalg_lu_factor_ex_native.h>
@ -497,24 +496,26 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
using namespace mps; using namespace mps;
TORCH_CHECK(result.is_mps(), "Output tensor is not MPS"); TORCH_CHECK(result.is_mps(), "Output tensor is not MPS");
TORCH_CHECK(!A.is_complex(), "linalg_inv: not supported for complex types yet!"); TORCH_CHECK(!A.is_complex(), "linalg_inv: not supported for complex types yet!");
using CachedGraph = MPSUnaryCachedGraph;
MPSStream* stream = getCurrentMPSStream();
info.zero_(); info.zero_();
if (A.numel() == 0) { if (A.numel() == 0) {
return; return;
} }
if (!result.is_contiguous()) {
result.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
}
auto A_sizes = A.sizes(); auto A_sizes = A.sizes();
int ndim = A.dim(); int ndim = A.dim();
Tensor LU = empty_like(A, MemoryFormat::Contiguous); Tensor LU = empty_like(A);
Tensor identity = eye(A.size(-2), A.size(-1), A.scalar_type(), A.options().layout(), A.device()).expand_as(A); Tensor identity = zeros_like(A);
Tensor pivots = empty({A_sizes.begin(), A_sizes.end() - 1}, A.options().dtype(kInt)); Tensor pivots = empty({A_sizes.begin(), A_sizes.end() - 1}, A.options().dtype(kInt));
// need to do this to keep the strides of the result tensor (ndim == 2 ? identity.diagonal() : identity.diagonal(0, -2, -1)).fill_(1);
// mps's solve expects row major layout, while inductor linalg_solve_out_mps_impl(A, identity, true, check_errors, result, LU, pivots, info);
// expects result to be column major
Tensor tmp = empty_like(A, MemoryFormat::Contiguous);
linalg_solve_out_mps_impl(A, identity, true, check_errors, tmp, LU, pivots, info);
result.copy_(tmp);
} }
static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) { static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) {

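Both versions of linalg_inv_ex_out_mps_impl reduce inversion to a solve against the identity: if A * X = I then X = A^-1. They differ only in whether the identity is built with eye or by filling the diagonal of a zeros tensor, and in how the result's layout is handled. A self-contained Gauss-Jordan sketch of the identity-RHS trick on a dense row-major matrix, purely illustrative, not the MPS kernel:

#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <utility>
#include <vector>

// Invert an n x n row-major matrix by solving A * X = I with Gauss-Jordan
// elimination and partial pivoting. Returns X (also row-major).
std::vector<double> inverse_via_solve(std::vector<double> a, int64_t n) {
  std::vector<double> x(n * n, 0.0);
  for (int64_t i = 0; i < n; ++i) x[i * n + i] = 1.0;  // right-hand side starts as I
  for (int64_t col = 0; col < n; ++col) {
    // Pick the largest pivot in this column for numerical stability.
    int64_t piv = col;
    for (int64_t r = col + 1; r < n; ++r)
      if (std::fabs(a[r * n + col]) > std::fabs(a[piv * n + col])) piv = r;
    if (a[piv * n + col] == 0.0) throw std::runtime_error("singular matrix");
    for (int64_t c = 0; c < n; ++c) {
      std::swap(a[col * n + c], a[piv * n + c]);
      std::swap(x[col * n + c], x[piv * n + c]);
    }
    const double d = a[col * n + col];
    for (int64_t c = 0; c < n; ++c) { a[col * n + c] /= d; x[col * n + c] /= d; }
    for (int64_t r = 0; r < n; ++r) {
      if (r == col) continue;
      const double f = a[r * n + col];
      for (int64_t c = 0; c < n; ++c) {
        a[r * n + c] -= f * a[col * n + c];
        x[r * n + c] -= f * x[col * n + c];
      }
    }
  }
  return x;
}
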
View File

@ -519,13 +519,6 @@ static void max_unpool_out_mps_template(const Tensor& input,
Tensor& output, Tensor& output,
const int32_t pooling_dims, const int32_t pooling_dims,
const std::string& op_name) { const std::string& op_name) {
TORCH_CHECK(output_size_.size() == static_cast<size_t>(pooling_dims),
op_name,
"There should be exactly ",
pooling_dims,
" elements but got ",
output_size_.size());
auto dims = input.dim(); auto dims = input.dim();
auto leading_dims = input.dim() - pooling_dims; auto leading_dims = input.dim() - pooling_dims;

View File

@ -9,22 +9,11 @@
#else #else
#include <ATen/ops/_unique2.h> #include <ATen/ops/_unique2.h>
#include <ATen/ops/_unique2_native.h> #include <ATen/ops/_unique2_native.h>
#include <ATen/ops/arange.h>
#include <ATen/ops/argsort.h>
#include <ATen/ops/cat.h>
#include <ATen/ops/cumsum.h>
#include <ATen/ops/full.h>
#include <ATen/ops/masked_select.h>
#include <ATen/ops/nonzero.h>
#include <ATen/ops/ones.h>
#include <ATen/ops/ones_like.h>
#include <ATen/ops/slice.h> #include <ATen/ops/slice.h>
#include <ATen/ops/unique_consecutive.h> #include <ATen/ops/unique_consecutive.h>
#include <ATen/ops/unique_consecutive_native.h> #include <ATen/ops/unique_consecutive_native.h>
#include <ATen/ops/unique_dim_consecutive.h> #include <ATen/ops/unique_dim_consecutive.h>
#include <ATen/ops/unique_dim_consecutive_native.h> #include <ATen/ops/unique_dim_consecutive_native.h>
#include <ATen/ops/unique_dim_native.h>
#include <ATen/ops/zeros.h>
#endif #endif
namespace at::native { namespace at::native {
@ -316,85 +305,4 @@ std::tuple<Tensor, Tensor, Tensor> _unique2_mps(const Tensor& self,
return _unique_impl_mps(self, return_inverse, return_counts, false, std::nullopt); return _unique_impl_mps(self, return_inverse, return_counts, false, std::nullopt);
} }
static Tensor lexsort_rows_perm_mps(const Tensor& mat_2d) {
const auto rows = mat_2d.size(0), cols = mat_2d.size(1);
if (rows <= 1 || cols == 0) {
return arange(rows, mat_2d.options().dtype(kLong));
}
auto perm = arange(rows, mat_2d.options().dtype(kLong));
for (auto c = cols - 1; c >= 0; --c) {
auto keys = mat_2d.select(1, c).index_select(0, perm);
const auto idx = argsort(keys, /*dim=*/0, /*descending=*/false);
perm = perm.index_select(0, idx);
}
return perm;
}
static std::tuple<Tensor, Tensor, Tensor> unique_dim_sorted_mps_impl(const Tensor& self,
int64_t dim,
bool return_inverse,
bool return_counts) {
dim = maybe_wrap_dim(dim, self.dim());
auto sizes = self.sizes().vec();
auto num_zero_dims = std::count(sizes.begin(), sizes.end(), (int64_t)0);
if (self.size(dim) == 0) {
auto output = at::empty(sizes, self.options());
auto inverse_indices = at::empty({0}, self.options().dtype(kLong));
auto counts = at::empty({0}, self.options().dtype(kLong));
return {output, inverse_indices, counts};
}
auto transposed = self.moveaxis(dim, 0);
auto orig_sizes = transposed.sizes().vec();
auto rows = transposed.size(0);
auto input_flat = transposed.contiguous().view({rows, -1});
auto perm = lexsort_rows_perm_mps(input_flat);
auto input_sorted = input_flat.index_select(0, perm);
Tensor is_unique = at::zeros({rows}, self.options().dtype(kBool));
if (rows > 0) {
is_unique.narrow(0, 0, 1).fill_(true);
}
if (rows > 1) {
auto a = input_sorted.narrow(0, 1, rows - 1);
auto b = input_sorted.narrow(0, 0, rows - 1);
auto row_changed = a.ne(b).any(1);
is_unique.narrow(0, 1, rows - 1).copy_(row_changed);
}
auto unique_pos = nonzero(is_unique).squeeze(1);
auto group_id = cumsum(is_unique.to(kLong), 0).sub(1);
auto unique_rows_2d = input_sorted.index_select(0, unique_pos);
Tensor inverse_indices = empty({0}, self.options().dtype(kLong));
if (return_inverse) {
inverse_indices = empty({rows}, self.options().dtype(kLong));
inverse_indices.index_copy_(0, perm, group_id);
}
Tensor counts = empty({0}, self.options().dtype(kLong));
if (return_counts) {
const auto num_unique = unique_pos.size(0);
counts = zeros({num_unique}, self.options().dtype(kLong));
counts.scatter_add_(0, group_id, ones_like(group_id, group_id.options().dtype(kLong)));
}
orig_sizes[0] = unique_rows_2d.size(0);
auto output = unique_rows_2d.view(orig_sizes).moveaxis(0, dim);
return std::make_tuple(std::move(output), std::move(inverse_indices), std::move(counts));
}
std::tuple<Tensor, Tensor, Tensor> unique_dim_mps(const Tensor& self,
int64_t dim,
const bool /*sorted*/,
const bool return_inverse,
const bool return_counts) {
return unique_dim_sorted_mps_impl(self, dim, return_inverse, return_counts);
}
} // namespace at::native } // namespace at::native
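
The unique_dim implementation shown on one side of this file follows a classic recipe: move the target dimension to the front, flatten each slice into a row, lexicographically sort the rows, mark rows that differ from their predecessor, and derive inverse indices and counts from that mask. A standalone sketch of the same recipe on a plain matrix of rows:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

struct UniqueRows {
  std::vector<std::vector<double>> rows;  // unique rows, in sorted order
  std::vector<int64_t> inverse;           // original row -> index into rows
  std::vector<int64_t> counts;            // occurrences of each unique row
};

UniqueRows unique_rows(const std::vector<std::vector<double>>& mat) {
  const int64_t n = static_cast<int64_t>(mat.size());
  std::vector<int64_t> perm(n);
  std::iota(perm.begin(), perm.end(), 0);
  // Lexicographic sort of row indices (std::vector compares lexicographically).
  std::sort(perm.begin(), perm.end(),
            [&](int64_t a, int64_t b) { return mat[a] < mat[b]; });
  UniqueRows out;
  out.inverse.assign(n, 0);
  for (int64_t i = 0; i < n; ++i) {
    const auto& row = mat[perm[i]];
    // A row starts a new group iff it differs from the previous sorted row.
    if (i == 0 || row != mat[perm[i - 1]]) {
      out.rows.push_back(row);
      out.counts.push_back(0);
    }
    const int64_t group = static_cast<int64_t>(out.rows.size()) - 1;
    out.inverse[perm[i]] = group;
    out.counts[group] += 1;
  }
  return out;
}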

View File

@ -1409,7 +1409,7 @@
- func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a) - func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
variants: function variants: function
dispatch: dispatch:
SparseCPU, SparseCUDA, SparseMPS: sparse_broadcast_to SparseCPU, SparseCUDA: sparse_broadcast_to
- func: cat(Tensor[] tensors, int dim=0) -> Tensor - func: cat(Tensor[] tensors, int dim=0) -> Tensor
structured_delegate: cat.out structured_delegate: cat.out
@ -3858,7 +3858,7 @@
device_check: NoCheck # TensorIterator device_check: NoCheck # TensorIterator
structured: True structured: True
dispatch: dispatch:
CPU, CUDA, MTIA: aminmax_out CPU, CUDA: aminmax_out
MPS: aminmax_out_mps MPS: aminmax_out_mps
- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
@ -3909,7 +3909,7 @@
- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True structured: True
dispatch: dispatch:
CPU, CUDA, MTIA: amax_out CPU, CUDA: amax_out
MPS: amax_out_mps MPS: amax_out_mps
# Return: (Tensor output, Tensor indices) # Return: (Tensor output, Tensor indices)
@ -4090,7 +4090,7 @@
- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
structured: True structured: True
dispatch: dispatch:
CPU, CUDA, MTIA: amin_out CPU, CUDA: amin_out
MPS: amin_out_mps MPS: amin_out_mps
# TODO: Add this function to MPS dispatch key so that we avoid declaring it in # TODO: Add this function to MPS dispatch key so that we avoid declaring it in
@ -6450,7 +6450,6 @@
dispatch: dispatch:
CPU: unique_dim_cpu CPU: unique_dim_cpu
CUDA: unique_dim_cuda CUDA: unique_dim_cuda
MPS: unique_dim_mps
tags: dynamic_output_shape tags: dynamic_output_shape
autogen: unique_dim.out autogen: unique_dim.out

View File

@ -158,46 +158,12 @@ c10::intrusive_ptr<EmbeddingPackedParamsBase> PackedEmbeddingBagWeight::prepack(
return packed_ptr; return packed_ptr;
} }
#ifdef USE_FBGEMM
namespace {
/// Number of columns in the rowwise min/max buffer passed to the quantization function(s)
constexpr int kRowwiseMinMaxNumCols = 2;
bool _validate_rowwise_min_max(
const at::Tensor& weight,
const std::optional<at::Tensor>& rowwise_min_max_opt) {
const auto is_valid_rowwise_min_max = rowwise_min_max_opt.has_value();
if (is_valid_rowwise_min_max) {
TORCH_CHECK(
(rowwise_min_max_opt->dim() == 2 &&
rowwise_min_max_opt->size(0) == weight.size(0) &&
rowwise_min_max_opt->size(1) == kRowwiseMinMaxNumCols),
"'rowwise_min_max' must be a 2D tensor with shape [num_rows(weight), 2].");
}
return is_valid_rowwise_min_max;
}
auto _get_rowwise_min_max_contig(
const std::optional<at::Tensor>& rowwise_min_max_opt) {
return rowwise_min_max_opt.has_value()
? rowwise_min_max_opt->expect_contiguous(rowwise_min_max_opt->suggest_memory_format())
: at::borrow_from_optional_tensor(rowwise_min_max_opt);
}
}
#endif // USE_FBGEMM
namespace at::native { namespace at::native {
// Note - This is a temporary pack function for embedding bag which quantizes // Note - This is a temporary pack function for embedding bag which quantizes
// and packs the float weight tensor. In the next step it will be replaced by a // and packs the float weight tensor. In the next step it will be replaced by a
// quantize and pack function once we support FP scale and FP zero_point // quantize and pack function once we support FP scale and FP zero_point
// //
// The optional rowwise_min_max argument is to support callers to pass in the min/max
// values of the weight tensor. If the rowwise_min_max is not provided, the min/max
// values will be computed from the weight tensor.
//
// Python example examining a packed 8bit zero_point and scale: // Python example examining a packed 8bit zero_point and scale:
// //
// >> x = torch.from_numpy(np.array([[[10, 20], [30, 40]],[[50, 60], [70, 80]]], // >> x = torch.from_numpy(np.array([[[10, 20], [30, 40]],[[50, 60], [70, 80]]],
@ -255,10 +221,7 @@ namespace at::native {
// //
// [[50. , 60.00000035], // [[50. , 60.00000035],
// [70. , 80.00000035]]]) // [70. , 80.00000035]]])
Tensor& qembeddingbag_byte_prepack_out( Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
Tensor& output,
const Tensor& weight,
const std::optional<Tensor>& rowwise_min_max_opt) {
// The "last" dimension of an N-Dimensioned batch of embedding bags is // The "last" dimension of an N-Dimensioned batch of embedding bags is
// quantization channel. E.g. for a 2D embedding bag, this has // quantization channel. E.g. for a 2D embedding bag, this has
// [ row, col ] dimensions, for batched of embedding bags, dimensions might be // [ row, col ] dimensions, for batched of embedding bags, dimensions might be
@ -293,16 +256,9 @@ Tensor& qembeddingbag_byte_prepack_out(
auto* output_data = output.data_ptr<uint8_t>(); auto* output_data = output.data_ptr<uint8_t>();
#ifdef USE_FBGEMM #ifdef USE_FBGEMM
// Move these outside of the ifdef when we support non-FBGEMM flow.
const auto is_valid_rowwise_min_max = _validate_rowwise_min_max(weight, rowwise_min_max_opt);
const auto rowwise_min_max_contig = _get_rowwise_min_max_contig(rowwise_min_max_opt);
if (weight_contig->scalar_type() == at::ScalarType::Half) { if (weight_contig->scalar_type() == at::ScalarType::Half) {
const auto weight_data = const auto weight_data =
static_cast<fbgemm::float16*>(weight_contig->data_ptr()); static_cast<fbgemm::float16*>(weight_contig->data_ptr());
const auto rowwise_min_max_data = is_valid_rowwise_min_max
? static_cast<fbgemm::float16*>(rowwise_min_max_contig->data_ptr())
: nullptr;
at::parallel_for( at::parallel_for(
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) { 0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat< fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<
@ -310,21 +266,17 @@ Tensor& qembeddingbag_byte_prepack_out(
weight_data + start_idx * embedding_cols, weight_data + start_idx * embedding_cols,
end_idx - start_idx, end_idx - start_idx,
embedding_cols, embedding_cols,
output_data + start_idx * output_columns, output_data + start_idx * output_columns);
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
}); });
} else { } else {
const auto weight_data = weight_contig->data_ptr<float>(); const auto weight_data = weight_contig->data_ptr<float>();
const auto rowwise_min_max_data =
is_valid_rowwise_min_max ? rowwise_min_max_contig->data_ptr<float>() : nullptr;
at::parallel_for( at::parallel_for(
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) { 0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<float>( fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<float>(
weight_data + start_idx * embedding_cols, weight_data + start_idx * embedding_cols,
end_idx - start_idx, end_idx - start_idx,
embedding_cols, embedding_cols,
output_data + start_idx * output_columns, output_data + start_idx * output_columns);
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
}); });
} }
@ -374,22 +326,6 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
return output; return output;
} }
static Tensor qembeddingbag_byte_prepack_with_rowwise_min_max(
const Tensor& weight,
const Tensor& rowwise_min_max) {
const auto weight_contig =
weight.expect_contiguous(weight.suggest_memory_format());
Tensor output = at::detail::empty_cpu(
{0},
at::kByte,
weight_contig->layout(),
weight_contig->device(),
std::nullopt,
std::nullopt);
qembeddingbag_byte_prepack_out(output, weight, rowwise_min_max);
return output;
}
Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) { Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) {
const auto weight_contig = const auto weight_contig =
weight.expect_contiguous(weight.suggest_memory_format()); weight.expect_contiguous(weight.suggest_memory_format());
@ -399,7 +335,7 @@ Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) {
"'embedding_bag_byte_prepack' only support float32 or float16."); "'embedding_bag_byte_prepack' only support float32 or float16.");
const auto weight_sizes = weight.sym_sizes(); const auto weight_sizes = weight.sym_sizes();
const auto cols_dim = weight.ndimension() - 1; const auto cols_dim = weight.ndimension() - 1;
const auto& embedding_cols = weight_sizes[cols_dim]; const auto embedding_cols = weight_sizes[cols_dim];
// Add 8 bytes per column to store FP32 scale and zero_point per row. // Add 8 bytes per column to store FP32 scale and zero_point per row.
const auto output_columns = embedding_cols + 2 * sizeof(float); const auto output_columns = embedding_cols + 2 * sizeof(float);
@ -423,8 +359,7 @@ Tensor _qembeddingbag_nbit_prepack_helper(
int bit_width, int bit_width,
const bool optimized_qparams, const bool optimized_qparams,
const int64_t nbins, const int64_t nbins,
const double ratio, const double ratio) {
const std::optional<Tensor>& rowwise_min_max_opt = std::nullopt) {
TORCH_CHECK( TORCH_CHECK(
weight.scalar_type() == at::ScalarType::Float || weight.scalar_type() == at::ScalarType::Float ||
weight.scalar_type() == at::ScalarType::Half, weight.scalar_type() == at::ScalarType::Half,
@ -466,17 +401,10 @@ Tensor _qembeddingbag_nbit_prepack_helper(
auto* output_data = output.data_ptr<uint8_t>(); auto* output_data = output.data_ptr<uint8_t>();
#ifdef USE_FBGEMM #ifdef USE_FBGEMM
// Move these outside of the ifdef when we support non-FBGEMM flow.
const auto is_valid_rowwise_min_max = _validate_rowwise_min_max(weight, rowwise_min_max_opt);
const auto rowwise_min_max_contig = _get_rowwise_min_max_contig(rowwise_min_max_opt);
if (!optimized_qparams) { if (!optimized_qparams) {
if (weight_contig.scalar_type() == at::ScalarType::Half) { if (weight_contig.scalar_type() == at::ScalarType::Half) {
const auto weight_data = const auto weight_data =
static_cast<fbgemm::float16*>(weight_contig.data_ptr()); static_cast<fbgemm::float16*>(weight_contig.data_ptr());
const auto rowwise_min_max_data = is_valid_rowwise_min_max
? static_cast<fbgemm::float16*>(rowwise_min_max_contig->data_ptr())
: nullptr;
at::parallel_for( at::parallel_for(
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) { 0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf< fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<
@ -485,13 +413,10 @@ Tensor _qembeddingbag_nbit_prepack_helper(
weight_data + start_idx * embedding_cols, weight_data + start_idx * embedding_cols,
end_idx - start_idx, end_idx - start_idx,
static_cast<int>(embedding_cols), static_cast<int>(embedding_cols),
output_data + start_idx * output_shape[1], output_data + start_idx * output_shape[1]);
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
}); });
} else { } else {
const auto weight_data = weight_contig.data_ptr<float>(); const auto weight_data = weight_contig.data_ptr<float>();
const auto rowwise_min_max_data =
is_valid_rowwise_min_max ? rowwise_min_max_contig->data_ptr<float>() : nullptr;
at::parallel_for( at::parallel_for(
0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) { 0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<float>( fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<float>(
@ -499,8 +424,7 @@ Tensor _qembeddingbag_nbit_prepack_helper(
weight_data + start_idx * embedding_cols, weight_data + start_idx * embedding_cols,
end_idx - start_idx, end_idx - start_idx,
static_cast<int>(embedding_cols), static_cast<int>(embedding_cols),
output_data + start_idx * output_shape[1], output_data + start_idx * output_shape[1]);
(is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
}); });
} }
} else { } else {
@ -590,16 +514,6 @@ Tensor qembeddingbag_4bit_prepack(
weight, 4 /*bit_width*/, optimized_qparams, nbins, ratio); weight, 4 /*bit_width*/, optimized_qparams, nbins, ratio);
} }
Tensor qembeddingbag_4bit_prepack_with_rowwise_min_max(
const Tensor& weight,
const Tensor& rowwise_min_max,
const bool optimized_qparams,
const int64_t nbins,
const double ratio) {
return _qembeddingbag_nbit_prepack_helper(
weight, 4 /*bit_width*/, optimized_qparams, nbins, ratio, rowwise_min_max);
}
// Applies 2-bit row-wise quantization by determining the range // Applies 2-bit row-wise quantization by determining the range
// (maximum - minimum) and bias (minimum value) of each row in the input // (maximum - minimum) and bias (minimum value) of each row in the input
// matrix, and then scaling each element to an 2-bit number between 0 and // matrix, and then scaling each element to an 2-bit number between 0 and
@ -617,16 +531,6 @@ Tensor qembeddingbag_2bit_prepack(
weight, 2 /*bit_width*/, optimized_qparams, nbins, ratio); weight, 2 /*bit_width*/, optimized_qparams, nbins, ratio);
} }
Tensor qembeddingbag_2bit_prepack_with_rowwise_min_max(
const Tensor& weight,
const Tensor& rowwise_min_max,
const bool optimized_qparams,
const int64_t nbins,
const double ratio) {
return _qembeddingbag_nbit_prepack_helper(
weight, 2 /*bit_width*/, optimized_qparams, nbins, ratio, rowwise_min_max);
}
class QEmbeddingPackWeights final { class QEmbeddingPackWeights final {
public: public:
static c10::intrusive_ptr<EmbeddingPackedParamsBase> run(const at::Tensor& weight) { static c10::intrusive_ptr<EmbeddingPackedParamsBase> run(const at::Tensor& weight) {
@ -638,21 +542,12 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) {
m.impl( m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack"), TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack"),
TORCH_FN(qembeddingbag_byte_prepack)); TORCH_FN(qembeddingbag_byte_prepack));
m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack_with_rowwise_min_max"),
TORCH_FN(qembeddingbag_byte_prepack_with_rowwise_min_max));
m.impl( m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack"), TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack"),
TORCH_FN(qembeddingbag_4bit_prepack)); TORCH_FN(qembeddingbag_4bit_prepack));
m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack_with_rowwise_min_max"),
TORCH_FN(qembeddingbag_4bit_prepack_with_rowwise_min_max));
m.impl( m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack"), TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack"),
TORCH_FN(qembeddingbag_2bit_prepack)); TORCH_FN(qembeddingbag_2bit_prepack));
m.impl(
TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack_with_rowwise_min_max"),
TORCH_FN(qembeddingbag_2bit_prepack_with_rowwise_min_max));
} }
TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {

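Throughout this file the byte-prepack layout is: each quantized row stores its uint8 values followed by an FP32 scale and an FP32 bias, which is where the extra 2 * sizeof(float) output columns come from. A standalone sketch of that rowwise scheme for a single non-empty row, including the optional caller-supplied {min, max} pair that the *_with_rowwise_min_max variants plumb through; the choice of bias = row minimum is an assumption of this sketch, not spelled out in the diff:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

// Quantize one float row to a "fused 8-bit rowwise" layout:
// [ uint8 values ... ][ fp32 scale ][ fp32 bias ].
// If minmax is non-null it supplies {row_min, row_max}; otherwise both are
// computed from the row itself.
std::vector<uint8_t> quantize_row_fused8(const std::vector<float>& row,
                                         const float* minmax = nullptr) {
  float lo, hi;
  if (minmax) {
    lo = minmax[0];
    hi = minmax[1];
  } else {
    lo = *std::min_element(row.begin(), row.end());
    hi = *std::max_element(row.begin(), row.end());
  }
  const float scale = (hi - lo) / 255.0f;
  const float inv_scale = (scale == 0.0f) ? 1.0f : 1.0f / scale;
  std::vector<uint8_t> out(row.size() + 2 * sizeof(float));
  for (size_t i = 0; i < row.size(); ++i) {
    const float q = std::nearbyint((row[i] - lo) * inv_scale);
    out[i] = static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, q)));
  }
  std::memcpy(out.data() + row.size(), &scale, sizeof(float));              // scale
  std::memcpy(out.data() + row.size() + sizeof(float), &lo, sizeof(float)); // bias (row min)
  return out;
}
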
View File

@ -3,10 +3,7 @@
namespace at::native { namespace at::native {
Tensor& qembeddingbag_byte_prepack_out( Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight);
Tensor& output,
const Tensor& weight,
const std::optional<Tensor>& rowwise_min_max_opt = std::nullopt);
Tensor qembeddingbag_byte_prepack(const Tensor& weight); Tensor qembeddingbag_byte_prepack(const Tensor& weight);

View File

@ -121,12 +121,9 @@ TORCH_LIBRARY(quantized, m) {
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag});
m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag});
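
The schema strings here are the declaration side of the operators whose kernels are bound with m.impl in the TORCH_LIBRARY_IMPL block of the previous file. A minimal sketch of that def/impl split using a hypothetical namespace and operator (example_ns::my_scale does not exist in this diff):

#include <ATen/ATen.h>
#include <torch/library.h>

// Hypothetical operator used only to illustrate the def/impl split:
// the schema is declared once, then the CPU kernel is registered against it.
static at::Tensor my_scale(const at::Tensor& x, double factor) {
  return x * factor;
}

TORCH_LIBRARY(example_ns, m) {
  m.def("my_scale(Tensor x, float factor) -> Tensor");
}

TORCH_LIBRARY_IMPL(example_ns, CPU, m) {
  m.impl("my_scale", TORCH_FN(my_scale));
}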

Some files were not shown because too many files have changed in this diff.