Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-27 17:54:55 +08:00)

Compare commits: csl/print_ ... new-codege (3 commits)

| SHA1 |
|---|
| 25e1e99fcd |
| f803a57a7a |
| 39287c561b |
@@ -181,7 +181,7 @@ case "$tag" in
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100"
    PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
    if [[ $tag =~ "benchmarks" ]]; then
      INDUCTOR_BENCHMARKS=yes
    fi
@@ -344,7 +344,7 @@ docker build \
  --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
  --build-arg "KATEX=${KATEX:-}" \
  --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
  --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" \
  --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
  --build-arg "IMAGE_NAME=${IMAGE_NAME}" \
  --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
  --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
@@ -1 +1 @@
7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
27664085f804afc83df26f740bb46c365854f2c4
@@ -46,9 +46,9 @@ case ${DOCKER_TAG_PREFIX} in
    BASE_TARGET=rocm
    GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
    # add gfx950, gfx115x conditionally starting in ROCm 7.0
    # add gfx950 conditionally starting in ROCm 7.0
    if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
      PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
      PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
    fi
    DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
    ;;
@@ -115,9 +115,6 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
# cmake-3.28.0 from pip for onnxruntime
RUN python3 -mpip install cmake==3.28.0

ADD ./common/patch_libstdc.sh patch_libstdc.sh
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh

# build onnxruntime 1.21.0 from sources.
# it is not possible to build it from sources using pip,
# so just build it from upstream repository.
@@ -84,9 +84,9 @@ case ${image} in
    DEVTOOLSET_VERSION="11"
    GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
    # add gfx950, gfx115x conditionally starting in ROCm 7.0
    # add gfx950 conditionally starting in ROCm 7.0
    if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
      PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
      PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
    fi
    DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
    ;;
@@ -10,6 +10,11 @@ BAD_SSL = "https://self-signed.badssl.com"

print("Testing SSL certificate checking for Python:", sys.version)

if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
    print("This version never checks SSL certs; skipping tests")
    sys.exit(0)


EXC = OSError

print(f"Connecting to {GOOD_SSL} should work")
@@ -120,8 +120,9 @@ ninja==1.11.1.4
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.55.2, 0.60.0
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
#For numba issue see https://github.com/pytorch/pytorch/issues/51511
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073

#numpy
@@ -241,9 +242,10 @@ pygments==2.15.0
#Pinned versions: 14.1.0
#test that import:

scikit-image==0.22.0
scikit-image==0.19.3 ; python_version < "3.10"
scikit-image==0.22.0 ; python_version >= "3.10"
#Description: image processing routines
#Pinned versions: 0.22.0
#Pinned versions:
#test that import: test_nn.py

#scikit-learn
@ -5,7 +5,7 @@ DESIRED_ROCM ?= 7.0
|
||||
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
|
||||
PACKAGE_NAME = magma-rocm
|
||||
# inherit this from underlying docker image, do not pass this env var to docker
|
||||
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201
|
||||
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
|
||||
|
||||
DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
|
||||
-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
|
||||
@ -18,6 +18,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
|
||||
.PHONY: all
|
||||
all: magma-rocm70
|
||||
all: magma-rocm64
|
||||
all: magma-rocm63
|
||||
|
||||
.PHONY:
|
||||
clean:
|
||||
@ -33,3 +34,8 @@ magma-rocm70:
|
||||
magma-rocm64: DESIRED_ROCM := 6.4
|
||||
magma-rocm64:
|
||||
$(DOCKER_RUN)
|
||||
|
||||
.PHONY: magma-rocm63
|
||||
magma-rocm63: DESIRED_ROCM := 6.3
|
||||
magma-rocm63:
|
||||
$(DOCKER_RUN)
|
||||
|
||||
@ -233,9 +233,7 @@ if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
|
||||
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
|
||||
fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *-full-debug* ]]; then
|
||||
export CMAKE_BUILD_TYPE=Debug
|
||||
elif [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
|
||||
export CMAKE_BUILD_TYPE=RelWithAssert
|
||||
fi
|
||||
|
||||
@ -301,11 +299,6 @@ else
|
||||
python -m build --wheel --no-isolation
|
||||
fi
|
||||
pip_install_whl "$(echo dist/*.whl)"
|
||||
if [[ "$BUILD_ENVIRONMENT" == *full-debug* ]]; then
|
||||
# Regression test for https://github.com/pytorch/pytorch/issues/164297
|
||||
# Torch should be importable and that's about it
|
||||
pushd /; python -c "import torch;print(torch.__config__.show(), torch.randn(5) + 1.7)"; popd
|
||||
fi
|
||||
|
||||
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *vision* ]]; then
|
||||
install_torchvision
|
||||
|
||||
@ -67,7 +67,7 @@ fi
|
||||
# wheels with cxx11-abi
|
||||
|
||||
echo "Checking that the gcc ABI is what we expect"
|
||||
if [[ "$(uname)" != 'Darwin' ]]; then
|
||||
if [[ "$(uname)" != 'Darwin' && "$(uname -m)" != "s390x" ]]; then
|
||||
# We also check that there are cxx11 symbols in libtorch
|
||||
#
|
||||
echo "Checking that symbols in libtorch.so have the right gcc abi"
|
||||
|
||||
@ -256,7 +256,7 @@ test_torchbench_smoketest() {
|
||||
local device=mps
|
||||
local dtypes=(undefined float16 bfloat16 notset)
|
||||
local dtype=${dtypes[$1]}
|
||||
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
|
||||
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
|
||||
|
||||
for backend in eager inductor; do
|
||||
|
||||
@ -319,7 +319,7 @@ test_aoti_torchbench_smoketest() {
|
||||
local device=mps
|
||||
local dtypes=(undefined float16 bfloat16 notset)
|
||||
local dtype=${dtypes[$1]}
|
||||
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
|
||||
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
|
||||
|
||||
echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
|
||||
local dtype_arg="--${dtype}"
|
||||
|
||||
@ -337,13 +337,13 @@ test_python() {
|
||||
|
||||
test_python_smoke() {
|
||||
# Smoke tests for H100/B200
|
||||
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
test_python_smoke_b200() {
|
||||
# Targeted smoke tests for B200 - staged approach to avoid too many failures
|
||||
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
@ -838,7 +838,7 @@ test_dynamo_benchmark() {
|
||||
elif [[ "${suite}" == "timm_models" ]]; then
|
||||
export TORCHBENCH_ONLY_MODELS="inception_v3"
|
||||
elif [[ "${suite}" == "torchbench" ]]; then
|
||||
export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
|
||||
export TORCHBENCH_ONLY_MODELS="hf_Bert"
|
||||
fi
|
||||
fi
|
||||
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
|
||||
@ -869,13 +869,13 @@ test_inductor_torchbench_smoketest_perf() {
|
||||
mkdir -p "$TEST_REPORTS_DIR"
|
||||
|
||||
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
|
||||
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \
|
||||
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
|
||||
--output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
|
||||
# The threshold value needs to be actively maintained to make this check useful
|
||||
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
|
||||
|
||||
# Check memory compression ratio for a few models
|
||||
for test in BERT_pytorch yolov3; do
|
||||
for test in hf_Albert timm_vision_transformer; do
|
||||
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
|
||||
--disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
|
||||
--only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
|
||||
@ -886,7 +886,7 @@ test_inductor_torchbench_smoketest_perf() {
|
||||
done
|
||||
|
||||
# Perform some "warm-start" runs for a few huggingface models.
|
||||
for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
|
||||
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
|
||||
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
|
||||
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
|
||||
python benchmarks/dynamo/check_accuracy.py \
|
||||
|
||||
@ -15,35 +15,37 @@ if errorlevel 1 exit /b 1
|
||||
if not errorlevel 0 exit /b 1
|
||||
|
||||
cd %TMP_DIR_WIN%\build\torch\test
|
||||
|
||||
:: Enable delayed variable expansion to make the list
|
||||
setlocal enabledelayedexpansion
|
||||
set EXE_LIST=
|
||||
for /r "." %%a in (*.exe) do (
|
||||
if "%%~na" == "c10_intrusive_ptr_benchmark" (
|
||||
@REM NB: This is not a gtest executable file, thus couldn't be handled by
|
||||
@REM pytest-cpp and is excluded from test discovery by run_test
|
||||
call "%%~fa"
|
||||
call :libtorch_check "%%~na" "%%~fa"
|
||||
if errorlevel 1 goto fail
|
||||
if not errorlevel 0 goto fail
|
||||
) else (
|
||||
if "%%~na" == "verify_api_visibility" (
|
||||
@REM Skip verify_api_visibility as it is a compile-level test
|
||||
) else (
|
||||
set EXE_LIST=!EXE_LIST! cpp/%%~na
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
goto :eof
|
||||
|
||||
:libtorch_check
|
||||
|
||||
cd %CWD%
|
||||
set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test
|
||||
|
||||
:: Run python test\run_test.py on the list
|
||||
set NO_TD=True && python test\run_test.py --cpp --verbose -i !EXE_LIST!
|
||||
if errorlevel 1 goto fail
|
||||
if not errorlevel 0 goto fail
|
||||
:: Skip verify_api_visibility as it a compile level test
|
||||
if "%~1" == "verify_api_visibility" goto :eof
|
||||
|
||||
goto :eof
|
||||
echo Running "%~2"
|
||||
if "%~1" == "c10_intrusive_ptr_benchmark" (
|
||||
:: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp
|
||||
call "%~2"
|
||||
goto :eof
|
||||
)
|
||||
|
||||
python test\run_test.py --cpp --verbose -i "cpp/%~1"
|
||||
if errorlevel 1 (
|
||||
echo %1 failed with exit code %errorlevel%
|
||||
goto fail
|
||||
)
|
||||
if not errorlevel 0 (
|
||||
echo %1 failed with exit code %errorlevel%
|
||||
goto fail
|
||||
)
|
||||
|
||||
:eof
|
||||
exit /b 0
|
||||
|
||||
@ -38,7 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
|
||||
fi
|
||||
|
||||
# TODO: Move this to .ci/docker/requirements-ci.txt
|
||||
python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2"
|
||||
python -m pip install "psutil==5.9.1" "pynvml==11.4.1" "pytest-shard==0.1.2"
|
||||
|
||||
run_tests() {
|
||||
# Run nvidia-smi if available
|
||||
|
||||
@ -71,7 +71,14 @@ export PYTORCH_BUILD_NUMBER=1
|
||||
|
||||
# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
|
||||
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux'"
|
||||
|
||||
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
|
||||
# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
|
||||
if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux'"
|
||||
fi
|
||||
|
||||
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
|
||||
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
|
||||
|
||||
.flake8 (2 changes)
@@ -12,7 +12,7 @@ ignore =
    # to line this up with executable bit
    EXE001,
    # these ignores are from flake8-bugbear; please fix!
    B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910
    B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910
    # these ignores are from flake8-comprehensions; please fix!
    C407,
    # these ignores are from flake8-logging-format; please fix!
.github/actions/linux-test/action.yml (vendored, 2 changes)
@ -274,6 +274,8 @@ runs:
|
||||
-w /var/lib/jenkins/workspace \
|
||||
"${DOCKER_IMAGE}"
|
||||
)
|
||||
# Propagate download.pytorch.org IP to container
|
||||
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
|
||||
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
|
||||
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
|
||||
|
||||
|
||||
.github/actions/setup-linux/action.yml (vendored, 35 changes)
@ -28,10 +28,6 @@ runs:
|
||||
echo "instance-type: $(get_ec2_metadata instance-type)"
|
||||
echo "system info $(uname -a)"
|
||||
|
||||
- name: Print GPU info (if present)
|
||||
shell: bash
|
||||
run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi
|
||||
|
||||
- name: Check if in a container runner
|
||||
shell: bash
|
||||
id: check_container_runner
|
||||
@ -86,6 +82,37 @@ runs:
|
||||
# Prune all of the docker images
|
||||
docker system prune -af
|
||||
|
||||
- name: Manually resolve download.pytorch.org
|
||||
shell: bash
|
||||
continue-on-error: true
|
||||
run: |
|
||||
set +e
|
||||
set -x
|
||||
|
||||
PT_DOMAIN=download.pytorch.org
|
||||
# TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400,
|
||||
# cleaning this up once the issue is fixed. There are more than one resolved IP here, the last
|
||||
# one is returned at random
|
||||
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)
|
||||
|
||||
if [ -z "${RESOLVED_IP}" ]; then
|
||||
echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
|
||||
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)
|
||||
|
||||
if [ -z "${RESOLVED_IP}" ]; then
|
||||
echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if grep -r "${PT_DOMAIN}" /etc/hosts; then
|
||||
# Clean up any old records first
|
||||
sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts
|
||||
fi
|
||||
|
||||
echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
|
||||
cat /etc/hosts
|
||||
|
||||
- name: Check that the docker daemon is running
|
||||
shell: bash
|
||||
continue-on-error: true
|
||||
|
||||
@ -33,6 +33,10 @@ runs:
|
||||
)
|
||||
|
||||
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
|
||||
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then
|
||||
# Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
|
||||
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
|
||||
fi
|
||||
|
||||
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
|
||||
# Generate test script
|
||||
|
||||
.github/pytorch-probot.yml (vendored, 1 change)
@ -30,7 +30,6 @@ ciflow_push_tags:
|
||||
- ciflow/riscv64
|
||||
- ciflow/rocm
|
||||
- ciflow/rocm-mi300
|
||||
- ciflow/rocm-mi355
|
||||
- ciflow/s390
|
||||
- ciflow/slow
|
||||
- ciflow/torchbench
|
||||
|
||||
.github/scripts/drci_mocks.json.gz (vendored, binary file not shown)
.github/scripts/github_utils.py (vendored, 1 change)
@ -18,7 +18,6 @@ class GitHubComment:
|
||||
body_text: str
|
||||
created_at: str
|
||||
author_login: str
|
||||
author_url: Optional[str]
|
||||
author_association: str
|
||||
editor_login: Optional[str]
|
||||
database_id: int
|
||||
|
||||
.github/scripts/gql_mocks.json.gz (vendored, binary file not shown)
.github/scripts/test_check_labels.py (vendored, 2 changes)
@ -38,7 +38,6 @@ def mock_get_comments() -> list[GitHubComment]:
|
||||
body_text="mock_body_text",
|
||||
created_at="",
|
||||
author_login="",
|
||||
author_url=None,
|
||||
author_association="",
|
||||
editor_login=None,
|
||||
database_id=1,
|
||||
@ -49,7 +48,6 @@ def mock_get_comments() -> list[GitHubComment]:
|
||||
body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
|
||||
created_at="",
|
||||
author_login=BOT_AUTHORS[1],
|
||||
author_url=None,
|
||||
author_association="",
|
||||
editor_login=None,
|
||||
database_id=2,
|
||||
|
||||
.github/scripts/test_trymerge.py (vendored, 18 changes)
@ -32,7 +32,6 @@ from trymerge import (
|
||||
main as trymerge_main,
|
||||
MandatoryChecksMissingError,
|
||||
MergeRule,
|
||||
PostCommentError,
|
||||
RE_GHSTACK_DESC,
|
||||
read_merge_rules,
|
||||
remove_job_name_suffix,
|
||||
@ -589,23 +588,6 @@ class TestTryMerge(TestCase):
|
||||
self.assertEqual(mock_merge_base, pr.get_merge_base())
|
||||
mocked_gh_fetch_merge_base.assert_called_once()
|
||||
|
||||
def test_app_can_revert(self, *args: Any) -> None:
|
||||
pr = GitHubPR("pytorch", "pytorch", 164660)
|
||||
repo = DummyGitRepo()
|
||||
app_comment_id, impostor_comment_id = 3375785595, 3377647892
|
||||
# Check that app can revert
|
||||
self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id))
|
||||
# But impostor can not
|
||||
self.assertRaises(
|
||||
PostCommentError,
|
||||
lambda: validate_revert(repo, pr, comment_id=impostor_comment_id),
|
||||
)
|
||||
# Despite it's name being the name of the bot
|
||||
self.assertEqual(
|
||||
pr.get_comment_by_id(impostor_comment_id).author_login,
|
||||
"pytorch-auto-revert",
|
||||
)
|
||||
|
||||
|
||||
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
|
||||
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
|
||||
|
||||
.github/scripts/trymerge.py (vendored, 7 changes)
@ -234,7 +234,6 @@ query ($owner: String!, $name: String!, $number: Int!) {
|
||||
createdAt
|
||||
author {
|
||||
login
|
||||
url
|
||||
}
|
||||
authorAssociation
|
||||
editor {
|
||||
@ -1094,7 +1093,6 @@ class GitHubPR:
|
||||
body_text=node["bodyText"],
|
||||
created_at=node["createdAt"] if "createdAt" in node else "",
|
||||
author_login=node["author"]["login"],
|
||||
author_url=node["author"].get("url", None),
|
||||
author_association=node["authorAssociation"],
|
||||
editor_login=editor["login"] if editor else None,
|
||||
database_id=node["databaseId"],
|
||||
@ -2031,11 +2029,6 @@ def validate_revert(
|
||||
# For some reason, one can not be a member of private repo, only CONTRIBUTOR
|
||||
if pr.is_base_repo_private():
|
||||
allowed_reverters.append("CONTRIBUTOR")
|
||||
# Special case the pytorch-auto-revert app, whose does not have association
|
||||
# But should be able to issue revert command
|
||||
if comment.author_url == "https://github.com/apps/pytorch-auto-revert":
|
||||
allowed_reverters.append("NONE")
|
||||
|
||||
if author_association not in allowed_reverters:
|
||||
raise PostCommentError(
|
||||
f"Will not revert as @{author_login} is not one of "
|
||||
|
||||
.github/workflows/_linux-test.yml (vendored, 2 changes)
@ -389,6 +389,8 @@ jobs:
|
||||
"${DOCKER_IMAGE}" \
|
||||
${DOCKER_SHELL_CMD}
|
||||
)
|
||||
# Propagate download.pytorch.org IP to container
|
||||
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
|
||||
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
|
||||
|
||||
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
|
||||
|
||||
.github/workflows/h100-distributed.yml (vendored, 2 changes)
@ -37,7 +37,7 @@ jobs:
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: "linux.c7i.12xlarge"
|
||||
runner: "linux.12xlarge"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '9.0'
|
||||
|
||||
@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: 15 0 * * 1-6
|
||||
- cron: 15 0,12 * * 1-6
|
||||
- cron: 0 7 * * 0
|
||||
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
|
||||
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
|
||||
|
||||
@ -63,7 +63,6 @@ jobs:
|
||||
# Same as the build job
|
||||
python-version: 3.12.7
|
||||
test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
|
||||
timeout-minutes: 300
|
||||
disable-monitor: false
|
||||
monitor-log-interval: 15
|
||||
monitor-data-collect-interval: 4
|
||||
|
||||
.github/workflows/rocm-mi355.yml (vendored, 7 changes)
@ -1,9 +1,6 @@
|
||||
name: rocm-mi355
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/rocm-mi355/*
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT
|
||||
@ -67,7 +64,5 @@ jobs:
|
||||
build-environment: linux-noble-rocm-py3.12-mi355
|
||||
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
|
||||
tests-to-include: >-
|
||||
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor test_matmul_cuda test_scaled_matmul_cuda'
|
||||
|| '' }}
|
||||
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
|
||||
secrets: inherit
|
||||
|
||||
.github/workflows/rocm.yml (vendored, 26 changes)
@ -59,29 +59,3 @@ jobs:
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-gfx1100-test:
|
||||
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3_10-gfx1100
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
]}
|
||||
tests-to-include: >
|
||||
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
|
||||
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
|
||||
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
|
||||
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
|
||||
inductor/test_flex_attention inductor/test_max_autotune
|
||||
secrets: inherit
|
||||
|
||||
.github/workflows/trunk.yml (vendored, 13 changes)
@ -56,7 +56,7 @@ jobs:
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
build-generates-artifacts: false
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: "linux.c7i.4xlarge"
|
||||
runner: "linux.4xlarge"
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 1 },
|
||||
@ -249,14 +249,3 @@ jobs:
|
||||
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3_10-gcc11-full-debug-build-only:
|
||||
name: linux-jammy-py3.10-gcc11-full-debug-build-only
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: linux.2xlarge.memory
|
||||
build-environment: linux-jammy-py3.10-gcc11-full-debug-build-only
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
|
||||
secrets: inherit
|
||||
|
||||
.github/workflows/xpu.yml (vendored, 4 changes)
@ -35,7 +35,7 @@ jobs:
|
||||
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
|
||||
build-environment: linux-jammy-xpu-n-1-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
|
||||
runner: linux.c7i.12xlarge
|
||||
runner: linux.12xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
@ -56,7 +56,7 @@ jobs:
|
||||
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
|
||||
build-environment: linux-jammy-xpu-n-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
|
||||
runner: linux.c7i.12xlarge
|
||||
runner: linux.12xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
|
||||
.gitignore (vendored, 2 changes)
@ -88,7 +88,7 @@ torch_compile_debug/
|
||||
# Listed manually because some files in this directory are not generated
|
||||
torch/testing/_internal/generated/annotated_fn_args.py
|
||||
torch/testing/_internal/data/*.pt
|
||||
torch/headeronly/version.h
|
||||
torch/csrc/api/include/torch/version.h
|
||||
torch/csrc/cudnn/cuDNN.cpp
|
||||
torch/csrc/generated
|
||||
torch/csrc/generic/TensorMethods.cpp
|
||||
|
||||
@ -28,7 +28,7 @@ exclude_patterns = [
|
||||
'torch/lib/**',
|
||||
'venv/**',
|
||||
'**/*.pyi',
|
||||
"tools/experimental/torchfuzz/**",
|
||||
"tools/experimental/dynamic_shapes/torchfuzz/**",
|
||||
'tools/test/test_selective_build.py',
|
||||
]
|
||||
command = [
|
||||
@ -198,7 +198,7 @@ exclude_patterns = [
|
||||
'tools/test/gen_operators_yaml_test.py',
|
||||
'tools/test/gen_oplist_test.py',
|
||||
'tools/test/test_selective_build.py',
|
||||
'tools/experimental/torchfuzz/**',
|
||||
'tools/experimental/dynamic_shapes/torchfuzz/**',
|
||||
]
|
||||
command = [
|
||||
'python3',
|
||||
|
||||
@ -13,9 +13,6 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt
|
||||
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
|
||||
load("//:tools/bazel.bzl", "rules")
|
||||
|
||||
# Export files for use by torch/headeronly (where version.h generation now lives)
|
||||
exports_files(["version.txt"])
|
||||
|
||||
define_targets(rules = rules)
|
||||
|
||||
COMMON_COPTS = [
|
||||
@ -693,9 +690,7 @@ cc_library(
|
||||
"torch/csrc/*/generated/*.h",
|
||||
"torch/csrc/jit/serialization/mobile_bytecode_generated.h",
|
||||
] + torch_cuda_headers,
|
||||
) + GENERATED_AUTOGRAD_CPP + [
|
||||
"//torch/headeronly:version_h",
|
||||
],
|
||||
) + GENERATED_AUTOGRAD_CPP + [":version_h"],
|
||||
includes = [
|
||||
"third_party/kineto/libkineto/include",
|
||||
"torch/csrc",
|
||||
|
||||
@ -388,9 +388,9 @@ cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker
|
||||
|
||||
option(USE_MIMALLOC "Use mimalloc" OFF)
|
||||
# Enable third party mimalloc library to improve memory allocation performance
|
||||
# on Windows and AArch64.
|
||||
# on Windows.
|
||||
option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF)
|
||||
if(WIN32 OR (CPU_AARCH64 AND NOT APPLE))
|
||||
if(WIN32)
|
||||
set(USE_MIMALLOC ON)
|
||||
|
||||
# Not enable USE_MIMALLOC_ON_MKL due to it caused issue:
|
||||
|
||||
@ -28,19 +28,4 @@ inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) {
|
||||
return stream << BlasBackendToString(backend);
|
||||
}
|
||||
|
||||
namespace blas {
|
||||
|
||||
enum class ScalingType : std::uint8_t {
|
||||
TensorWise, // fp32 scales
|
||||
RowWise, // fp32 scales
|
||||
BlockWise1x16, // fp8_e4m3fn scales
|
||||
BlockWise1x32, // fp8_e8m0fnu scales
|
||||
BlockWise1x128, // fp32 scales
|
||||
BlockWise128x128, // fp32 scales
|
||||
};
|
||||
|
||||
enum class SwizzleType : std::uint8_t { NO_SWIZZLE = 0, SWIZZLE_32_4_4 = 1 };
|
||||
|
||||
} // namespace blas
|
||||
|
||||
} // namespace at
|
||||
|
||||
@ -144,7 +144,8 @@ inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
|
||||
inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
|
||||
checkDeviceType("CPU_tensor_apply", tensors, kCPU);
|
||||
checkLayout("CPU_tensor_apply", tensors, kStrided);
|
||||
TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors));
|
||||
if (!_all_equal_numel(tensors))
|
||||
TORCH_CHECK(false, _all_equal_numel_error(tensors));
|
||||
// An empty tensor has no elements
|
||||
for (auto& t : tensors)
|
||||
if (t.numel() == 0)
|
||||
|
||||
@ -483,8 +483,8 @@ at::BlasBackend Context::blasPreferredBackend() {
|
||||
#if ROCM_VERSION >= 60300
|
||||
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
|
||||
#endif
|
||||
#if ROCM_VERSION >= 70000
|
||||
"gfx950", "gfx1150", "gfx1151"
|
||||
#if ROCM_VERSION >= 60500
|
||||
"gfx950"
|
||||
#endif
|
||||
};
|
||||
for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
|
||||
@@ -587,33 +587,20 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
  rocm_fa_preferred_backend = b;
}

CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const {
bool Context::allowFP16ReductionCuBLAS() const {
  return allow_fp16_reduction_cublas;
}

CuBLASReductionOption inline get_reduction_option(bool allow_reduced_precision, bool allow_splitk) {
  TORCH_CHECK(
      !(allow_reduced_precision && !allow_splitk),
      "allow_splitk=False is not supported when reduced precision reductions are enabled");
  if (allow_reduced_precision) {
    return CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
  } else if (allow_splitk) {
    return CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK;
  } else {
    return CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK;
  }
void Context::setAllowFP16ReductionCuBLAS(bool b) {
  allow_fp16_reduction_cublas = b;
}

void Context::setAllowFP16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
  allow_fp16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
}

CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const {
bool Context::allowBF16ReductionCuBLAS() const {
  return allow_bf16_reduction_cublas;
}

void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
  allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
void Context::setAllowBF16ReductionCuBLAS(bool b) {
  allow_bf16_reduction_cublas = b;
}

bool Context::allowFP16AccumulationCuBLAS() const {
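For reference, a minimal sketch (not part of the diff, and assuming the two-argument setter variant shown above is the one present in your build) of how a caller might drive these cuBLAS reduction controls through `at::globalContext()`; the enum mapping simply mirrors `get_reduction_option` as written in the hunk:

```cpp
#include <ATen/Context.h>
#include <iostream>

// Sketch only. get_reduction_option maps (allow_reduced_precision, allow_splitk) to:
//   (true,  true ) -> AllowReducedPrecisionWithSplitK
//   (false, true ) -> DisallowReducedPrecisionAllowSplitK
//   (false, false) -> DisallowReducedPrecisionDisallowSplitK
//   (true,  false) -> rejected by the TORCH_CHECK in the hunk above
int main() {
  auto& ctx = at::globalContext();
  // Keep fp16 GEMM reductions in full precision but still allow split-K kernels.
  ctx.setAllowFP16ReductionCuBLAS(/*allow_reduced_precision=*/false,
                                  /*allow_splitk=*/true);
  // Print the currently selected option (enum value in the new variant).
  std::cout << static_cast<int>(ctx.allowFP16ReductionCuBLAS()) << "\n";
  return 0;
}
```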
@ -825,14 +812,6 @@ void Context::setDisplayVmapFallbackWarnings(bool enabled) {
|
||||
display_vmap_fallback_warnings_ = enabled;
|
||||
}
|
||||
|
||||
bool Context::warnOnAccumulateGradStreamMismatch() const {
|
||||
return warn_on_accumulate_grad_stream_mismatch_;
|
||||
}
|
||||
|
||||
void Context::setWarnOnAccumulateGradStreamMismatch(bool enabled) {
|
||||
warn_on_accumulate_grad_stream_mismatch_ = enabled;
|
||||
}
|
||||
|
||||
bool Context::isDefaultMobileCPUAllocatorSet() {
|
||||
return prev_allocator_ptr_ != nullptr;
|
||||
}
|
||||
|
||||
@@ -38,12 +38,6 @@ namespace at {
class Tensor;

enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };

enum class CuBLASReductionOption : uint8_t {
  AllowReducedPrecisionWithSplitK = 0,
  DisallowReducedPrecisionAllowSplitK = 1,
  DisallowReducedPrecisionDisallowSplitK = 2,
};
enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN };
enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL };
enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 };
@ -226,15 +220,15 @@ class TORCH_API Context {
|
||||
bool userEnabledMkldnn() const;
|
||||
void setUserEnabledMkldnn(bool e);
|
||||
bool benchmarkCuDNN() const;
|
||||
void setBenchmarkCuDNN(bool /*b*/);
|
||||
void setBenchmarkCuDNN(bool);
|
||||
int benchmarkLimitCuDNN() const;
|
||||
void setBenchmarkLimitCuDNN(int /*b*/);
|
||||
void setBenchmarkLimitCuDNN(int);
|
||||
bool immediateMiopen() const;
|
||||
void setImmediateMiopen(bool /*b*/);
|
||||
void setImmediateMiopen(bool);
|
||||
bool deterministicCuDNN() const;
|
||||
void setDeterministicCuDNN(bool /*b*/);
|
||||
void setDeterministicCuDNN(bool);
|
||||
bool deterministicMkldnn() const;
|
||||
void setDeterministicMkldnn(bool /*b*/);
|
||||
void setDeterministicMkldnn(bool);
|
||||
bool userEnabledNNPACK() const;
|
||||
void setUserEnabledNNPACK(bool e);
|
||||
|
||||
@ -252,32 +246,32 @@ class TORCH_API Context {
|
||||
void setSDPPriorityOrder(const std::vector<int64_t>& order);
|
||||
std::array<at::SDPBackend, at::num_sdp_backends> sDPPriorityOrder();
|
||||
|
||||
void setSDPUseFlash(bool /*e*/);
|
||||
void setSDPUseFlash(bool);
|
||||
bool userEnabledFlashSDP() const;
|
||||
|
||||
void setSDPUseMemEfficient(bool /*e*/);
|
||||
void setSDPUseMemEfficient(bool);
|
||||
bool userEnabledMemEfficientSDP() const;
|
||||
|
||||
void setSDPUseMath(bool /*e*/);
|
||||
void setSDPUseMath(bool);
|
||||
bool userEnabledMathSDP() const;
|
||||
|
||||
void setSDPUseCuDNN(bool /*e*/);
|
||||
void setSDPUseCuDNN(bool);
|
||||
bool userEnabledCuDNNSDP() const;
|
||||
|
||||
void setAllowFP16BF16ReductionMathSDP(bool /*e*/);
|
||||
void setAllowFP16BF16ReductionMathSDP(bool);
|
||||
bool allowFP16BF16ReductionMathSDP() const;
|
||||
|
||||
void setSDPUseOverrideable(bool /*e*/);
|
||||
void setSDPUseOverrideable(bool);
|
||||
bool userEnabledOverrideableSDP() const;
|
||||
|
||||
at::LinalgBackend linalgPreferredBackend() const;
|
||||
void setLinalgPreferredBackend(at::LinalgBackend /*b*/);
|
||||
void setLinalgPreferredBackend(at::LinalgBackend);
|
||||
|
||||
at::BlasBackend blasPreferredBackend();
|
||||
void setBlasPreferredBackend(at::BlasBackend /*b*/);
|
||||
void setBlasPreferredBackend(at::BlasBackend);
|
||||
|
||||
at::ROCmFABackend getROCmFAPreferredBackend();
|
||||
void setROCmFAPreferredBackend(at::ROCmFABackend /*b*/);
|
||||
void setROCmFAPreferredBackend(at::ROCmFABackend);
|
||||
|
||||
// Note [Enabling Deterministic Operations]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -310,9 +304,9 @@ class TORCH_API Context {
|
||||
|
||||
bool deterministicAlgorithms() const;
|
||||
bool deterministicAlgorithmsWarnOnly() const;
|
||||
void setDeterministicAlgorithms(bool /*b*/, bool /*warn_only*/);
|
||||
void setDeterministicAlgorithms(bool, bool);
|
||||
bool deterministicFillUninitializedMemory() const;
|
||||
void setDeterministicFillUninitializedMemory(bool /*b*/);
|
||||
void setDeterministicFillUninitializedMemory(bool);
|
||||
|
||||
// Note [Writing Nondeterministic Operations]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -356,23 +350,19 @@ class TORCH_API Context {
|
||||
Float32Op op,
|
||||
Float32Precision p);
|
||||
bool allowTF32CuDNN(std::optional<Float32Op> op = std::nullopt) const;
|
||||
void setAllowTF32CuDNN(bool /*b*/);
|
||||
void setAllowTF32CuDNN(bool);
|
||||
bool allowTF32OneDNN() const;
|
||||
void setAllowTF32OneDNN(bool /*b*/);
|
||||
void setAllowTF32OneDNN(bool);
|
||||
bool allowTF32CuBLAS() const;
|
||||
void setAllowTF32CuBLAS(bool /*b*/);
|
||||
void setAllowTF32CuBLAS(bool);
|
||||
Float32MatmulPrecision float32MatmulPrecision() const;
|
||||
Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
|
||||
CuBLASReductionOption allowFP16ReductionCuBLAS() const;
|
||||
void setAllowFP16ReductionCuBLAS(
|
||||
bool allow_reduced_precision,
|
||||
bool allow_splitk = true);
|
||||
CuBLASReductionOption allowBF16ReductionCuBLAS() const;
|
||||
void setAllowBF16ReductionCuBLAS(
|
||||
bool allow_reduced_precision,
|
||||
bool allow_splitk = true);
|
||||
bool allowFP16ReductionCuBLAS() const;
|
||||
void setAllowFP16ReductionCuBLAS(bool);
|
||||
bool allowBF16ReductionCuBLAS() const;
|
||||
void setAllowBF16ReductionCuBLAS(bool);
|
||||
bool allowFP16AccumulationCuBLAS() const;
|
||||
void setAllowFP16AccumulationCuBLAS(bool /*b*/);
|
||||
void setAllowFP16AccumulationCuBLAS(bool);
|
||||
|
||||
// Matmuls can use a so-called "persistent" kernel which launches one CUDA
|
||||
// block for each SM on the GPU, and each block then iterates over multiple
|
||||
@ -384,7 +374,7 @@ class TORCH_API Context {
|
||||
// to make matmuls target only a subset of the SMs, so they can fully schedule
|
||||
// even next to a comms kernel, and only be a few percent slower.
|
||||
std::optional<int32_t> _SMCarveout_EXPERIMENTAL() const;
|
||||
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t> /*c*/);
|
||||
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t>);
|
||||
|
||||
at::QEngine qEngine() const;
|
||||
void setQEngine(at::QEngine e);
|
||||
@ -401,14 +391,11 @@ class TORCH_API Context {
|
||||
void setDisplayVmapFallbackWarnings(bool enabled);
|
||||
bool areVmapFallbackWarningsEnabled() const;
|
||||
|
||||
void setWarnOnAccumulateGradStreamMismatch(bool enabled);
|
||||
bool warnOnAccumulateGradStreamMismatch() const;
|
||||
|
||||
bool isDefaultMobileCPUAllocatorSet();
|
||||
void setDefaultMobileCPUAllocator();
|
||||
void unsetDefaultMobileCPUAllocator();
|
||||
bool allowFP16ReductionCPU() const;
|
||||
void setAllowFP16ReductionCPU(bool /*b*/);
|
||||
void setAllowFP16ReductionCPU(bool);
|
||||
|
||||
// Preserved for BC
|
||||
void lazyInitCUDA() {
|
||||
@ -465,10 +452,8 @@ class TORCH_API Context {
|
||||
: at::Float32MatmulPrecision::HIGHEST;
|
||||
int benchmark_limit_cudnn = 10;
|
||||
bool allow_tf32_cudnn = true;
|
||||
CuBLASReductionOption allow_fp16_reduction_cublas =
|
||||
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
|
||||
CuBLASReductionOption allow_bf16_reduction_cublas =
|
||||
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
|
||||
bool allow_fp16_reduction_cublas = true;
|
||||
bool allow_bf16_reduction_cublas = true;
|
||||
bool allow_fp16_accumulation_cublas = false;
|
||||
std::optional<int32_t> sm_carveout = std::nullopt;
|
||||
bool enabled_mkldnn = true;
|
||||
@ -494,7 +479,6 @@ class TORCH_API Context {
|
||||
bool release_original_weights = false;
|
||||
#endif
|
||||
bool display_vmap_fallback_warnings_ = false;
|
||||
bool warn_on_accumulate_grad_stream_mismatch_ = true;
|
||||
std::atomic<at::QEngine> quantized_engine = at::QEngine::NoQEngine;
|
||||
bool enable_sparse_tensor_invariant_checks = false;
|
||||
bool allow_fp16_reduction_cpu = false;
|
||||
|
||||
@ -16,8 +16,8 @@ inline void check_size_nonnegative(ArrayRef<int64_t> size) {
|
||||
|
||||
inline void check_size_nonnegative(ArrayRef<c10::SymInt> size) {
|
||||
for (const auto& x : size) {
|
||||
TORCH_SYM_CHECK(
|
||||
x.sym_ge(0),
|
||||
TORCH_CHECK(
|
||||
x.expect_size(__FILE__, __LINE__),
|
||||
"Trying to create tensor with negative dimension ",
|
||||
x,
|
||||
": ",
|
||||
|
||||
@ -62,7 +62,7 @@ constexpr const char* unknown_eventname = "eventname not specified";
|
||||
#endif
|
||||
} // namespace (anonymous)
|
||||
|
||||
MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, int flags, size_t size)
|
||||
MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags, size_t size)
|
||||
: filename_(filename.empty() ? unknown_filename : filename)
|
||||
, size_(0) // to be filled later
|
||||
#ifdef _WIN32
|
||||
@ -494,7 +494,7 @@ RefcountedMapAllocator::RefcountedMapAllocator(const char *filename, int flags,
|
||||
|
||||
initializeAlloc();
|
||||
}
|
||||
RefcountedMapAllocator::RefcountedMapAllocator(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size)
|
||||
RefcountedMapAllocator::RefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size)
|
||||
: RefcountedMapAllocatorArgCheck(flags)
|
||||
, MapAllocator(WITH_FD, filename, flags, fd, size + map_alloc_alignment) {
|
||||
|
||||
@ -614,7 +614,7 @@ at::DataPtr MapAllocator::makeDataPtr(std::string_view filename, int flags, size
|
||||
return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU};
|
||||
}
|
||||
|
||||
at::DataPtr MapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
|
||||
at::DataPtr MapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
|
||||
auto* context = new MapAllocator(WITH_FD, filename, fd, flags, size);
|
||||
if (actual_size_out) *actual_size_out = context->size();
|
||||
return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU};
|
||||
@ -626,7 +626,7 @@ at::DataPtr RefcountedMapAllocator::makeDataPtr(const char *filename, int flags,
|
||||
return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU};
|
||||
}
|
||||
|
||||
at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
|
||||
at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
|
||||
auto* context = new RefcountedMapAllocator(WITH_FD, filename, fd, flags, size);
|
||||
if (actual_size_out) *actual_size_out = context->size() - map_alloc_alignment;
|
||||
return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU};
|
||||
|
||||
@ -25,7 +25,7 @@ class TORCH_API MapAllocator {
|
||||
public:
|
||||
MapAllocator(std::string_view filename, int flags, size_t size);
|
||||
MapAllocator(
|
||||
WithFd /*unused*/,
|
||||
WithFd,
|
||||
std::string_view filename,
|
||||
int fd,
|
||||
int flags,
|
||||
@ -59,14 +59,14 @@ class TORCH_API MapAllocator {
|
||||
return flags_;
|
||||
}
|
||||
|
||||
static MapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/);
|
||||
static MapAllocator* fromDataPtr(const at::DataPtr&);
|
||||
static at::DataPtr makeDataPtr(
|
||||
std::string_view filename,
|
||||
int flags,
|
||||
size_t size,
|
||||
size_t* actual_size_out);
|
||||
static at::DataPtr makeDataPtr(
|
||||
WithFd /*unused*/,
|
||||
WithFd,
|
||||
const char* filename,
|
||||
int fd,
|
||||
int flags,
|
||||
@ -105,13 +105,13 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
|
||||
public:
|
||||
RefcountedMapAllocator(const char* filename, int flags, size_t size);
|
||||
RefcountedMapAllocator(
|
||||
WithFd /*unused*/,
|
||||
WithFd,
|
||||
const char* filename,
|
||||
int fd,
|
||||
int flags,
|
||||
size_t size);
|
||||
|
||||
static RefcountedMapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/);
|
||||
static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&);
|
||||
RefcountedMapAllocator(const RefcountedMapAllocator&) = delete;
|
||||
RefcountedMapAllocator(RefcountedMapAllocator&&) = delete;
|
||||
RefcountedMapAllocator& operator=(const RefcountedMapAllocator&) = delete;
|
||||
@ -122,7 +122,7 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
|
||||
size_t size,
|
||||
size_t* actual_size_out);
|
||||
static at::DataPtr makeDataPtr(
|
||||
WithFd /*unused*/,
|
||||
WithFd,
|
||||
const char* filename,
|
||||
int fd,
|
||||
int flags,
|
||||
|
||||
@ -273,7 +273,7 @@ c10::SymInt NestedTensorImpl::sym_numel_custom() const {
|
||||
return NestedTensorImpl::numel_custom();
|
||||
}
|
||||
|
||||
c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const {
|
||||
c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat) const {
|
||||
return nested_tensor_impl_is_contiguous(this);
|
||||
}
|
||||
IntArrayRef NestedTensorImpl::sizes_custom() const {
|
||||
|
||||
@ -115,8 +115,7 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
|
||||
// with real implementations
|
||||
int64_t numel_custom() const override;
|
||||
c10::SymInt sym_numel_custom() const override;
|
||||
c10::SymBool sym_is_contiguous_custom(
|
||||
MemoryFormat /*memory_format*/) const override;
|
||||
c10::SymBool sym_is_contiguous_custom(MemoryFormat) const override;
|
||||
int64_t size_custom(int64_t d) const override {
|
||||
return this->size(d);
|
||||
}
|
||||
|
||||
@ -14,7 +14,7 @@ inline int64_t divup(int64_t x, int64_t y) {
|
||||
TORCH_API void init_num_threads();
|
||||
|
||||
// Sets the number of threads to be used in parallel region
|
||||
TORCH_API void set_num_threads(int /*nthreads*/);
|
||||
TORCH_API void set_num_threads(int);
|
||||
|
||||
// Returns the maximum number of threads that may be used in a parallel region
|
||||
TORCH_API int get_num_threads();
|
||||
@ -37,7 +37,7 @@ inline void lazy_init_num_threads() {
|
||||
}
|
||||
}
|
||||
|
||||
TORCH_API void set_thread_num(int /*id*/);
|
||||
TORCH_API void set_thread_num(int);
|
||||
|
||||
class TORCH_API ThreadIdGuard {
|
||||
public:
|
||||
@ -130,7 +130,7 @@ inline scalar_t parallel_reduce(
|
||||
TORCH_API std::string get_parallel_info();
|
||||
|
||||
// Sets number of threads used for inter-op parallelism
|
||||
TORCH_API void set_num_interop_threads(int /*nthreads*/);
|
||||
TORCH_API void set_num_interop_threads(int);
|
||||
|
||||
// Returns the number of threads used for inter-op parallelism
|
||||
TORCH_API size_t get_num_interop_threads();
|
||||
|
||||
@ -252,7 +252,7 @@ void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) {
|
||||
void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) {
|
||||
TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset.");
|
||||
}
|
||||
c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const {
|
||||
c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat) const {
|
||||
TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous");
|
||||
}
|
||||
} // namespace at
|
||||
|
||||
@ -32,10 +32,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
|
||||
|
||||
public:
|
||||
explicit SparseCsrTensorImpl(
|
||||
at::DispatchKeySet /*key_set*/,
|
||||
at::DispatchKeySet,
|
||||
at::Device device,
|
||||
Layout layout,
|
||||
const caffe2::TypeMeta /*data_type*/);
|
||||
const caffe2::TypeMeta);
|
||||
|
||||
void resize_(int64_t nnz, IntArrayRef size);
|
||||
void resize_and_clear_(
|
||||
@ -86,8 +86,7 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
|
||||
protected:
|
||||
IntArrayRef strides_custom() const override;
|
||||
SymIntArrayRef sym_strides_custom() const override;
|
||||
SymBool sym_is_contiguous_custom(
|
||||
MemoryFormat /*memory_format*/) const override;
|
||||
SymBool sym_is_contiguous_custom(MemoryFormat) const override;
|
||||
|
||||
public:
|
||||
void set_size(int64_t dim, int64_t new_size) override;
|
||||
|
||||
@ -46,9 +46,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
|
||||
|
||||
public:
|
||||
// Public for now...
|
||||
explicit SparseTensorImpl(
|
||||
at::DispatchKeySet /*key_set*/,
|
||||
const caffe2::TypeMeta /*data_type*/);
|
||||
explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta);
|
||||
|
||||
void release_resources() override;
|
||||
|
||||
@ -231,14 +229,14 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
|
||||
}
|
||||
|
||||
void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef<int64_t> size) {
|
||||
_resize_(sparse_dim, dense_dim, size);
|
||||
return _resize_(sparse_dim, dense_dim, size);
|
||||
}
|
||||
|
||||
void resize_(
|
||||
int64_t sparse_dim,
|
||||
int64_t dense_dim,
|
||||
ArrayRef<c10::SymInt> size) {
|
||||
_resize_(sparse_dim, dense_dim, size);
|
||||
return _resize_(sparse_dim, dense_dim, size);
|
||||
}
|
||||
|
||||
// NOTE: this function will resize the sparse tensor and also set `indices`
|
||||
@ -386,8 +384,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
|
||||
|
||||
private:
|
||||
explicit SparseTensorImpl(
|
||||
at::DispatchKeySet /*key_set*/,
|
||||
const caffe2::TypeMeta /*data_type*/,
|
||||
at::DispatchKeySet,
|
||||
const caffe2::TypeMeta,
|
||||
at::Tensor indices,
|
||||
at::Tensor values);
|
||||
|
||||
|
||||
@ -59,7 +59,7 @@ static inline void set_item(const Tensor& self, ArrayRef<TensorIndex> indices, c
|
||||
}
|
||||
}
|
||||
|
||||
set_item(self, indices, value);
|
||||
return set_item(self, indices, value);
|
||||
}
|
||||
|
||||
} // namespace indexing
|
||||
|
||||
@@ -112,10 +112,10 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice);
// `torch.tensor([1, 2])`) | `torch::tensor({1, 2})`
struct TORCH_API TensorIndex final {
  // Case 1: `at::indexing::None`
  TensorIndex(std::nullopt_t /*unused*/) : type_(TensorIndexType::None) {}
  TensorIndex(std::nullopt_t) : type_(TensorIndexType::None) {}

  // Case 2: "..." / `at::indexing::Ellipsis`
  TensorIndex(at::indexing::EllipsisIndexType /*unused*/)
  TensorIndex(at::indexing::EllipsisIndexType)
      : type_(TensorIndexType::Ellipsis) {}
  TensorIndex(const char* str) : TensorIndex(at::indexing::Ellipsis) {
    TORCH_CHECK_VALUE(
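As context for the constructor changes above, a small illustrative sketch (not from the diff) showing how these `TensorIndex` overloads are normally reached through the C++ indexing API:

```cpp
#include <ATen/ATen.h>
#include <ATen/TensorIndexing.h>

int main() {
  using namespace at::indexing;
  at::Tensor t = at::arange(12).reshape({3, 4});
  // None inserts a new axis, Ellipsis expands to the remaining dims, and
  // Slice(0, 2) takes the first two columns; each literal below is converted
  // to a TensorIndex via the constructors shown in the hunk above.
  at::Tensor v = t.index({None, Ellipsis, Slice(0, 2)});
  // v has shape [1, 3, 2]
  return 0;
}
```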
@@ -765,8 +765,7 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) {
  if (numel == 0) {
    return;
  } else if (numel < grain_size || at::get_num_threads() == 1) {
    serial_for_each(loop, {0, numel});
    return;
    return serial_for_each(loop, {0, numel});
  } else {
    at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
      serial_for_each(loop, {begin, end});
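The branch above follows a common ATen dispatch pattern: small or single-threaded workloads run serially, everything else is chunked through `at::parallel_for`. A standalone sketch of that pattern, for illustration only:

```cpp
#include <ATen/Parallel.h>
#include <cstdint>
#include <functional>

// Run `body` over [0, numel): serially when the workload is small or only one
// thread is available, otherwise in grain_size chunks via at::parallel_for.
void run_chunked(int64_t numel, int64_t grain_size,
                 const std::function<void(int64_t, int64_t)>& body) {
  if (numel == 0) {
    return;
  }
  if (numel < grain_size || at::get_num_threads() == 1) {
    body(0, numel);
  } else {
    at::parallel_for(0, numel, grain_size,
                     [&](int64_t begin, int64_t end) { body(begin, end); });
  }
}
```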
@ -250,7 +250,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
|
||||
using PtrVector = SmallVector<char*, 4>;
|
||||
using StrideVector = SmallVector<int64_t, 6>;
|
||||
|
||||
void build(TensorIteratorConfig& /*config*/);
|
||||
void build(TensorIteratorConfig&);
|
||||
|
||||
// The inner-loop function operates on the fastest moving dimension. It
|
||||
// implements element-wise operations in terms of 1-d strided tensors.
|
||||
@ -618,20 +618,20 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
|
||||
#undef TORCH_DISALLOW_TEMPORARIES
|
||||
protected:
|
||||
// Mutable reference as it moves tensors out of TensorIteratorConfig
|
||||
void populate_operands(TensorIteratorConfig& /*config*/);
|
||||
void populate_operands(TensorIteratorConfig&);
|
||||
void mark_outputs();
|
||||
void mark_resize_outputs(const TensorIteratorConfig& /*config*/);
|
||||
void compute_mem_overlaps(const TensorIteratorConfig& /*config*/);
|
||||
void compute_shape(const TensorIteratorConfig& /*config*/);
|
||||
void compute_strides(const TensorIteratorConfig& /*config*/);
|
||||
void mark_resize_outputs(const TensorIteratorConfig&);
|
||||
void compute_mem_overlaps(const TensorIteratorConfig&);
|
||||
void compute_shape(const TensorIteratorConfig&);
|
||||
void compute_strides(const TensorIteratorConfig&);
|
||||
void reorder_dimensions();
|
||||
void permute_dimensions(IntArrayRef perm);
|
||||
void compute_types(const TensorIteratorConfig& /*config*/);
|
||||
void compute_types(const TensorIteratorConfig&);
|
||||
ScalarType compute_common_dtype();
|
||||
void allocate_or_resize_outputs();
|
||||
bool fast_set_up(const TensorIteratorConfig& /*config*/);
|
||||
FastSetupType compute_fast_setup_type(const TensorIteratorConfig& /*config*/);
|
||||
void compute_names(const TensorIteratorConfig& /*config*/);
|
||||
bool fast_set_up(const TensorIteratorConfig&);
|
||||
FastSetupType compute_fast_setup_type(const TensorIteratorConfig&);
|
||||
void compute_names(const TensorIteratorConfig&);
|
||||
void propagate_names_to_outputs();
|
||||
void coalesce_dimensions();
|
||||
|
||||
|
||||
@ -20,7 +20,7 @@
|
||||
|
||||
namespace at {
|
||||
|
||||
TORCH_API int _crash_if_asan(int /*arg*/);
|
||||
TORCH_API int _crash_if_asan(int);
|
||||
|
||||
// Converts a TensorList (i.e. ArrayRef<Tensor> to vector of TensorImpl*)
|
||||
// NB: This is ONLY used by legacy TH bindings, and ONLY used by cat.
|
||||
|
||||
@ -103,7 +103,9 @@ std::string get_cpu_capability() {
|
||||
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
|
||||
case native::CPUCapability::ZVECTOR:
|
||||
return "Z VECTOR";
|
||||
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
|
||||
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
|
||||
case native::CPUCapability::SVE128:
|
||||
return "SVE128";
|
||||
case native::CPUCapability::SVE256:
|
||||
return "SVE256";
|
||||
#else
|
||||
|
||||
@@ -148,7 +148,7 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_
Banned functions
*******************************/

static Tensor binary_cross_entropy_banned(const Tensor & /*unused*/, const Tensor & /*unused*/, const std::optional<Tensor>& /*unused*/, int64_t /*unused*/) {
static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional<Tensor>&, int64_t) {
  TORCH_CHECK(false, "torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n"
              "Many models use a sigmoid layer right before the binary cross entropy layer.\n"
              "In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n"
@ -49,7 +49,7 @@ static void check_unique_names(DimnameList names) {
|
||||
}
|
||||
|
||||
void check_names_valid_for(const TensorBase& tensor, DimnameList names) {
|
||||
impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
|
||||
return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
|
||||
}
|
||||
|
||||
void check_names_valid_for(size_t tensor_dim, DimnameList names) {
|
||||
|
||||
@ -27,11 +27,11 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
|
||||
HasNonWildcard
|
||||
};
|
||||
|
||||
explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, DimnameList names)
|
||||
explicit NamedTensorMeta(HAS_NON_WILDCARD, DimnameList names)
|
||||
: names_(names.vec()) {
|
||||
check_invariants();
|
||||
}
|
||||
explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, std::vector<Dimname>&& names)
|
||||
explicit NamedTensorMeta(HAS_NON_WILDCARD, std::vector<Dimname>&& names)
|
||||
: names_(std::move(names)) {
|
||||
check_invariants();
|
||||
}
|
||||
@ -52,13 +52,13 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
|
||||
std::any_of(names_.begin(), names_.end(), [](const Dimname& n) { return !n.isWildcard(); }));
|
||||
}
|
||||
|
||||
void set_names(HAS_NON_WILDCARD /*unused*/, DimnameList new_names) {
|
||||
void set_names(HAS_NON_WILDCARD, DimnameList new_names) {
|
||||
TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
|
||||
std::copy(new_names.begin(), new_names.end(), names_.begin());
|
||||
check_invariants();
|
||||
}
|
||||
|
||||
void set_names(HAS_NON_WILDCARD /*unused*/, std::vector<Dimname>&& new_names) {
|
||||
void set_names(HAS_NON_WILDCARD, std::vector<Dimname>&& new_names) {
|
||||
TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
|
||||
names_ = std::move(new_names);
|
||||
check_invariants();
|
||||
|
||||
@ -13,7 +13,7 @@ class TORCH_API PythonOpRegistrationTrampoline final {
|
||||
public:
|
||||
// Returns true if you successfully registered yourself (that means
|
||||
// you are in the hot seat for doing the operator registrations!)
|
||||
static bool registerInterpreter(c10::impl::PyInterpreter* /*interp*/);
|
||||
static bool registerInterpreter(c10::impl::PyInterpreter*);
|
||||
|
||||
// Returns nullptr if no interpreter has been registered yet.
|
||||
static c10::impl::PyInterpreter* getInterpreter();
|
||||
|
||||
@ -138,7 +138,7 @@ void Tensor::_backward(TensorList inputs,
|
||||
const std::optional<Tensor>& gradient,
|
||||
std::optional<bool> keep_graph,
|
||||
bool create_graph) const {
|
||||
impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
|
||||
return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
|
||||
}
|
||||
|
||||
const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const {
|
||||
@ -173,12 +173,4 @@ unsigned TensorBase::_register_hook(std::function<TensorBase(const TensorBase&)>
|
||||
return impl::GetVariableHooks()->_register_hook(*this, std::move(hook));
|
||||
}
|
||||
|
||||
std::optional<ScalarType> TensorBase::grad_dtype() const {
|
||||
return impl::GetVariableHooks()->grad_dtype(*this);
|
||||
}
|
||||
|
||||
void TensorBase::set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const {
|
||||
return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype);
|
||||
}
|
||||
|
||||
} // namespace at
|
||||
|
||||
@ -100,7 +100,7 @@ class TORCH_API TensorBase {
|
||||
// Create a Tensor with a +0 reference count. Special care must be
|
||||
// taken to avoid decrementing this reference count at destruction
|
||||
// time. Intended to support MaybeOwnedTraits<Tensor>.
|
||||
explicit TensorBase(unsafe_borrow_t /*unused*/, const TensorBase& rhs)
|
||||
explicit TensorBase(unsafe_borrow_t, const TensorBase& rhs)
|
||||
: impl_(c10::intrusive_ptr<at::TensorImpl, UndefinedTensorImpl>(rhs.impl_.get(), c10::raw::DontIncreaseRefcount{})) {}
|
||||
friend MaybeOwnedTraits<TensorBase>;
|
||||
|
||||
@ -930,10 +930,6 @@ public:
|
||||
|
||||
const TensorBase& requires_grad_(bool _requires_grad=true) const;
|
||||
|
||||
std::optional<ScalarType> grad_dtype() const;
|
||||
|
||||
void set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const;
|
||||
|
||||
// View Variables
|
||||
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -954,7 +950,7 @@ protected:
|
||||
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
|
||||
|
||||
private:
|
||||
TensorBase __dispatch_contiguous(c10::MemoryFormat /*memory_format*/) const;
|
||||
TensorBase __dispatch_contiguous(c10::MemoryFormat) const;
|
||||
};
|
||||
|
||||
inline DeviceIndex get_device(const TensorBase& self) {
|
||||
|
||||
@ -68,8 +68,6 @@ struct TORCH_API VariableHooksInterface {
|
||||
const c10::OperatorHandle& op,
|
||||
c10::DispatchKeySet dispatch_keys,
|
||||
torch::jit::Stack* stack) const = 0;
|
||||
virtual std::optional<c10::ScalarType> grad_dtype(const TensorBase&) const = 0;
|
||||
virtual void set_grad_dtype(const TensorBase&, const std::optional<c10::ScalarType>&) const = 0;
|
||||
};
|
||||
|
||||
TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);
|
||||
|
||||
@ -18,10 +18,10 @@ class KernelFunction;
|
||||
// implementation notes; notably, this does NOT actually go through the
|
||||
// boxing/unboxing codepath.
|
||||
TORCH_API void fallthrough_kernel(
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& /*unused*/,
|
||||
DispatchKeySet /*unused*/,
|
||||
Stack* /*unused*/);
|
||||
OperatorKernel*,
|
||||
const OperatorHandle&,
|
||||
DispatchKeySet,
|
||||
Stack*);
|
||||
|
||||
// Note [Ambiguity in AutogradOther kernel]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -62,10 +62,10 @@ TORCH_API void fallthrough_kernel(
|
||||
// than arbitrarily pick one or the other, we just register a kernel that raises
|
||||
// an error and let the user decide how to proceed.
|
||||
TORCH_API void ambiguous_autogradother_kernel(
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& /*op*/,
|
||||
DispatchKeySet /*unused*/,
|
||||
Stack* /*unused*/);
|
||||
OperatorKernel*,
|
||||
const OperatorHandle&,
|
||||
DispatchKeySet,
|
||||
Stack*);
|
||||
|
||||
// Note [named_not_supported_kernel]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -75,10 +75,10 @@ TORCH_API void ambiguous_autogradother_kernel(
|
||||
// give a good error message in cases when boxing is not supported). When
|
||||
// boxing is universally supported this can be removed.
|
||||
[[noreturn]] TORCH_API void named_not_supported_kernel(
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& /*op*/,
|
||||
DispatchKeySet /*unused*/,
|
||||
Stack* /*unused*/);
|
||||
OperatorKernel*,
|
||||
const OperatorHandle&,
|
||||
DispatchKeySet,
|
||||
Stack*);
|
||||
|
||||
/**
|
||||
* BoxedKernel is similar to a std::function storing a boxed kernel.
|
||||
@ -185,16 +185,16 @@ class TORCH_API BoxedKernel final {
|
||||
|
||||
template <BoxedKernelFunction* func>
|
||||
static void make_boxed_function(
|
||||
OperatorKernel* /*unused*/,
|
||||
OperatorKernel*,
|
||||
const OperatorHandle& opHandle,
|
||||
DispatchKeySet /*unused*/,
|
||||
DispatchKeySet,
|
||||
Stack* stack);
|
||||
|
||||
template <BoxedKernelFunction_withDispatchKeys* func>
|
||||
static void make_boxed_function(
|
||||
OperatorKernel* /*unused*/,
|
||||
OperatorKernel*,
|
||||
const OperatorHandle& opHandle,
|
||||
DispatchKeySet /*ks*/,
|
||||
DispatchKeySet,
|
||||
Stack* stack);
|
||||
|
||||
explicit BoxedKernel(
|
||||
|
||||
@ -11,9 +11,9 @@ inline BoxedKernel::BoxedKernel(
|
||||
|
||||
template <BoxedKernel::BoxedKernelFunction* func>
|
||||
inline void BoxedKernel::make_boxed_function(
|
||||
OperatorKernel* /*unused*/,
|
||||
OperatorKernel*,
|
||||
const OperatorHandle& opHandle,
|
||||
DispatchKeySet /*unused*/,
|
||||
DispatchKeySet,
|
||||
Stack* stack) {
|
||||
// Note that we're dropping the DispatchKeySet argument.
|
||||
// See Note [Plumbing Keys Through The Dispatcher 2] for details.
|
||||
@ -22,7 +22,7 @@ inline void BoxedKernel::make_boxed_function(
|
||||
|
||||
template <BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
|
||||
inline void BoxedKernel::make_boxed_function(
|
||||
OperatorKernel* /*unused*/,
|
||||
OperatorKernel*,
|
||||
const OperatorHandle& opHandle,
|
||||
DispatchKeySet ks,
|
||||
Stack* stack) {
|
||||
|
||||
@ -10,7 +10,7 @@ namespace c10 {
// be handled specially. Its semantics is that it redispatches to the
// *next* dispatch key that would have been processed, skipping the current
// one.
void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unused*/, DispatchKeySet /*unused*/, Stack* /*unused*/) {
void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*) {
TORCH_INTERNAL_ASSERT(0,
"fallthrough_kernel was executed but it should have been short-circuited by the dispatcher. "
"This could occur if you registered a fallthrough kernel as a override for a specific operator "
@ -19,7 +19,7 @@ void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unus
"let us know in the bug tracker.");
}

void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) {
void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) {
TORCH_INTERNAL_ASSERT(0,
op.operator_name(), " has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. "
"This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering "
@ -32,7 +32,7 @@ void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHa
"\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n");
}

void named_not_supported_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) {
void named_not_supported_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) {
// DO NOT LOOK AT STACK, YOU HAVE SHORT CIRCUITED BOXING
// See Note [named_not_supported_kernel]
TORCH_CHECK(0,

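For reference, fallthrough_kernel backs fallthrough registrations such as the hedged sketch below; the dispatcher short-circuits it and redispatches to the next key rather than calling it, which is why actually executing it trips the internal assert. The dispatch key chosen here is only an example, not something this diff touches.

#include <torch/library.h>

TORCH_LIBRARY_IMPL(aten, AutogradPrivateUse1, m) {
  // Skip this dispatch key entirely and fall through to the next one.
  m.fallback(torch::CppFunction::makeFallthrough());
}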
@ -229,7 +229,7 @@ class TORCH_API KernelFunction final {
|
||||
* &unboxed_func>();
|
||||
*/
|
||||
template <class FuncPtr, bool AllowLegacyTypes = false>
|
||||
static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/);
|
||||
static KernelFunction makeFromUnboxedFunction(FuncPtr);
|
||||
|
||||
/**
|
||||
* Create a KernelFunction from an unboxed function.
|
||||
@ -271,7 +271,7 @@ class TORCH_API KernelFunction final {
|
||||
|
||||
std::string dumpState() const;
|
||||
// For testing internal invariants only
|
||||
bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const;
|
||||
bool _equalsBoxedAndUnboxed(const KernelFunction&) const;
|
||||
|
||||
// Register a token to be invalidated when this KernelFunction is destroyed
|
||||
void registerToken(std::weak_ptr<KernelToken> token) const;
|
||||
|
||||
@ -131,7 +131,7 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(
|
||||
new (dest++) IValue(options.pinned_memory());
|
||||
}
|
||||
|
||||
inline void boxArgsToStack(IValue*& /*unused*/) {}
|
||||
inline void boxArgsToStack(IValue*&) {}
|
||||
|
||||
template <typename T, typename... Args>
|
||||
C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack(
|
||||
@ -185,7 +185,7 @@ struct PopResult<std::tuple<Types...>> final {
|
||||
template <size_t... indices>
|
||||
static Result pop_to_tuple_impl(
|
||||
Stack& stack,
|
||||
std::index_sequence<indices...> /*unused*/) {
|
||||
std::index_sequence<indices...>) {
|
||||
return std::make_tuple((std::move(stack[indices]).template to<Types>())...);
|
||||
}
|
||||
};
|
||||
|
||||
@ -561,7 +561,7 @@ struct wrap_kernel_functor_unboxed_<
|
||||
// doesn't use &&
|
||||
static ReturnType call(
|
||||
OperatorKernel* functor,
|
||||
DispatchKeySet /*unused*/,
|
||||
DispatchKeySet,
|
||||
ParameterTypes... args) {
|
||||
KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
|
||||
// Note [Plumbing Keys Through The Dispatcher 2]
|
||||
@ -629,8 +629,8 @@ call_functor_with_args_from_stack_(
|
||||
OperatorKernel* functor,
|
||||
DispatchKeySet dispatchKeySet,
|
||||
Stack* stack,
|
||||
std::index_sequence<ivalue_arg_indices...> /*unused*/,
|
||||
guts::typelist::typelist<ArgTypes...>* /*unused*/) {
|
||||
std::index_sequence<ivalue_arg_indices...>,
|
||||
guts::typelist::typelist<ArgTypes...>*) {
|
||||
(void)(stack); // when sizeof...(ivalue_arg_indices) == 0, this argument would
|
||||
// be unused and we have to silence the compiler warning.
|
||||
|
||||
@ -708,7 +708,7 @@ struct push_outputs<std::tuple<OutputTypes...>, AllowDeprecatedTypes> final {
|
||||
static void call_(
|
||||
std::tuple<OutputTypes...>&& output,
|
||||
Stack* stack,
|
||||
std::index_sequence<indices...> /*unused*/) {
|
||||
std::index_sequence<indices...>) {
|
||||
torch::jit::push(
|
||||
*stack,
|
||||
return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::call(
|
||||
@ -718,7 +718,7 @@ struct push_outputs<std::tuple<OutputTypes...>, AllowDeprecatedTypes> final {
|
||||
static void copy_(
|
||||
const std::tuple<OutputTypes...>& output,
|
||||
Stack* stack,
|
||||
std::index_sequence<indices...> /*unused*/) {
|
||||
std::index_sequence<indices...>) {
|
||||
torch::jit::push(
|
||||
*stack,
|
||||
return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::copy(
|
||||
@ -741,7 +741,7 @@ struct make_boxed_from_unboxed_functor final {
|
||||
|
||||
static void call(
|
||||
OperatorKernel* functor,
|
||||
const OperatorHandle& /*unused*/,
|
||||
const OperatorHandle&,
|
||||
DispatchKeySet dispatchKeySet,
|
||||
Stack* stack) {
|
||||
using ReturnType =
|
||||
|
||||
@ -63,13 +63,13 @@ struct BuiltinOpFunction : public Function {
|
||||
|
||||
bool call(
|
||||
Stack& stack,
|
||||
std::optional<size_t> /*unused*/,
|
||||
c10::function_ref<void(const Code&)> /*unused*/) override {
|
||||
std::optional<size_t>,
|
||||
c10::function_ref<void(const Code&)>) override {
|
||||
run(stack);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)> /*unused*/)
|
||||
bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)>)
|
||||
override {
|
||||
run(stack);
|
||||
return false;
|
||||
|
||||
@ -80,8 +80,7 @@ struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
|
||||
ts = ts | x.key_set();
|
||||
}
|
||||
}
|
||||
[[noreturn]] void operator()(
|
||||
at::ArrayRef<std::optional<at::Tensor>> /*unused*/) {
|
||||
[[noreturn]] void operator()(at::ArrayRef<std::optional<at::Tensor>>) {
|
||||
// Just checking that the handling of Tensor?[] didn't change.
|
||||
TORCH_INTERNAL_ASSERT(false);
|
||||
}
|
||||
@ -96,7 +95,7 @@ struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void operator()(const T& /*unused*/) {
|
||||
void operator()(const T&) {
|
||||
// do nothing
|
||||
}
|
||||
};
|
||||
|
||||
@ -496,7 +496,7 @@ class TORCH_API OperatorHandle {
|
||||
}
|
||||
|
||||
void checkInvariants() const {
|
||||
operatorDef_->op.checkInvariants();
|
||||
return operatorDef_->op.checkInvariants();
|
||||
}
|
||||
|
||||
c10::ArrayRef<at::Tag> getTags() const {
|
||||
@ -633,7 +633,7 @@ class TypedOperatorHandle<Return(Args...)> final : public OperatorHandle {
|
||||
|
||||
namespace detail {
|
||||
template <class... Args>
|
||||
inline void unused_arg_(const Args&... /*unused*/) {}
|
||||
inline void unused_arg_(const Args&...) {}
|
||||
|
||||
// CaptureKernelCall is intended to capture return values from Dispatcher
|
||||
// unboxed kernel calls. A record function may request to get outputs from the
|
||||
@ -932,7 +932,7 @@ inline void Dispatcher::redispatchBoxed(
|
||||
}
|
||||
#endif
|
||||
const auto& kernel = entry.lookup(dispatchKeySet);
|
||||
kernel.callBoxed(op, dispatchKeySet, stack);
|
||||
return kernel.callBoxed(op, dispatchKeySet, stack);
|
||||
}
|
||||
|
||||
} // namespace c10
|
||||
|
||||
@ -105,7 +105,7 @@ class TORCH_API OperatorEntry final {
|
||||
// versa that is an error. (Refcounting for the registrations is
|
||||
// handled in the OperatorHandle in Dispatcher)
|
||||
void registerSchema(
|
||||
FunctionSchema&& /*schema*/,
|
||||
FunctionSchema&&,
|
||||
std::string&& debug,
|
||||
std::vector<at::Tag> tags = {});
|
||||
void deregisterSchema();
|
||||
|
||||
@ -177,7 +177,7 @@ bool DynamicType::equals(const Type& rhs) const {
|
||||
return equals(*create(rhs));
|
||||
}
|
||||
|
||||
bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream* /*why_not*/) const {
|
||||
bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream*) const {
|
||||
auto other = create(rhs);
|
||||
if (tag_ == other->tag_) {
|
||||
if (equals(*other)) {
|
||||
@ -371,7 +371,7 @@ DynamicTypePtr ivalue::TupleTypeFactory<c10::DynamicType>::create(
|
||||
}
|
||||
|
||||
DynamicTypePtr ivalue::TupleTypeFactory<c10::DynamicType>::fallback(
|
||||
const Type& /*unused*/) {
|
||||
const Type&) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@ -138,8 +138,8 @@ class DynamicType : public SharedType {
|
||||
|
||||
struct Arguments {
|
||||
Arguments() = default;
|
||||
Arguments(c10::ArrayRef<TypePtr> /*args*/);
|
||||
Arguments(const std::vector<std::string_view>& /*names*/, c10::ArrayRef<TypePtr> /*args*/);
|
||||
Arguments(c10::ArrayRef<TypePtr>);
|
||||
Arguments(const std::vector<std::string_view>&, c10::ArrayRef<TypePtr>);
|
||||
std::vector<LabeledDynamicType> elems;
|
||||
};
|
||||
|
||||
@ -156,15 +156,15 @@ class DynamicType : public SharedType {
|
||||
static const TypeKind Kind = TypeKind::DynamicType;
|
||||
static TORCH_API DynamicTypePtr create(Type& ty);
|
||||
|
||||
explicit DynamicType(Tag /*tag*/, Arguments /*arguments*/);
|
||||
explicit DynamicType(Tag /*tag*/, std::string_view /*name*/, Arguments /*arguments*/);
|
||||
explicit DynamicType(Tag, Arguments);
|
||||
explicit DynamicType(Tag, std::string_view, Arguments);
|
||||
|
||||
DynamicType(DynamicType&& other) = delete;
|
||||
DynamicType(const DynamicType&) = delete;
|
||||
DynamicType& operator=(const DynamicType&) = delete;
|
||||
DynamicType& operator=(DynamicType&&) = delete;
|
||||
|
||||
TypePtr containedType(size_t /*i*/) const override;
|
||||
TypePtr containedType(size_t) const override;
|
||||
size_t containedTypeSize() const override;
|
||||
Tag tag() const {
|
||||
return tag_;
|
||||
|
||||
@ -96,15 +96,15 @@ struct TORCH_API Function {
|
||||
// Overload for server interpreter, a bailout size is needed for graph
|
||||
// executor.
|
||||
virtual bool call(
|
||||
Stack& /*unused*/,
|
||||
std::optional<size_t> /*unused*/,
|
||||
c10::function_ref<void(const Code&)> /*unused*/) {
|
||||
Stack&,
|
||||
std::optional<size_t>,
|
||||
c10::function_ref<void(const Code&)>) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Overload for mobile interpreter.
|
||||
virtual bool call(Stack& /*unused*/, c10::function_ref<void(const mobile::Code&)> /*unused*/) {
|
||||
virtual bool call(Stack&, c10::function_ref<void(const mobile::Code&)>) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -847,7 +847,7 @@ struct TORCH_API IValue final {
|
||||
IValue(std::optional<T> v);
|
||||
template <class T, enable_if_list_is_ivalue_constructible<T> = nullptr>
|
||||
IValue(c10::OptionalArrayRef<T> v);
|
||||
IValue(std::nullopt_t /*unused*/);
|
||||
IValue(std::nullopt_t);
|
||||
|
||||
// ClassType
|
||||
IValue(c10::intrusive_ptr<ivalue::Object> v);
|
||||
|
||||
@ -660,7 +660,7 @@ struct TORCH_API TupleTypeFactory<TupleType> {
|
||||
template <>
|
||||
struct TORCH_API TupleTypeFactory<c10::DynamicType> {
|
||||
static DynamicTypePtr create(const std::vector<TypePtr>& elemTypes);
|
||||
static DynamicTypePtr fallback(const Type& /*unused*/);
|
||||
static DynamicTypePtr fallback(const Type&);
|
||||
};
|
||||
|
||||
struct TORCH_API Tuple : c10::intrusive_ptr_target {
|
||||
@ -1682,7 +1682,7 @@ struct ivalue::EnumHolder : c10::intrusive_ptr_target {
namespace detail {

struct _guarded_unsigned_long_unique_dummy final {
_guarded_unsigned_long_unique_dummy(int64_t /*unused*/){}
_guarded_unsigned_long_unique_dummy(int64_t){}
};
using _guarded_unsigned_long = std::conditional_t<
std::is_same_v<unsigned long, uint32_t> ||
@ -1776,7 +1776,7 @@ template <class Elem>
// native_functions.yaml still return std::vector.
// C10_DEPRECATED_MESSAGE("IValues based on std::vector<T> are potentially slow
// and deprecated. Please use torch::List<T> instead.")
std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>> /*unused*/) {
std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>>) {
// We need to do a deep copy of the vector because there might be other
// references to this same IValue that also use the list. We can't just
// move the elements out.
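A hedged usage sketch of the deep-copy behaviour described in the comment above; the example() wrapper is illustrative. Converting an IValue-held list to std::vector copies the elements, so mutating the vector leaves the original list untouched.

#include <ATen/core/List.h>
#include <ATen/core/ivalue.h>
#include <vector>

void example() {
  c10::IValue iv(c10::List<int64_t>({1, 2, 3}));
  std::vector<int64_t> v = iv.to<std::vector<int64_t>>();  // deep copy via generic_to
  v[0] = 42;  // the list stored inside `iv` still holds {1, 2, 3}
}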
@ -1826,18 +1826,18 @@ c10::intrusive_ptr<T> IValue::toCustomClass() const& {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T generic_to(IValue ivalue, _fake_type<T> /*unused*/) {
|
||||
T generic_to(IValue ivalue, _fake_type<T>) {
|
||||
using ElemType = typename std::remove_pointer<T>::type::element_type;
|
||||
return std::move(ivalue).template toCustomClass<ElemType>();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
tagged_capsule<T> generic_to(IValue ivalue, _fake_type<tagged_capsule<T>> /*unused*/) {
|
||||
tagged_capsule<T> generic_to(IValue ivalue, _fake_type<tagged_capsule<T>>) {
|
||||
return tagged_capsule<T>{std::move(ivalue)};
|
||||
}
|
||||
|
||||
template <typename Elem>
|
||||
c10::List<Elem> generic_to(IValue ivalue, _fake_type<c10::List<Elem>> /*unused*/) {
|
||||
c10::List<Elem> generic_to(IValue ivalue, _fake_type<c10::List<Elem>>) {
|
||||
return impl::toTypedList<Elem>(std::move(ivalue).toList());
|
||||
}
|
||||
|
||||
@ -1867,7 +1867,7 @@ std::vector<T> createVectorFromList(const c10::List<T>& impl) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
OptionalArray<T> generic_to(IValue ivalue, _fake_type<OptionalArray<T>> /*unused*/) {
|
||||
OptionalArray<T> generic_to(IValue ivalue, _fake_type<OptionalArray<T>>) {
|
||||
if (ivalue.isNone()) {
|
||||
return {};
|
||||
}
|
||||
@ -1880,8 +1880,8 @@ namespace detail {
|
||||
template <typename Elem, size_t... I>
|
||||
std::array<Elem, sizeof...(I)> generic_to_array(
|
||||
IValue ivalue,
|
||||
_fake_type<std::array<Elem, sizeof...(I)>> /*unused*/,
|
||||
std::index_sequence<I...> /*unused*/) {
|
||||
_fake_type<std::array<Elem, sizeof...(I)>>,
|
||||
std::index_sequence<I...>) {
|
||||
// We need to do a deep copy of the array because there might be other
|
||||
// references to this same IValue that also use the list. We can't just
|
||||
// move the elements out.
|
||||
@ -1906,7 +1906,7 @@ std::array<Elem, N> generic_to(
|
||||
template <typename Key, typename Value>
|
||||
c10::Dict<Key, Value> generic_to(
|
||||
IValue ivalue,
|
||||
_fake_type<c10::Dict<Key, Value>> /*unused*/) {
|
||||
_fake_type<c10::Dict<Key, Value>>) {
|
||||
return impl::toTypedDict<Key, Value>(std::move(ivalue).toGenericDict());
|
||||
}
|
||||
|
||||
@ -1915,7 +1915,7 @@ C10_DEPRECATED_MESSAGE(
|
||||
"IValues based on std::unordered_map are slow and deprecated. Please use c10::Dict<K, V> instead.")
|
||||
std::unordered_map<K, V> generic_to(
|
||||
IValue ivalue,
|
||||
_fake_type<std::unordered_map<K, V>> /*unused*/) {
|
||||
_fake_type<std::unordered_map<K, V>>) {
|
||||
std::unordered_map<K, V> specialized_dict;
|
||||
|
||||
for (const auto& item : std::move(ivalue).toGenericDict()) {
|
||||
@ -1926,7 +1926,7 @@ std::unordered_map<K, V> generic_to(
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::optional<T> generic_to(IValue ivalue, _fake_type<std::optional<T>> /*unused*/) {
|
||||
std::optional<T> generic_to(IValue ivalue, _fake_type<std::optional<T>>) {
|
||||
if (ivalue.isNone()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
@ -1937,7 +1937,7 @@ namespace detail {
|
||||
template <typename Tuple, std::size_t... INDEX>
|
||||
Tuple generic_to_tuple_impl(
|
||||
const ivalue::TupleElements& t,
|
||||
std::index_sequence<INDEX...> /*unused*/) {
|
||||
std::index_sequence<INDEX...>) {
|
||||
return std::make_tuple(
|
||||
t[INDEX].to<typename std::tuple_element<INDEX, Tuple>::type>()...);
|
||||
}
|
||||
@ -1951,7 +1951,7 @@ template <
|
||||
std::is_lvalue_reference<Args>...,
|
||||
std::negation<std::is_constructible<IValue, Args>>...>,
|
||||
std::nullptr_t> = nullptr>
|
||||
std::tuple<Args...> generic_to(const IValue& ivalue, _fake_type<std::tuple<Args...>> /*unused*/) {
|
||||
std::tuple<Args...> generic_to(const IValue& ivalue, _fake_type<std::tuple<Args...>>) {
|
||||
const auto& vals = ivalue.toTupleRef().elements();
|
||||
TORCH_CHECK(vals.size() == sizeof...(Args));
|
||||
return detail::generic_to_tuple_impl<std::tuple<Args...>>(vals, Indices{});
|
||||
@ -2311,7 +2311,7 @@ inline IValue::IValue(std::optional<T> v) : IValue() {
|
||||
}
|
||||
}
|
||||
|
||||
inline IValue::IValue(std::nullopt_t /*unused*/) : IValue() {}
|
||||
inline IValue::IValue(std::nullopt_t) : IValue() {}
|
||||
|
||||
inline IValue::IValue(c10::intrusive_ptr<ivalue::Object> v)
|
||||
: tag(Tag::Object) {
|
||||
@ -2482,15 +2482,15 @@ namespace ivalue {
namespace detail {

template <typename T>
IValue from_(T&& x, std::true_type /*unused*/) {
IValue from_(T&& x, std::true_type) {
return IValue(std::forward<T>(x));
}
template <typename T>
IValue from_(c10::intrusive_ptr<T> x, std::false_type /*unused*/) {
IValue from_(c10::intrusive_ptr<T> x, std::false_type) {
return IValue(std::move(x));
}
template <typename T>
IValue from_(T&& /*x*/, std::false_type /*unused*/) {
IValue from_(T&& /*x*/, std::false_type) {
static_assert(
guts::false_t<T>::value,
"You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)");
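The from_ overloads above use classic tag dispatch: a compile-time trait is converted to std::true_type or std::false_type so overload resolution picks the right implementation. A minimal standalone sketch of the same pattern, with illustrative names:

#include <string>
#include <type_traits>

template <typename T>
std::string describe_(const T&, std::true_type) { return "arithmetic"; }

template <typename T>
std::string describe_(const T&, std::false_type) { return "not arithmetic"; }

template <typename T>
std::string describe(const T& x) {
  // std::is_arithmetic<T>::type is std::true_type or std::false_type.
  return describe_(x, typename std::is_arithmetic<T>::type{});
}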
@ -2546,19 +2546,19 @@ struct MaybeOwnedTraits<IValue> {
|
||||
return &borrow;
|
||||
}
|
||||
|
||||
static bool debugBorrowIsValid(const borrow_type& /*unused*/) {
|
||||
static bool debugBorrowIsValid(const borrow_type&) {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct IValue::TagType<c10::Type> {
|
||||
static TORCH_API c10::TypePtr get(const IValue& /*v*/);
|
||||
static TORCH_API c10::TypePtr get(const IValue&);
|
||||
};
|
||||
|
||||
template <>
|
||||
struct IValue::TagType<c10::DynamicType> {
|
||||
static TORCH_API c10::TypePtr get(const IValue& /*v*/);
|
||||
static TORCH_API c10::TypePtr get(const IValue&);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
|
||||
@ -44,7 +44,7 @@ constexpr int checkStaticTypes() {
|
||||
}
|
||||
|
||||
template <typename... Ts, size_t... Is>
|
||||
constexpr std::array<ArgumentDef, sizeof...(Ts)> createArgumentVectorFromTypes(std::index_sequence<Is...> /*unused*/) {
|
||||
constexpr std::array<ArgumentDef, sizeof...(Ts)> createArgumentVectorFromTypes(std::index_sequence<Is...>) {
|
||||
return (
|
||||
// Check types for common errors
|
||||
checkStaticTypes<Ts...>(),
|
||||
|
||||
@ -83,7 +83,7 @@ inline bool operator!=(const OperatorName& lhs, const OperatorName& rhs) {
|
||||
}
|
||||
|
||||
TORCH_API std::string toString(const OperatorName& opName);
|
||||
TORCH_API std::ostream& operator<<(std::ostream& /*os*/, const OperatorName& /*opName*/);
|
||||
TORCH_API std::ostream& operator<<(std::ostream&, const OperatorName&);
|
||||
|
||||
} // namespace c10
|
||||
|
||||
|
||||
@ -16,7 +16,7 @@ class SingletonTypePtr {
|
||||
/* implicit */ SingletonTypePtr(T* p) : repr_(p) {}
|
||||
|
||||
// We need this to satisfy Pybind11, but it shouldn't be hit.
|
||||
explicit SingletonTypePtr(std::shared_ptr<T> /*unused*/) { TORCH_CHECK(false); }
|
||||
explicit SingletonTypePtr(std::shared_ptr<T>) { TORCH_CHECK(false); }
|
||||
|
||||
using element_type = typename std::shared_ptr<T>::element_type;
|
||||
|
||||
|
||||
@ -102,8 +102,31 @@ struct VecReduceAllSIMD<float, Op> {
|
||||
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
|
||||
// !defined(C10_MOBILE)
|
||||
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
#if defined(CPU_CAPABILITY_SVE256)
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
const Op& vec_fun,
|
||||
const Vectorized<float>& acc_vec) {
|
||||
using Vec = Vectorized<float>;
|
||||
Vec v = acc_vec;
|
||||
// 128-bit shuffle
|
||||
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
|
||||
Vec v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 64-bit shuffle
|
||||
ind = svdupq_n_u32(2, 3, 0, 1);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 32-bit shuffle
|
||||
ind = svdupq_n_u32(1, 0, 2, 3);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
return svlasta(svpfalse(), v);
|
||||
}
|
||||
};
|
||||
#else
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
@ -140,35 +163,8 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
|
||||
return vaddvq_f32(acc_vec);
|
||||
}
|
||||
};
|
||||
#endif // defined(CPU_CAPABILITY_SVE256)
|
||||
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
// && !defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
defined(CPU_CAPABILITY_SVE256)
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(
const Op& vec_fun,
const Vectorized<float>& acc_vec) {
using Vec = Vectorized<float>;
Vec v = acc_vec;
// 128-bit shuffle
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
Vec v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
// 64-bit shuffle
ind = svdupq_n_u32(2, 3, 0, 1);
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
// 32-bit shuffle
ind = svdupq_n_u32(1, 0, 2, 3);
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
return svlasta(svpfalse(), v);
}
};
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
// && defined(CPU_CAPABILITY_SVE256)

template <typename scalar_t, typename Op>
inline scalar_t vec_reduce_all(

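A scalar illustration of the shuffle-based reduction used in the SVE256 specialization above: each shuffle-and-combine step halves the number of lanes that still matter, so three steps reduce eight float lanes. The sketch uses max, but any associative vec_fun works the same way.

#include <algorithm>

float reduce8_max(float v[8]) {
  // stride 4, 2, 1 mirrors the 128-, 64- and 32-bit shuffles above.
  for (int stride = 4; stride >= 1; stride /= 2) {
    for (int i = 0; i < stride; ++i) {
      v[i] = std::max(v[i], v[i + stride]);
    }
  }
  return v[0];  // the vector version extracts lane 0 via svlasta(svpfalse(), v)
}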
@ -1,9 +1,21 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <cstdint>
|
||||
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#if defined(__aarch64__) && \
|
||||
(defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) || \
|
||||
defined(AT_BUILD_ARM_VECSVE_WITH_SLEEF))
|
||||
#define SLEEF_STATIC_LIBS
|
||||
#include <sleef.h>
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
|
||||
#else
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
|
||||
#endif
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
// Define the data type of VLS(vector-length specific).
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/sve/vec_common_sve.h>
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/util/bit_cast.h>
|
||||
@ -308,8 +307,8 @@ Vectorized<c10::BFloat16> inline operator/(
|
||||
}
|
||||
|
||||
inline Vectorized<BFloat16>::Vectorized() {
|
||||
auto vals_f = svdup_n_f32(0);
|
||||
values = convert_float_bfloat16(vals_f, vals_f);
|
||||
const short zero = 0;
|
||||
values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero));
|
||||
}
|
||||
|
||||
inline Vectorized<BFloat16>::Vectorized(int val) {
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
#pragma once
|
||||
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
#if defined(__aarch64__)
|
||||
#include <ATen/cpu/vec/vec_common_aarch64.h>
|
||||
#elif defined(CPU_CAPABILITY_AVX512)
|
||||
#include <ATen/cpu/vec/vec512/vec512.h>
|
||||
#else
|
||||
#include <ATen/cpu/vec/vec128/vec128.h>
|
||||
@ -11,6 +13,34 @@ namespace at::vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
|
||||
stream << val.val_;
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
|
||||
stream << static_cast<int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
|
||||
stream << static_cast<unsigned int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
|
||||
template <typename T>
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
T buf[Vectorized<T>::size()];
vec.store(buf);
stream << "vec[";
for (int i = 0; i != Vectorized<T>::size(); i++) {
if (i != 0) {
stream << ", ";
}
stream << buf[i];
}
stream << "]";
return stream;
}

inline Vectorized<bool> convert_to_bool(Vectorized<int8_t> x) {
__at_align__ bool buffer[x.size()];
x.ne(Vectorized<int8_t>(0)).store(buffer);

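A hedged usage sketch for the stream operator above; print_lanes is illustrative. Printing a Vectorized<T> stores the register to a stack buffer and renders every lane.

#include <ATen/cpu/vec/vec.h>
#include <iostream>

void print_lanes() {
  auto v = at::vec::Vectorized<float>(1.5f);  // broadcast 1.5 to every lane
  std::cout << v << '\n';                     // prints e.g. vec[1.5, 1.5, ...]
}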
@ -2,6 +2,7 @@
|
||||
|
||||
// DO NOT DEFINE STATIC DATA IN THIS HEADER!
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
@ -262,6 +263,13 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
c10::bit_cast<at_bfloat16_t>(val6.x),
|
||||
c10::bit_cast<at_bfloat16_t>(val7.x)}) {}
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
Vectorized(svbfloat16_t v) : Vectorized16(svget_neonq(v)) {}
|
||||
operator svbfloat16_t() const {
|
||||
return svset_neonq(svundef_bf16(), values);
|
||||
}
|
||||
#endif
|
||||
|
||||
static Vectorized<c10::BFloat16> blendv(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
@ -374,6 +382,23 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
Vectorized ge(const Vectorized& other) const;
|
||||
Vectorized lt(const Vectorized& other) const;
|
||||
Vectorized le(const Vectorized& other) const;
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
|
||||
template <typename step_t>
|
||||
static Vectorized<BFloat16> arange(
|
||||
BFloat16 base = 0.f,
|
||||
step_t step = static_cast<step_t>(1)) {
|
||||
__at_align__ BFloat16 buffer[size()];
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
buffer[i] = base + i * step;
|
||||
}
|
||||
return svget_neonq(
|
||||
svld1_bf16(ptrue, reinterpret_cast<bfloat16_t*>(buffer)));
|
||||
}
|
||||
|
||||
#endif // CPU_CAPABILITY_SVE128
|
||||
|
||||
}; // Vectorized<c10::BFloat16>
|
||||
|
||||
inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
|
||||
@ -397,6 +422,24 @@ inline Vectorized<c10::BFloat16> convert_float_bfloat16(
|
||||
return Vectorized<c10::BFloat16>(at_vcombine_bf16(x1, x2));
|
||||
}
|
||||
|
||||
inline void load_fp32_from_bf16(const BFloat16* data, Vectorized<float>& out) {
__at_align__ float values[Vectorized<float>::size()];
for (const auto k : c10::irange(Vectorized<float>::size())) {
values[k] = data[k];
}
out = Vectorized<float>::loadu(values);
}

inline void load_fp32_from_bf16(
const BFloat16* data,
Vectorized<float>& out1,
Vectorized<float>& out2) {
Vectorized<BFloat16> bf16_vec = Vectorized<BFloat16>::loadu(data);
auto floats = convert_bfloat16_float(bf16_vec);
out1 = std::get<0>(floats);
out2 = std::get<1>(floats);
}

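A hedged usage sketch of the helpers above, assuming an aarch64 build where they are compiled; widen_bf16 is illustrative. One Vectorized<BFloat16> covers two Vectorized<float> worth of elements, so the two-output overload fills both.

#include <ATen/cpu/vec/vec.h>

void widen_bf16(const c10::BFloat16* src, float* dst) {
  // `src` must hold at least 2 * Vectorized<float>::size() elements.
  at::vec::Vectorized<float> lo, hi;
  at::vec::load_fp32_from_bf16(src, lo, hi);
  lo.store(dst);
  hi.store(dst + at::vec::Vectorized<float>::size());
}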
template <typename Op>
|
||||
Vectorized<c10::BFloat16> binary_operator_via_float(
|
||||
Op op,
|
||||
@ -579,6 +622,12 @@ Vectorized<c10::BFloat16> inline fnmsub(
|
||||
return -a * b - c;
|
||||
}
|
||||
|
||||
#else //
|
||||
|
||||
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
|
||||
|
||||
LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16)
|
||||
|
||||
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
|
||||
namespace at::vec {
|
||||
inline namespace CPU_CAPABILITY {
|
||||
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
|
||||
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
|
||||
template <typename src_t>
|
||||
struct VecConvert<
|
||||
float,
|
||||
@ -60,6 +60,7 @@ struct VecConvert<float, 1, BFloat16, 1> {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
|
||||
#endif // defined(__aarch64__) && (!defined(CPU_CAPABILITY_SVE) ||
|
||||
// defined(CPU_CAPABILITY_SVE128))
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
||||
@ -4,13 +4,10 @@
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
#include <sleef.h>
|
||||
#endif
|
||||
|
||||
// Sleef offers vectorized versions of some transcedentals
|
||||
// such as sin, cos, tan etc..
|
||||
// However for now opting for STL, since we are not building
|
||||
@ -35,12 +32,6 @@ inline namespace CPU_CAPABILITY {
|
||||
#error "Big endian is not supported."
|
||||
#endif
|
||||
|
||||
#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
|
||||
#else
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
|
||||
#endif
|
||||
|
||||
template <int index, bool mask_val>
|
||||
struct BlendRegs {
|
||||
static float32x4_t impl(
|
||||
@ -94,6 +85,12 @@ class Vectorized<float> {
|
||||
operator float32x4_t() const {
|
||||
return values;
|
||||
}
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
Vectorized(svfloat32_t v) : values(svget_neonq(v)) {}
|
||||
operator svfloat32_t() const {
|
||||
return svset_neonq(svundef_f32(), values);
|
||||
}
|
||||
#endif
|
||||
template <int64_t mask>
|
||||
static Vectorized<float> blend(
|
||||
const Vectorized<float>& a,
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
@ -25,7 +24,6 @@ inline namespace CPU_CAPABILITY {
|
||||
// https://bugs.llvm.org/show_bug.cgi?id=45824
|
||||
// Most likely we will do aarch32 support with inline asm.
|
||||
#if !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
#ifdef __BIG_ENDIAN__
|
||||
#error "Big endian is not supported."
|
||||
#endif
|
||||
@ -421,6 +419,24 @@ Vectorized<c10::Half> inline operator+(
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void load_fp32_from_fp16(const c10::Half* data, Vectorized<float>& out) {
|
||||
__at_align__ float values[Vectorized<float>::size()];
|
||||
for (const auto k : c10::irange(Vectorized<float>::size())) {
|
||||
values[k] = data[k];
|
||||
}
|
||||
out = Vectorized<float>::loadu(values);
|
||||
}
|
||||
|
||||
inline void load_fp32_from_fp16(
|
||||
const c10::Half* data,
|
||||
Vectorized<float>& out1,
|
||||
Vectorized<float>& out2) {
|
||||
Vectorized<c10::Half> f16_vec = Vectorized<c10::Half>::loadu(data);
|
||||
auto floats = convert_half_float(f16_vec);
|
||||
out1 = std::get<0>(floats);
|
||||
out2 = std::get<1>(floats);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::Half> inline operator-(
|
||||
const Vectorized<c10::Half>& a,
|
||||
@ -656,6 +672,53 @@ Vectorized<c10::Half> inline fnmsub(
|
||||
return -a * b - c;
|
||||
#endif
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define CONVERT_NON_VECTORIZED_INIT(type, name) \
|
||||
inline std::tuple<Vectorized<float>, Vectorized<float>> \
|
||||
convert_##name##_float(const Vectorized<type>& a) { \
|
||||
constexpr int64_t K = Vectorized<type>::size(); \
|
||||
__at_align__ float arr[K]; \
|
||||
__at_align__ type arr2[K]; \
|
||||
a.store(arr2); \
|
||||
convert(arr2, arr, K); \
|
||||
return std::make_tuple( \
|
||||
Vectorized<float>::loadu(arr), \
|
||||
Vectorized<float>::loadu(arr + Vectorized<float>::size())); \
|
||||
} \
|
||||
inline Vectorized<type> convert_float_##name( \
|
||||
const Vectorized<float>& a, const Vectorized<float>& b) { \
|
||||
constexpr int64_t K = Vectorized<type>::size(); \
|
||||
__at_align__ float arr[K]; \
|
||||
__at_align__ type arr2[K]; \
|
||||
a.store(arr); \
|
||||
b.store(arr + Vectorized<float>::size()); \
|
||||
convert(arr, arr2, K); \
|
||||
return Vectorized<type>::loadu(arr2); \
|
||||
}
|
||||
|
||||
#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
|
||||
inline void load_fp32_from_##name( \
|
||||
const type* data, Vectorized<float>& out) { \
|
||||
__at_align__ float values[Vectorized<float>::size()]; \
|
||||
for (const auto k : c10::irange(Vectorized<float>::size())) { \
|
||||
values[k] = data[k]; \
|
||||
} \
|
||||
out = Vectorized<float>::loadu(values); \
|
||||
} \
|
||||
\
|
||||
inline void load_fp32_from_##name( \
|
||||
const type* data, Vectorized<float>& out1, Vectorized<float>& out2) { \
|
||||
load_fp32_from_##name(data, out1); \
|
||||
data += Vectorized<float>::size(); \
|
||||
load_fp32_from_##name(data, out2); \
|
||||
}
|
||||
|
||||
CONVERT_NON_VECTORIZED_INIT(Half, half)
|
||||
|
||||
LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16)
|
||||
|
||||
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
||||
@ -9,21 +9,16 @@
|
||||
#if !( \
|
||||
defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || \
|
||||
defined(CPU_CAPABILITY_ZVECTOR))
|
||||
#if defined(CPU_CAPABILITY_SVE256)
|
||||
#include <ATen/cpu/vec/sve/vec_common_sve.h>
|
||||
#else
|
||||
// clang-format off
|
||||
#include <ATen/cpu/vec/vec256/vec256_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_double.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_int.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_qint.h>
|
||||
#endif
|
||||
#if !defined(CPU_CAPABILITY_SVE256) || !defined(__ARM_FEATURE_BF16)
|
||||
#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
|
||||
#endif
|
||||
#include <ATen/cpu/vec/vec256/vec256_half.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_half.h>
|
||||
// clang-format on
|
||||
#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX)
|
||||
#include <ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h>
|
||||
@ -56,34 +51,6 @@ namespace at::vec {
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
|
||||
stream << val.val_;
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
|
||||
stream << static_cast<int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
|
||||
stream << static_cast<unsigned int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
T buf[Vectorized<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vectorized<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
#if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -268,9 +268,7 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16)
|
||||
|
||||
#else // defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if !( \
|
||||
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE256))
|
||||
#if !(defined(__aarch64__))
|
||||
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
|
||||
#endif
|
||||
|
||||
|
||||
@ -342,19 +342,19 @@ class Vectorized<c10::complex<double>> {
|
||||
return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ);
|
||||
}
|
||||
Vectorized<c10::complex<double>> operator<(
|
||||
const Vectorized<c10::complex<double>>& /*unused*/) const {
|
||||
const Vectorized<c10::complex<double>>&) const {
|
||||
TORCH_CHECK(false, "not supported for complex numbers");
|
||||
}
|
||||
Vectorized<c10::complex<double>> operator<=(
|
||||
const Vectorized<c10::complex<double>>& /*unused*/) const {
|
||||
const Vectorized<c10::complex<double>>&) const {
|
||||
TORCH_CHECK(false, "not supported for complex numbers");
|
||||
}
|
||||
Vectorized<c10::complex<double>> operator>(
|
||||
const Vectorized<c10::complex<double>>& /*unused*/) const {
|
||||
const Vectorized<c10::complex<double>>&) const {
|
||||
TORCH_CHECK(false, "not supported for complex numbers");
|
||||
}
|
||||
Vectorized<c10::complex<double>> operator>=(
|
||||
const Vectorized<c10::complex<double>>& /*unused*/) const {
|
||||
const Vectorized<c10::complex<double>>&) const {
|
||||
TORCH_CHECK(false, "not supported for complex numbers");
|
||||
}
|
||||
|
||||
|
||||
@ -268,9 +268,7 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16)
|
||||
|
||||
#else // defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if !( \
|
||||
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE256))
|
||||
#if !defined(__aarch64__) || defined(CPU_CAPABILITY_SVE256)
|
||||
CONVERT_NON_VECTORIZED_INIT(Half, half)
|
||||
#endif
|
||||
|
||||
|
||||
@ -5,6 +5,13 @@
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#ifdef __aarch64__
|
||||
#if defined(CPU_CAPABILITY_SVE128) || !defined(CPU_CAPABILITY_SVE)
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <ATen/native/quantized/AffineQuantizerBase.h>
|
||||
|
||||
#include <c10/util/irange.h>
|
||||
@ -915,7 +922,7 @@ Vectorized<c10::quint8> inline maximum(
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
#elif !defined(CPU_CAPABILITY_SVE256)
|
||||
#else
|
||||
|
||||
// NOTE: These are low-performance implementations that we fall back on
|
||||
// if we are not building with AVX2. This may not be an issue, because
|
||||
@ -1372,12 +1379,18 @@ Vectorized<c10::quint8> inline maximum(
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
#endif // if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
|
||||
#if defined(__aarch64__) && \
|
||||
(defined(CPU_CAPABILITY_SVE128) || !defined(CPU_CAPABILITY_SVE))
|
||||
std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
|
||||
at::vec::Vectorized<int8_t> src) {
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE
|
||||
svint8_t x = src;
|
||||
auto s8x8 = vget_low_s8(svget_neonq(x));
|
||||
#else
|
||||
auto s8x8 = vld1_s8(src.operator const int8_t*());
|
||||
#endif
|
||||
|
||||
auto s16x8 = vmovl_s8(s8x8);
|
||||
|
||||
auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
|
||||
@ -1402,7 +1415,14 @@ std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
|
||||
|
||||
Vectorized<float> inline convert_int8_half_register_to_float(
|
||||
at::vec::Vectorized<int8_t> src) {
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE
|
||||
svint8_t x = src;
|
||||
auto s8x8 = vget_low_s8(svget_neonq(x));
|
||||
#else
|
||||
auto s8x8 = vld1_s8(src.operator const int8_t*());
|
||||
#endif
|
||||
|
||||
auto s16x8 = vmovl_s8(s8x8);
|
||||
|
||||
auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));
|
||||
@ -1420,5 +1440,8 @@ Vectorized<float> inline convert_int8_half_register_to_float(
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif // if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
||||
@ -31,34 +31,6 @@ namespace vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
|
||||
stream << val.val_;
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
|
||||
stream << static_cast<int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
|
||||
stream << static_cast<unsigned int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
T buf[Vectorized<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vectorized<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512)
|
||||
|
||||
@ -67,18 +67,7 @@ Windows llvm will not have this definition.
|
||||
#endif
|
||||
#define VECTOR_WIDTH 64
|
||||
#define int_vector __m512i
|
||||
#elif defined(__aarch64__) && \
|
||||
!defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512
|
||||
// SVE code expects 256-vectors; leave that set for SVE?
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(16)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(16))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 16
|
||||
#else // CPU_CAPABILITY_AVX512
|
||||
#elif defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_SVE256)
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(32)))
|
||||
#elif defined(_WIN32)
|
||||
@ -88,7 +77,27 @@ Windows llvm will not have this definition.
|
||||
#endif
|
||||
#define VECTOR_WIDTH 32
|
||||
#define int_vector __m256i
|
||||
#endif // CPU_CAPABILITY_AVX512
|
||||
#elif defined(__aarch64__)
|
||||
// Define alignment and vector width for SVE128/Default (e.g., NEON)
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(16)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(16))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 16
|
||||
#else
|
||||
// Fallback: define default alignment and vector width
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(32)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(32))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 32
|
||||
#endif
|
||||
|
||||
namespace at::vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
|
||||
@ -8,13 +8,48 @@
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#include <ATen/cpu/vec/sve/vec_bfloat16.h>
|
||||
#include <ATen/cpu/vec/sve/vec_double.h>
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
#include <ATen/cpu/vec/sve/vec_int.h>
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_qint.h>
|
||||
#endif
|
||||
|
||||
#elif defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_bfloat16.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_double.h>
|
||||
#include <ATen/cpu/vec/sve/vec_int.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_qint.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec256/vec256_half.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec256/vec256_convert.h>
|
||||
|
||||
#else // NEON
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec256/vec256_qint.h>
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE128)
|
||||
|
||||
#include <ATen/cpu/vec/functional.h>
|
||||
|
||||
namespace at::vec {
|
||||
// Note [CPU_CAPABILITY namespace]
|
||||
@ -48,12 +83,6 @@ DEFINE_SVE_CAST(int32_t, s32, float, f32)
|
||||
DEFINE_SVE_CAST(int16_t, s16, float, f32)
|
||||
DEFINE_SVE_CAST(float, f32, double, f64)
|
||||
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
DEFINE_SVE_CAST(int64_t, s64, c10::BFloat16, bf16)
|
||||
DEFINE_SVE_CAST(int32_t, s32, c10::BFloat16, bf16)
|
||||
DEFINE_SVE_CAST(int16_t, s16, c10::BFloat16, bf16)
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
template <int64_t scale = 1>
|
||||
@ -173,9 +202,11 @@ std::pair<
|
||||
// group cols crossing lanes:
|
||||
// return {a0, b0, a1, b1, a2, b2, a3, b3}
|
||||
// {a4, b4, a5, b5, a6, b6, a7, b7}
|
||||
return std::make_pair(
|
||||
Vectorized<c10::BFloat16>(svzip1_bf16(a, b)),
|
||||
Vectorized<c10::BFloat16>(svzip2_bf16(a, b)));
|
||||
svbfloat16_t aReg = a;
|
||||
svbfloat16_t bReg = b;
|
||||
Vectorized<c10::BFloat16> c = svzip1_bf16(aReg, bReg);
|
||||
Vectorized<c10::BFloat16> d = svzip2_bf16(aReg, bReg);
|
||||
return std::make_pair(c, d);
|
||||
}
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
@ -224,12 +255,27 @@ std::pair<
|
||||
// swap lanes:
|
||||
// return {a0, a1, a2, a3, a4, a5, a6, a7}
|
||||
// {b0, b1, b2, b3, b4, b5, b6, b7}
|
||||
return std::make_pair(
|
||||
Vectorized<c10::BFloat16>(svuzp1_bf16((svbfloat16_t)a, (svbfloat16_t)b)),
|
||||
Vectorized<c10::BFloat16>(svuzp2_bf16((svbfloat16_t)a, (svbfloat16_t)b)));
|
||||
svbfloat16_t aReg = a;
|
||||
svbfloat16_t bReg = b;
|
||||
Vectorized<c10::BFloat16> c = svuzp1_bf16(aReg, bReg);
|
||||
Vectorized<c10::BFloat16> d = svuzp2_bf16(aReg, bReg);
|
||||
return std::make_pair(c, d);
|
||||
}
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define DEFINE_FLIP_FUNC(type, sve_func) \
inline Vectorized<type> flip(const Vectorized<type>& v) { \
return Vectorized<type>(sve_func(v)); \
}
// Use the macro to define the flip functions
DEFINE_FLIP_FUNC(float, svrev_f32)
DEFINE_FLIP_FUNC(double, svrev_f64)
DEFINE_FLIP_FUNC(int64_t, svrev_s64)
DEFINE_FLIP_FUNC(int32_t, svrev_s32)
DEFINE_FLIP_FUNC(int16_t, svrev_s16)
DEFINE_FLIP_FUNC(int8_t, svrev_s8)

#endif // defined(CPU_CAPABILITY_SVE)
} // namespace CPU_CAPABILITY

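A hedged usage sketch for the DEFINE_FLIP_FUNC helpers above, assuming an SVE build where these overloads exist; reverse_lanes is illustrative. flip() reverses the lane order of a vector register.

#include <ATen/cpu/vec/vec.h>

void reverse_lanes(const float* in, float* out) {
  auto v = at::vec::Vectorized<float>::loadu(in);  // lanes: in[0], in[1], ...
  at::vec::flip(v).store(out);                     // lanes written in reverse order
}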
@ -422,34 +422,18 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
|
||||
abType = CUDA_R_16F;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
|
||||
#ifndef USE_ROCM
|
||||
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
|
||||
if (fp16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
fp16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
}
|
||||
#endif
|
||||
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
|
||||
abType = CUDA_R_16BF;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
|
||||
#ifndef USE_ROCM
|
||||
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
|
||||
if (bf16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
bf16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||

@ -1136,15 +1120,8 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
  }
  if (prop->major >= 5) {
    cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
    auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
    TORCH_CHECK(fp16_reduction !=
        at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
        "torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
        "..., allow_splitk=False) requires the cuBLASLt backend");
    if (fp16_reduction !=
        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
      cublas_flags = static_cast<cublasMath_t>(
          cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
    if (!at::globalContext().allowFP16ReductionCuBLAS()) {
      cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
    }
    // Disallow fp16 reductions that could lead to unexpected overflow issues.
    TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));

@ -1203,15 +1180,8 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT
  GEMM_CHECK_ARGVALUES(at::BFloat16);
#ifndef USE_ROCM
  cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
  auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
  TORCH_CHECK(bf16_reduction !=
      at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
      "torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
      "..., allow_splitk=False) requires the cuBLASLt backend");
  if (bf16_reduction !=
      at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
    cublas_flags = static_cast<cublasMath_t>(
        cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
  if (!at::globalContext().allowBF16ReductionCuBLAS()) {
    cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
  }
#endif
#if defined(USE_ROCM)

@ -1300,7 +1270,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
  }
#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
  else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
    if (at::detail::getCUDAHooks().isGPUArch({"gfx11", "gfx12"})) { // no CK GEMM version
    if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { // no CK GEMM version for gfx1100
      gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
    } else {
      at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));

@ -1607,34 +1577,18 @@ bool gemm_and_bias(
    abType = CUDA_R_16F;
    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
#ifndef USE_ROCM
    auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
    if (fp16_reduction !=
        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
      uint32_t mask =
          fp16_reduction ==
                  at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
              ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
                 CUBLASLT_REDUCTION_SCHEME_NONE)
              : CUBLASLT_REDUCTION_SCHEME_NONE;
      preference.setAttribute(
          CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
    if (!at::globalContext().allowFP16ReductionCuBLAS()) {
      preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
          CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
    }
#endif
  } else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
    abType = CUDA_R_16BF;
    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
#ifndef USE_ROCM
    auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
    if (bf16_reduction !=
        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
      uint32_t mask =
          bf16_reduction ==
                  at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
              ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
                 CUBLASLT_REDUCTION_SCHEME_NONE)
              : CUBLASLT_REDUCTION_SCHEME_NONE;
      preference.setAttribute(
          CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
    if (!at::globalContext().allowBF16ReductionCuBLAS()) {
      preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
          CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
    }
#endif
}
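
Both variants in the reduction-handling hunks above encode the same policy: the longer branch derives a cuBLASLt reduction-scheme mask from at::CuBLASReductionOption, while the shorter branch folds the decision into a single boolean check. As a hedged restatement of the mask selection only (reductionSchemeMask is a hypothetical helper name, not present on either side of the diff):

// Hypothetical helper (illustration only): the mask written to
// CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK by the longer branch above.
inline uint32_t reductionSchemeMask(at::CuBLASReductionOption opt) {
  switch (opt) {
    case at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK:
      // keep split-K, but force intermediate sums into the compute type
      return CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE;
    case at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK:
      // forbid reduced-precision reduction schemes entirely (no split-K either)
      return CUBLASLT_REDUCTION_SCHEME_NONE;
    case at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK:
    default:
      // the hunks set no mask in this case; shown here as the all-schemes mask
      return CUBLASLT_REDUCTION_SCHEME_MASK;
  }
}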

@ -1861,8 +1815,6 @@ template bool gemm_and_bias(
    int64_t result_ld,
    GEMMAndBiasActivationEpilogue activation);

using at::blas::ScalingType;

int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fast_accum) {
  switch (scaling_type) {
    case ScalingType::BlockWise1x32:

@ -14,7 +14,6 @@
 */

#include <ATen/cuda/CUDAContext.h>
#include <ATen/BlasBackend.h>
#include <ATen/OpMathType.h>

namespace at::cuda::blas {

@ -137,6 +136,15 @@ void int8_gemm(
    int32_t* result_ptr,
    int64_t result_ld);

enum class ScalingType : std::uint8_t {
  TensorWise,       // fp32 scales
  RowWise,          // fp32 scales
  BlockWise1x16,    // fp8_e4m3fn scales
  BlockWise1x32,    // fp8_e8m0fnu scales
  BlockWise1x128,   // fp32 scales
  BlockWise128x128, // fp32 scales
};
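
// Illustrative sketch (not declared in this header): the scale dtype implied by
// the comments on each ScalingType enumerator above. expected_scale_dtype and
// the exact ScalarType spellings are assumptions for illustration only.
inline ScalarType expected_scale_dtype(ScalingType t) {
  switch (t) {
    case ScalingType::BlockWise1x16:
      return ScalarType::Float8_e4m3fn;  // fp8_e4m3fn scales
    case ScalingType::BlockWise1x32:
      return ScalarType::Float8_e8m0fnu; // fp8_e8m0fnu scales
    default:
      return ScalarType::Float;          // fp32 scales (TensorWise, RowWise, 1x128, 128x128)
  }
}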

void scaled_gemm(
    char transa,
    char transb,

@ -148,13 +156,13 @@ void scaled_gemm(
    int64_t mat1_ld,
    ScalarType mat1_dtype,
    ScalarType mat1_scale_dtype,
    at::blas::ScalingType mat1_scaling_type,
    ScalingType mat1_scaling_type,
    const void* mat2_ptr,
    const void* mat2_scale_ptr,
    int64_t mat2_ld,
    ScalarType mat2_dtype,
    ScalarType mat2_scale_dtype,
    at::blas::ScalingType mat2_scaling_type,
    ScalingType mat2_scaling_type,
    const void* bias_ptr,
    ScalarType bias_dtype,
    void* result_ptr,

Some files were not shown because too many files have changed in this diff.