mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Update
[ghstack-poisoned]
This commit is contained in:
@ -181,7 +181,7 @@ case "$tag" in
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
|
||||
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100"
|
||||
if [[ $tag =~ "benchmarks" ]]; then
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
fi
|
||||
@ -344,7 +344,7 @@ docker build \
|
||||
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
|
||||
--build-arg "KATEX=${KATEX:-}" \
|
||||
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
|
||||
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx1100}" \
|
||||
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
|
||||
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
|
||||
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
|
||||
|
@ -1 +1 @@
|
||||
e0dda9059d082537cee36be6c5e4fe3b18c880c0
|
||||
deb42f2a8e48f5032b4a98ee781a15fa87a157cf
|
||||
|
@ -1 +1 @@
|
||||
27664085f804afc83df26f740bb46c365854f2c4
|
||||
7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
|
||||
|
@ -46,9 +46,9 @@ case ${DOCKER_TAG_PREFIX} in
|
||||
BASE_TARGET=rocm
|
||||
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
|
||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||
# add gfx950 conditionally starting in ROCm 7.0
|
||||
# add gfx950, gfx115x conditionally starting in ROCm 7.0
|
||||
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
|
||||
fi
|
||||
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
|
||||
;;
|
||||
|
@ -84,9 +84,9 @@ case ${image} in
|
||||
DEVTOOLSET_VERSION="11"
|
||||
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
|
||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||
# add gfx950 conditionally starting in ROCm 7.0
|
||||
# add gfx950, gfx115x conditionally starting in ROCm 7.0
|
||||
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
|
||||
fi
|
||||
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
|
||||
;;
|
||||
|
@ -10,11 +10,6 @@ BAD_SSL = "https://self-signed.badssl.com"
|
||||
|
||||
print("Testing SSL certificate checking for Python:", sys.version)
|
||||
|
||||
if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
|
||||
print("This version never checks SSL certs; skipping tests")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
EXC = OSError
|
||||
|
||||
print(f"Connecting to {GOOD_SSL} should work")
|
||||
|
@ -233,7 +233,9 @@ if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
|
||||
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
|
||||
fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" == *-full-debug* ]]; then
|
||||
export CMAKE_BUILD_TYPE=Debug
|
||||
elif [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
|
||||
export CMAKE_BUILD_TYPE=RelWithAssert
|
||||
fi
|
||||
|
||||
@ -299,6 +301,11 @@ else
|
||||
python -m build --wheel --no-isolation
|
||||
fi
|
||||
pip_install_whl "$(echo dist/*.whl)"
|
||||
if [[ "$BUILD_ENVIRONMENT" == *full-debug* ]]; then
|
||||
# Regression test for https://github.com/pytorch/pytorch/issues/164297
|
||||
# Torch should be importable and that's about it
|
||||
pushd /; python -c "import torch;print(torch.__config__.show(), torch.randn(5) + 1.7)"; popd
|
||||
fi
|
||||
|
||||
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *vision* ]]; then
|
||||
install_torchvision
|
||||
|
@ -256,7 +256,7 @@ test_torchbench_smoketest() {
|
||||
local device=mps
|
||||
local dtypes=(undefined float16 bfloat16 notset)
|
||||
local dtype=${dtypes[$1]}
|
||||
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
|
||||
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
|
||||
|
||||
for backend in eager inductor; do
|
||||
|
||||
@ -319,7 +319,7 @@ test_aoti_torchbench_smoketest() {
|
||||
local device=mps
|
||||
local dtypes=(undefined float16 bfloat16 notset)
|
||||
local dtype=${dtypes[$1]}
|
||||
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
|
||||
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
|
||||
|
||||
echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
|
||||
local dtype_arg="--${dtype}"
|
||||
|
@ -337,13 +337,13 @@ test_python() {
|
||||
|
||||
test_python_smoke() {
|
||||
# Smoke tests for H100/B200
|
||||
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
test_python_smoke_b200() {
|
||||
# Targeted smoke tests for B200 - staged approach to avoid too many failures
|
||||
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
@ -838,7 +838,7 @@ test_dynamo_benchmark() {
|
||||
elif [[ "${suite}" == "timm_models" ]]; then
|
||||
export TORCHBENCH_ONLY_MODELS="inception_v3"
|
||||
elif [[ "${suite}" == "torchbench" ]]; then
|
||||
export TORCHBENCH_ONLY_MODELS="hf_Bert"
|
||||
export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
|
||||
fi
|
||||
fi
|
||||
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
|
||||
@ -869,13 +869,13 @@ test_inductor_torchbench_smoketest_perf() {
|
||||
mkdir -p "$TEST_REPORTS_DIR"
|
||||
|
||||
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
|
||||
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
|
||||
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \
|
||||
--output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
|
||||
# The threshold value needs to be actively maintained to make this check useful
|
||||
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
|
||||
|
||||
# Check memory compression ratio for a few models
|
||||
for test in hf_Albert timm_vision_transformer; do
|
||||
for test in BERT_pytorch yolov3; do
|
||||
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
|
||||
--disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
|
||||
--only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
|
||||
|
@ -15,37 +15,35 @@ if errorlevel 1 exit /b 1
|
||||
if not errorlevel 0 exit /b 1
|
||||
|
||||
cd %TMP_DIR_WIN%\build\torch\test
|
||||
|
||||
:: Enable delayed variable expansion to make the list
|
||||
setlocal enabledelayedexpansion
|
||||
set EXE_LIST=
|
||||
for /r "." %%a in (*.exe) do (
|
||||
call :libtorch_check "%%~na" "%%~fa"
|
||||
if "%%~na" == "c10_intrusive_ptr_benchmark" (
|
||||
@REM NB: This is not a gtest executable file, thus couldn't be handled by
|
||||
@REM pytest-cpp and is excluded from test discovery by run_test
|
||||
call "%%~fa"
|
||||
if errorlevel 1 goto fail
|
||||
if not errorlevel 0 goto fail
|
||||
) else (
|
||||
if "%%~na" == "verify_api_visibility" (
|
||||
@REM Skip verify_api_visibility as it is a compile-level test
|
||||
) else (
|
||||
set EXE_LIST=!EXE_LIST! cpp/%%~na
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
goto :eof
|
||||
|
||||
:libtorch_check
|
||||
|
||||
cd %CWD%
|
||||
set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test
|
||||
|
||||
:: Skip verify_api_visibility as it a compile level test
|
||||
if "%~1" == "verify_api_visibility" goto :eof
|
||||
:: Run python test\run_test.py on the list
|
||||
set NO_TD=True && python test\run_test.py --cpp --verbose -i !EXE_LIST!
|
||||
if errorlevel 1 goto fail
|
||||
if not errorlevel 0 goto fail
|
||||
|
||||
echo Running "%~2"
|
||||
if "%~1" == "c10_intrusive_ptr_benchmark" (
|
||||
:: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp
|
||||
call "%~2"
|
||||
goto :eof
|
||||
)
|
||||
|
||||
python test\run_test.py --cpp --verbose -i "cpp/%~1"
|
||||
if errorlevel 1 (
|
||||
echo %1 failed with exit code %errorlevel%
|
||||
goto fail
|
||||
)
|
||||
if not errorlevel 0 (
|
||||
echo %1 failed with exit code %errorlevel%
|
||||
goto fail
|
||||
)
|
||||
goto :eof
|
||||
|
||||
:eof
|
||||
exit /b 0
|
||||
|
@ -71,14 +71,7 @@ export PYTORCH_BUILD_NUMBER=1
|
||||
|
||||
# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
|
||||
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
|
||||
|
||||
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
|
||||
# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
|
||||
if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux'"
|
||||
fi
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux'"
|
||||
|
||||
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
|
||||
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
|
||||
|
2
.flake8
2
.flake8
@ -12,7 +12,7 @@ ignore =
|
||||
# to line this up with executable bit
|
||||
EXE001,
|
||||
# these ignores are from flake8-bugbear; please fix!
|
||||
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910
|
||||
B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910
|
||||
# these ignores are from flake8-comprehensions; please fix!
|
||||
C407,
|
||||
# these ignores are from flake8-logging-format; please fix!
|
||||
|
2
.github/actions/linux-test/action.yml
vendored
2
.github/actions/linux-test/action.yml
vendored
@ -274,8 +274,6 @@ runs:
|
||||
-w /var/lib/jenkins/workspace \
|
||||
"${DOCKER_IMAGE}"
|
||||
)
|
||||
# Propagate download.pytorch.org IP to container
|
||||
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
|
||||
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
|
||||
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
|
||||
|
||||
|
35
.github/actions/setup-linux/action.yml
vendored
35
.github/actions/setup-linux/action.yml
vendored
@ -28,6 +28,10 @@ runs:
|
||||
echo "instance-type: $(get_ec2_metadata instance-type)"
|
||||
echo "system info $(uname -a)"
|
||||
|
||||
- name: Print GPU info (if present)
|
||||
shell: bash
|
||||
run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi
|
||||
|
||||
- name: Check if in a container runner
|
||||
shell: bash
|
||||
id: check_container_runner
|
||||
@ -82,37 +86,6 @@ runs:
|
||||
# Prune all of the docker images
|
||||
docker system prune -af
|
||||
|
||||
- name: Manually resolve download.pytorch.org
|
||||
shell: bash
|
||||
continue-on-error: true
|
||||
run: |
|
||||
set +e
|
||||
set -x
|
||||
|
||||
PT_DOMAIN=download.pytorch.org
|
||||
# TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400,
|
||||
# cleaning this up once the issue is fixed. There are more than one resolved IP here, the last
|
||||
# one is returned at random
|
||||
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)
|
||||
|
||||
if [ -z "${RESOLVED_IP}" ]; then
|
||||
echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
|
||||
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)
|
||||
|
||||
if [ -z "${RESOLVED_IP}" ]; then
|
||||
echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if grep -r "${PT_DOMAIN}" /etc/hosts; then
|
||||
# Clean up any old records first
|
||||
sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts
|
||||
fi
|
||||
|
||||
echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
|
||||
cat /etc/hosts
|
||||
|
||||
- name: Check that the docker daemon is running
|
||||
shell: bash
|
||||
continue-on-error: true
|
||||
|
@ -33,10 +33,6 @@ runs:
|
||||
)
|
||||
|
||||
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
|
||||
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then
|
||||
# Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
|
||||
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
|
||||
fi
|
||||
|
||||
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
|
||||
# Generate test script
|
||||
|
1
.github/pytorch-probot.yml
vendored
1
.github/pytorch-probot.yml
vendored
@ -30,6 +30,7 @@ ciflow_push_tags:
|
||||
- ciflow/riscv64
|
||||
- ciflow/rocm
|
||||
- ciflow/rocm-mi300
|
||||
- ciflow/rocm-mi355
|
||||
- ciflow/s390
|
||||
- ciflow/slow
|
||||
- ciflow/torchbench
|
||||
|
BIN
.github/scripts/drci_mocks.json.gz
vendored
BIN
.github/scripts/drci_mocks.json.gz
vendored
Binary file not shown.
1
.github/scripts/github_utils.py
vendored
1
.github/scripts/github_utils.py
vendored
@ -18,6 +18,7 @@ class GitHubComment:
|
||||
body_text: str
|
||||
created_at: str
|
||||
author_login: str
|
||||
author_url: Optional[str]
|
||||
author_association: str
|
||||
editor_login: Optional[str]
|
||||
database_id: int
|
||||
|
BIN
.github/scripts/gql_mocks.json.gz
vendored
BIN
.github/scripts/gql_mocks.json.gz
vendored
Binary file not shown.
2
.github/scripts/test_check_labels.py
vendored
2
.github/scripts/test_check_labels.py
vendored
@ -38,6 +38,7 @@ def mock_get_comments() -> list[GitHubComment]:
|
||||
body_text="mock_body_text",
|
||||
created_at="",
|
||||
author_login="",
|
||||
author_url=None,
|
||||
author_association="",
|
||||
editor_login=None,
|
||||
database_id=1,
|
||||
@ -48,6 +49,7 @@ def mock_get_comments() -> list[GitHubComment]:
|
||||
body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
|
||||
created_at="",
|
||||
author_login=BOT_AUTHORS[1],
|
||||
author_url=None,
|
||||
author_association="",
|
||||
editor_login=None,
|
||||
database_id=2,
|
||||
|
18
.github/scripts/test_trymerge.py
vendored
18
.github/scripts/test_trymerge.py
vendored
@ -32,6 +32,7 @@ from trymerge import (
|
||||
main as trymerge_main,
|
||||
MandatoryChecksMissingError,
|
||||
MergeRule,
|
||||
PostCommentError,
|
||||
RE_GHSTACK_DESC,
|
||||
read_merge_rules,
|
||||
remove_job_name_suffix,
|
||||
@ -588,6 +589,23 @@ class TestTryMerge(TestCase):
|
||||
self.assertEqual(mock_merge_base, pr.get_merge_base())
|
||||
mocked_gh_fetch_merge_base.assert_called_once()
|
||||
|
||||
def test_app_can_revert(self, *args: Any) -> None:
|
||||
pr = GitHubPR("pytorch", "pytorch", 164660)
|
||||
repo = DummyGitRepo()
|
||||
app_comment_id, impostor_comment_id = 3375785595, 3377647892
|
||||
# Check that app can revert
|
||||
self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id))
|
||||
# But impostor can not
|
||||
self.assertRaises(
|
||||
PostCommentError,
|
||||
lambda: validate_revert(repo, pr, comment_id=impostor_comment_id),
|
||||
)
|
||||
# Despite it's name being the name of the bot
|
||||
self.assertEqual(
|
||||
pr.get_comment_by_id(impostor_comment_id).author_login,
|
||||
"pytorch-auto-revert",
|
||||
)
|
||||
|
||||
|
||||
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
|
||||
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
|
||||
|
7
.github/scripts/trymerge.py
vendored
7
.github/scripts/trymerge.py
vendored
@ -234,6 +234,7 @@ query ($owner: String!, $name: String!, $number: Int!) {
|
||||
createdAt
|
||||
author {
|
||||
login
|
||||
url
|
||||
}
|
||||
authorAssociation
|
||||
editor {
|
||||
@ -1093,6 +1094,7 @@ class GitHubPR:
|
||||
body_text=node["bodyText"],
|
||||
created_at=node["createdAt"] if "createdAt" in node else "",
|
||||
author_login=node["author"]["login"],
|
||||
author_url=node["author"].get("url", None),
|
||||
author_association=node["authorAssociation"],
|
||||
editor_login=editor["login"] if editor else None,
|
||||
database_id=node["databaseId"],
|
||||
@ -2029,6 +2031,11 @@ def validate_revert(
|
||||
# For some reason, one can not be a member of private repo, only CONTRIBUTOR
|
||||
if pr.is_base_repo_private():
|
||||
allowed_reverters.append("CONTRIBUTOR")
|
||||
# Special case the pytorch-auto-revert app, whose does not have association
|
||||
# But should be able to issue revert command
|
||||
if comment.author_url == "https://github.com/apps/pytorch-auto-revert":
|
||||
allowed_reverters.append("NONE")
|
||||
|
||||
if author_association not in allowed_reverters:
|
||||
raise PostCommentError(
|
||||
f"Will not revert as @{author_login} is not one of "
|
||||
|
2
.github/workflows/_linux-test.yml
vendored
2
.github/workflows/_linux-test.yml
vendored
@ -389,8 +389,6 @@ jobs:
|
||||
"${DOCKER_IMAGE}" \
|
||||
${DOCKER_SHELL_CMD}
|
||||
)
|
||||
# Propagate download.pytorch.org IP to container
|
||||
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
|
||||
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
|
||||
|
||||
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
|
||||
|
2
.github/workflows/h100-distributed.yml
vendored
2
.github/workflows/h100-distributed.yml
vendored
@ -37,7 +37,7 @@ jobs:
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: "linux.12xlarge"
|
||||
runner: "linux.c7i.12xlarge"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '9.0'
|
||||
|
@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: 15 0,12 * * 1-6
|
||||
- cron: 15 0 * * 1-6
|
||||
- cron: 0 7 * * 0
|
||||
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
|
||||
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
|
||||
|
7
.github/workflows/rocm-mi355.yml
vendored
7
.github/workflows/rocm-mi355.yml
vendored
@ -1,6 +1,9 @@
|
||||
name: rocm-mi355
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/rocm-mi355/*
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT
|
||||
@ -64,5 +67,7 @@ jobs:
|
||||
build-environment: linux-noble-rocm-py3.12-mi355
|
||||
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
|
||||
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
|
||||
tests-to-include: >-
|
||||
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor test_matmul_cuda test_scaled_matmul_cuda'
|
||||
|| '' }}
|
||||
secrets: inherit
|
||||
|
13
.github/workflows/trunk.yml
vendored
13
.github/workflows/trunk.yml
vendored
@ -56,7 +56,7 @@ jobs:
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
build-generates-artifacts: false
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: "linux.4xlarge"
|
||||
runner: "linux.c7i.4xlarge"
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 1 },
|
||||
@ -249,3 +249,14 @@ jobs:
|
||||
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-py3_10-gcc11-full-debug-build-only:
|
||||
name: linux-jammy-py3.10-gcc11-full-debug-build-only
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: linux.2xlarge.memory
|
||||
build-environment: linux-jammy-py3.10-gcc11-full-debug-build-only
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
|
||||
secrets: inherit
|
||||
|
4
.github/workflows/xpu.yml
vendored
4
.github/workflows/xpu.yml
vendored
@ -35,7 +35,7 @@ jobs:
|
||||
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
|
||||
build-environment: linux-jammy-xpu-n-1-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
|
||||
runner: linux.12xlarge
|
||||
runner: linux.c7i.12xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
@ -56,7 +56,7 @@ jobs:
|
||||
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
|
||||
build-environment: linux-jammy-xpu-n-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
|
||||
runner: linux.12xlarge
|
||||
runner: linux.c7i.12xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
|
@ -388,9 +388,9 @@ cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker
|
||||
|
||||
option(USE_MIMALLOC "Use mimalloc" OFF)
|
||||
# Enable third party mimalloc library to improve memory allocation performance
|
||||
# on Windows.
|
||||
# on Windows and AArch64.
|
||||
option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF)
|
||||
if(WIN32)
|
||||
if(WIN32 OR (CPU_AARCH64 AND NOT APPLE))
|
||||
set(USE_MIMALLOC ON)
|
||||
|
||||
# Not enable USE_MIMALLOC_ON_MKL due to it caused issue:
|
||||
|
@ -28,4 +28,19 @@ inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) {
|
||||
return stream << BlasBackendToString(backend);
|
||||
}
|
||||
|
||||
namespace blas {
|
||||
|
||||
enum class ScalingType : std::uint8_t {
|
||||
TensorWise, // fp32 scales
|
||||
RowWise, // fp32 scales
|
||||
BlockWise1x16, // fp8_e4m3fn scales
|
||||
BlockWise1x32, // fp8_e8m0fnu scales
|
||||
BlockWise1x128, // fp32 scales
|
||||
BlockWise128x128, // fp32 scales
|
||||
};
|
||||
|
||||
enum class SwizzleType : std::uint8_t { NO_SWIZZLE = 0, SWIZZLE_32_4_4 = 1 };
|
||||
|
||||
} // namespace blas
|
||||
|
||||
} // namespace at
|
||||
|
@ -144,8 +144,7 @@ inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
|
||||
inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
|
||||
checkDeviceType("CPU_tensor_apply", tensors, kCPU);
|
||||
checkLayout("CPU_tensor_apply", tensors, kStrided);
|
||||
if (!_all_equal_numel(tensors))
|
||||
TORCH_CHECK(false, _all_equal_numel_error(tensors));
|
||||
TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors));
|
||||
// An empty tensor has no elements
|
||||
for (auto& t : tensors)
|
||||
if (t.numel() == 0)
|
||||
|
@ -587,20 +587,33 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
|
||||
rocm_fa_preferred_backend = b;
|
||||
}
|
||||
|
||||
bool Context::allowFP16ReductionCuBLAS() const {
|
||||
CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const {
|
||||
return allow_fp16_reduction_cublas;
|
||||
}
|
||||
|
||||
void Context::setAllowFP16ReductionCuBLAS(bool b) {
|
||||
allow_fp16_reduction_cublas = b;
|
||||
CuBLASReductionOption inline get_reduction_option(bool allow_reduced_precision, bool allow_splitk) {
|
||||
TORCH_CHECK(
|
||||
!(allow_reduced_precision && !allow_splitk),
|
||||
"allow_splitk=False is not supported when reduced precision reductions are enabled");
|
||||
if (allow_reduced_precision) {
|
||||
return CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
|
||||
} else if (allow_splitk) {
|
||||
return CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK;
|
||||
} else {
|
||||
return CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK;
|
||||
}
|
||||
}
|
||||
|
||||
bool Context::allowBF16ReductionCuBLAS() const {
|
||||
void Context::setAllowFP16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
|
||||
allow_fp16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
|
||||
}
|
||||
|
||||
CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const {
|
||||
return allow_bf16_reduction_cublas;
|
||||
}
|
||||
|
||||
void Context::setAllowBF16ReductionCuBLAS(bool b) {
|
||||
allow_bf16_reduction_cublas = b;
|
||||
void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
|
||||
allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
|
||||
}
|
||||
|
||||
bool Context::allowFP16AccumulationCuBLAS() const {
|
||||
|
@ -38,6 +38,12 @@ namespace at {
|
||||
class Tensor;
|
||||
|
||||
enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };
|
||||
|
||||
enum class CuBLASReductionOption : uint8_t {
|
||||
AllowReducedPrecisionWithSplitK = 0,
|
||||
DisallowReducedPrecisionAllowSplitK = 1,
|
||||
DisallowReducedPrecisionDisallowSplitK = 2,
|
||||
};
|
||||
enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN };
|
||||
enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL };
|
||||
enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 };
|
||||
@ -220,15 +226,15 @@ class TORCH_API Context {
|
||||
bool userEnabledMkldnn() const;
|
||||
void setUserEnabledMkldnn(bool e);
|
||||
bool benchmarkCuDNN() const;
|
||||
void setBenchmarkCuDNN(bool);
|
||||
void setBenchmarkCuDNN(bool /*b*/);
|
||||
int benchmarkLimitCuDNN() const;
|
||||
void setBenchmarkLimitCuDNN(int);
|
||||
void setBenchmarkLimitCuDNN(int /*b*/);
|
||||
bool immediateMiopen() const;
|
||||
void setImmediateMiopen(bool);
|
||||
void setImmediateMiopen(bool /*b*/);
|
||||
bool deterministicCuDNN() const;
|
||||
void setDeterministicCuDNN(bool);
|
||||
void setDeterministicCuDNN(bool /*b*/);
|
||||
bool deterministicMkldnn() const;
|
||||
void setDeterministicMkldnn(bool);
|
||||
void setDeterministicMkldnn(bool /*b*/);
|
||||
bool userEnabledNNPACK() const;
|
||||
void setUserEnabledNNPACK(bool e);
|
||||
|
||||
@ -246,32 +252,32 @@ class TORCH_API Context {
|
||||
void setSDPPriorityOrder(const std::vector<int64_t>& order);
|
||||
std::array<at::SDPBackend, at::num_sdp_backends> sDPPriorityOrder();
|
||||
|
||||
void setSDPUseFlash(bool);
|
||||
void setSDPUseFlash(bool /*e*/);
|
||||
bool userEnabledFlashSDP() const;
|
||||
|
||||
void setSDPUseMemEfficient(bool);
|
||||
void setSDPUseMemEfficient(bool /*e*/);
|
||||
bool userEnabledMemEfficientSDP() const;
|
||||
|
||||
void setSDPUseMath(bool);
|
||||
void setSDPUseMath(bool /*e*/);
|
||||
bool userEnabledMathSDP() const;
|
||||
|
||||
void setSDPUseCuDNN(bool);
|
||||
void setSDPUseCuDNN(bool /*e*/);
|
||||
bool userEnabledCuDNNSDP() const;
|
||||
|
||||
void setAllowFP16BF16ReductionMathSDP(bool);
|
||||
void setAllowFP16BF16ReductionMathSDP(bool /*e*/);
|
||||
bool allowFP16BF16ReductionMathSDP() const;
|
||||
|
||||
void setSDPUseOverrideable(bool);
|
||||
void setSDPUseOverrideable(bool /*e*/);
|
||||
bool userEnabledOverrideableSDP() const;
|
||||
|
||||
at::LinalgBackend linalgPreferredBackend() const;
|
||||
void setLinalgPreferredBackend(at::LinalgBackend);
|
||||
void setLinalgPreferredBackend(at::LinalgBackend /*b*/);
|
||||
|
||||
at::BlasBackend blasPreferredBackend();
|
||||
void setBlasPreferredBackend(at::BlasBackend);
|
||||
void setBlasPreferredBackend(at::BlasBackend /*b*/);
|
||||
|
||||
at::ROCmFABackend getROCmFAPreferredBackend();
|
||||
void setROCmFAPreferredBackend(at::ROCmFABackend);
|
||||
void setROCmFAPreferredBackend(at::ROCmFABackend /*b*/);
|
||||
|
||||
// Note [Enabling Deterministic Operations]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -304,9 +310,9 @@ class TORCH_API Context {
|
||||
|
||||
bool deterministicAlgorithms() const;
|
||||
bool deterministicAlgorithmsWarnOnly() const;
|
||||
void setDeterministicAlgorithms(bool, bool);
|
||||
void setDeterministicAlgorithms(bool /*b*/, bool /*warn_only*/);
|
||||
bool deterministicFillUninitializedMemory() const;
|
||||
void setDeterministicFillUninitializedMemory(bool);
|
||||
void setDeterministicFillUninitializedMemory(bool /*b*/);
|
||||
|
||||
// Note [Writing Nondeterministic Operations]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -350,19 +356,23 @@ class TORCH_API Context {
|
||||
Float32Op op,
|
||||
Float32Precision p);
|
||||
bool allowTF32CuDNN(std::optional<Float32Op> op = std::nullopt) const;
|
||||
void setAllowTF32CuDNN(bool);
|
||||
void setAllowTF32CuDNN(bool /*b*/);
|
||||
bool allowTF32OneDNN() const;
|
||||
void setAllowTF32OneDNN(bool);
|
||||
void setAllowTF32OneDNN(bool /*b*/);
|
||||
bool allowTF32CuBLAS() const;
|
||||
void setAllowTF32CuBLAS(bool);
|
||||
void setAllowTF32CuBLAS(bool /*b*/);
|
||||
Float32MatmulPrecision float32MatmulPrecision() const;
|
||||
Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
|
||||
bool allowFP16ReductionCuBLAS() const;
|
||||
void setAllowFP16ReductionCuBLAS(bool);
|
||||
bool allowBF16ReductionCuBLAS() const;
|
||||
void setAllowBF16ReductionCuBLAS(bool);
|
||||
CuBLASReductionOption allowFP16ReductionCuBLAS() const;
|
||||
void setAllowFP16ReductionCuBLAS(
|
||||
bool allow_reduced_precision,
|
||||
bool allow_splitk = true);
|
||||
CuBLASReductionOption allowBF16ReductionCuBLAS() const;
|
||||
void setAllowBF16ReductionCuBLAS(
|
||||
bool allow_reduced_precision,
|
||||
bool allow_splitk = true);
|
||||
bool allowFP16AccumulationCuBLAS() const;
|
||||
void setAllowFP16AccumulationCuBLAS(bool);
|
||||
void setAllowFP16AccumulationCuBLAS(bool /*b*/);
|
||||
|
||||
// Matmuls can use a so-called "persistent" kernel which launches one CUDA
|
||||
// block for each SM on the GPU, and each block then iterates over multiple
|
||||
@ -374,7 +384,7 @@ class TORCH_API Context {
|
||||
// to make matmuls target only a subset of the SMs, so they can fully schedule
|
||||
// even next to a comms kernel, and only be a few percent slower.
|
||||
std::optional<int32_t> _SMCarveout_EXPERIMENTAL() const;
|
||||
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t>);
|
||||
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t> /*c*/);
|
||||
|
||||
at::QEngine qEngine() const;
|
||||
void setQEngine(at::QEngine e);
|
||||
@ -395,7 +405,7 @@ class TORCH_API Context {
|
||||
void setDefaultMobileCPUAllocator();
|
||||
void unsetDefaultMobileCPUAllocator();
|
||||
bool allowFP16ReductionCPU() const;
|
||||
void setAllowFP16ReductionCPU(bool);
|
||||
void setAllowFP16ReductionCPU(bool /*b*/);
|
||||
|
||||
// Preserved for BC
|
||||
void lazyInitCUDA() {
|
||||
@ -452,8 +462,10 @@ class TORCH_API Context {
|
||||
: at::Float32MatmulPrecision::HIGHEST;
|
||||
int benchmark_limit_cudnn = 10;
|
||||
bool allow_tf32_cudnn = true;
|
||||
bool allow_fp16_reduction_cublas = true;
|
||||
bool allow_bf16_reduction_cublas = true;
|
||||
CuBLASReductionOption allow_fp16_reduction_cublas =
|
||||
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
|
||||
CuBLASReductionOption allow_bf16_reduction_cublas =
|
||||
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
|
||||
bool allow_fp16_accumulation_cublas = false;
|
||||
std::optional<int32_t> sm_carveout = std::nullopt;
|
||||
bool enabled_mkldnn = true;
|
||||
|
@ -16,8 +16,8 @@ inline void check_size_nonnegative(ArrayRef<int64_t> size) {
|
||||
|
||||
inline void check_size_nonnegative(ArrayRef<c10::SymInt> size) {
|
||||
for (const auto& x : size) {
|
||||
TORCH_CHECK(
|
||||
x.expect_size(__FILE__, __LINE__),
|
||||
TORCH_SYM_CHECK(
|
||||
x.sym_ge(0),
|
||||
"Trying to create tensor with negative dimension ",
|
||||
x,
|
||||
": ",
|
||||
|
@ -62,7 +62,7 @@ constexpr const char* unknown_eventname = "eventname not specified";
|
||||
#endif
|
||||
} // namespace (anonymous)
|
||||
|
||||
MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags, size_t size)
|
||||
MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, int flags, size_t size)
|
||||
: filename_(filename.empty() ? unknown_filename : filename)
|
||||
, size_(0) // to be filled later
|
||||
#ifdef _WIN32
|
||||
@ -494,7 +494,7 @@ RefcountedMapAllocator::RefcountedMapAllocator(const char *filename, int flags,
|
||||
|
||||
initializeAlloc();
|
||||
}
|
||||
RefcountedMapAllocator::RefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size)
|
||||
RefcountedMapAllocator::RefcountedMapAllocator(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size)
|
||||
: RefcountedMapAllocatorArgCheck(flags)
|
||||
, MapAllocator(WITH_FD, filename, flags, fd, size + map_alloc_alignment) {
|
||||
|
||||
@ -614,7 +614,7 @@ at::DataPtr MapAllocator::makeDataPtr(std::string_view filename, int flags, size
|
||||
return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU};
|
||||
}
|
||||
|
||||
at::DataPtr MapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
|
||||
at::DataPtr MapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
|
||||
auto* context = new MapAllocator(WITH_FD, filename, fd, flags, size);
|
||||
if (actual_size_out) *actual_size_out = context->size();
|
||||
return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU};
|
||||
@ -626,7 +626,7 @@ at::DataPtr RefcountedMapAllocator::makeDataPtr(const char *filename, int flags,
|
||||
return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU};
|
||||
}
|
||||
|
||||
at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
|
||||
at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
|
||||
auto* context = new RefcountedMapAllocator(WITH_FD, filename, fd, flags, size);
|
||||
if (actual_size_out) *actual_size_out = context->size() - map_alloc_alignment;
|
||||
return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU};
|
||||
|
@ -25,7 +25,7 @@ class TORCH_API MapAllocator {
|
||||
public:
|
||||
MapAllocator(std::string_view filename, int flags, size_t size);
|
||||
MapAllocator(
|
||||
WithFd,
|
||||
WithFd /*unused*/,
|
||||
std::string_view filename,
|
||||
int fd,
|
||||
int flags,
|
||||
@ -59,14 +59,14 @@ class TORCH_API MapAllocator {
|
||||
return flags_;
|
||||
}
|
||||
|
||||
static MapAllocator* fromDataPtr(const at::DataPtr&);
|
||||
static MapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/);
|
||||
static at::DataPtr makeDataPtr(
|
||||
std::string_view filename,
|
||||
int flags,
|
||||
size_t size,
|
||||
size_t* actual_size_out);
|
||||
static at::DataPtr makeDataPtr(
|
||||
WithFd,
|
||||
WithFd /*unused*/,
|
||||
const char* filename,
|
||||
int fd,
|
||||
int flags,
|
||||
@ -105,13 +105,13 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
|
||||
public:
|
||||
RefcountedMapAllocator(const char* filename, int flags, size_t size);
|
||||
RefcountedMapAllocator(
|
||||
WithFd,
|
||||
WithFd /*unused*/,
|
||||
const char* filename,
|
||||
int fd,
|
||||
int flags,
|
||||
size_t size);
|
||||
|
||||
static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&);
|
||||
static RefcountedMapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/);
|
||||
RefcountedMapAllocator(const RefcountedMapAllocator&) = delete;
|
||||
RefcountedMapAllocator(RefcountedMapAllocator&&) = delete;
|
||||
RefcountedMapAllocator& operator=(const RefcountedMapAllocator&) = delete;
|
||||
@ -122,7 +122,7 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
|
||||
size_t size,
|
||||
size_t* actual_size_out);
|
||||
static at::DataPtr makeDataPtr(
|
||||
WithFd,
|
||||
WithFd /*unused*/,
|
||||
const char* filename,
|
||||
int fd,
|
||||
int flags,
|
||||
|
@ -273,7 +273,7 @@ c10::SymInt NestedTensorImpl::sym_numel_custom() const {
|
||||
return NestedTensorImpl::numel_custom();
|
||||
}
|
||||
|
||||
c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat) const {
|
||||
c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const {
|
||||
return nested_tensor_impl_is_contiguous(this);
|
||||
}
|
||||
IntArrayRef NestedTensorImpl::sizes_custom() const {
|
||||
|
@ -115,7 +115,8 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
|
||||
// with real implementations
|
||||
int64_t numel_custom() const override;
|
||||
c10::SymInt sym_numel_custom() const override;
|
||||
c10::SymBool sym_is_contiguous_custom(MemoryFormat) const override;
|
||||
c10::SymBool sym_is_contiguous_custom(
|
||||
MemoryFormat /*memory_format*/) const override;
|
||||
int64_t size_custom(int64_t d) const override {
|
||||
return this->size(d);
|
||||
}
|
||||
|
@ -14,7 +14,7 @@ inline int64_t divup(int64_t x, int64_t y) {
|
||||
TORCH_API void init_num_threads();
|
||||
|
||||
// Sets the number of threads to be used in parallel region
|
||||
TORCH_API void set_num_threads(int);
|
||||
TORCH_API void set_num_threads(int /*nthreads*/);
|
||||
|
||||
// Returns the maximum number of threads that may be used in a parallel region
|
||||
TORCH_API int get_num_threads();
|
||||
@ -37,7 +37,7 @@ inline void lazy_init_num_threads() {
|
||||
}
|
||||
}
|
||||
|
||||
TORCH_API void set_thread_num(int);
|
||||
TORCH_API void set_thread_num(int /*id*/);
|
||||
|
||||
class TORCH_API ThreadIdGuard {
|
||||
public:
|
||||
@ -130,7 +130,7 @@ inline scalar_t parallel_reduce(
|
||||
TORCH_API std::string get_parallel_info();
|
||||
|
||||
// Sets number of threads used for inter-op parallelism
|
||||
TORCH_API void set_num_interop_threads(int);
|
||||
TORCH_API void set_num_interop_threads(int /*nthreads*/);
|
||||
|
||||
// Returns the number of threads used for inter-op parallelism
|
||||
TORCH_API size_t get_num_interop_threads();
|
||||
|
@ -252,7 +252,7 @@ void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) {
|
||||
void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) {
|
||||
TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset.");
|
||||
}
|
||||
c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat) const {
|
||||
c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const {
|
||||
TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous");
|
||||
}
|
||||
} // namespace at
|
||||
|
@ -32,10 +32,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
|
||||
|
||||
public:
|
||||
explicit SparseCsrTensorImpl(
|
||||
at::DispatchKeySet,
|
||||
at::DispatchKeySet /*key_set*/,
|
||||
at::Device device,
|
||||
Layout layout,
|
||||
const caffe2::TypeMeta);
|
||||
const caffe2::TypeMeta /*data_type*/);
|
||||
|
||||
void resize_(int64_t nnz, IntArrayRef size);
|
||||
void resize_and_clear_(
|
||||
@ -86,7 +86,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
|
||||
protected:
|
||||
IntArrayRef strides_custom() const override;
|
||||
SymIntArrayRef sym_strides_custom() const override;
|
||||
SymBool sym_is_contiguous_custom(MemoryFormat) const override;
|
||||
SymBool sym_is_contiguous_custom(
|
||||
MemoryFormat /*memory_format*/) const override;
|
||||
|
||||
public:
|
||||
void set_size(int64_t dim, int64_t new_size) override;
|
||||
|
@ -46,7 +46,9 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
|
||||
|
||||
public:
|
||||
// Public for now...
|
||||
explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta);
|
||||
explicit SparseTensorImpl(
|
||||
at::DispatchKeySet /*key_set*/,
|
||||
const caffe2::TypeMeta /*data_type*/);
|
||||
|
||||
void release_resources() override;
|
||||
|
||||
@ -384,8 +386,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
|
||||
|
||||
private:
|
||||
explicit SparseTensorImpl(
|
||||
at::DispatchKeySet,
|
||||
const caffe2::TypeMeta,
|
||||
at::DispatchKeySet /*key_set*/,
|
||||
const caffe2::TypeMeta /*data_type*/,
|
||||
at::Tensor indices,
|
||||
at::Tensor values);
|
||||
|
||||
|
@ -112,10 +112,10 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice);
|
||||
// `torch.tensor([1, 2])`) | `torch::tensor({1, 2})`
|
||||
struct TORCH_API TensorIndex final {
|
||||
// Case 1: `at::indexing::None`
|
||||
TensorIndex(std::nullopt_t) : type_(TensorIndexType::None) {}
|
||||
TensorIndex(std::nullopt_t /*unused*/) : type_(TensorIndexType::None) {}
|
||||
|
||||
// Case 2: "..." / `at::indexing::Ellipsis`
|
||||
TensorIndex(at::indexing::EllipsisIndexType)
|
||||
TensorIndex(at::indexing::EllipsisIndexType /*unused*/)
|
||||
: type_(TensorIndexType::Ellipsis) {}
|
||||
TensorIndex(const char* str) : TensorIndex(at::indexing::Ellipsis) {
|
||||
TORCH_CHECK_VALUE(
|
||||
|
@ -250,7 +250,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
|
||||
using PtrVector = SmallVector<char*, 4>;
|
||||
using StrideVector = SmallVector<int64_t, 6>;
|
||||
|
||||
void build(TensorIteratorConfig&);
|
||||
void build(TensorIteratorConfig& /*config*/);
|
||||
|
||||
// The inner-loop function operates on the fastest moving dimension. It
|
||||
// implements element-wise operations in terms of 1-d strided tensors.
|
||||
@ -618,20 +618,20 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
|
||||
#undef TORCH_DISALLOW_TEMPORARIES
|
||||
protected:
|
||||
// Mutable reference as it moves tensors out of TensorIteratorConfig
|
||||
void populate_operands(TensorIteratorConfig&);
|
||||
void populate_operands(TensorIteratorConfig& /*config*/);
|
||||
void mark_outputs();
|
||||
void mark_resize_outputs(const TensorIteratorConfig&);
|
||||
void compute_mem_overlaps(const TensorIteratorConfig&);
|
||||
void compute_shape(const TensorIteratorConfig&);
|
||||
void compute_strides(const TensorIteratorConfig&);
|
||||
void mark_resize_outputs(const TensorIteratorConfig& /*config*/);
|
||||
void compute_mem_overlaps(const TensorIteratorConfig& /*config*/);
|
||||
void compute_shape(const TensorIteratorConfig& /*config*/);
|
||||
void compute_strides(const TensorIteratorConfig& /*config*/);
|
||||
void reorder_dimensions();
|
||||
void permute_dimensions(IntArrayRef perm);
|
||||
void compute_types(const TensorIteratorConfig&);
|
||||
void compute_types(const TensorIteratorConfig& /*config*/);
|
||||
ScalarType compute_common_dtype();
|
||||
void allocate_or_resize_outputs();
|
||||
bool fast_set_up(const TensorIteratorConfig&);
|
||||
FastSetupType compute_fast_setup_type(const TensorIteratorConfig&);
|
||||
void compute_names(const TensorIteratorConfig&);
|
||||
bool fast_set_up(const TensorIteratorConfig& /*config*/);
|
||||
FastSetupType compute_fast_setup_type(const TensorIteratorConfig& /*config*/);
|
||||
void compute_names(const TensorIteratorConfig& /*config*/);
|
||||
void propagate_names_to_outputs();
|
||||
void coalesce_dimensions();
|
||||
|
||||
|
@ -20,7 +20,7 @@
|
||||
|
||||
namespace at {
|
||||
|
||||
TORCH_API int _crash_if_asan(int);
|
||||
TORCH_API int _crash_if_asan(int /*arg*/);
|
||||
|
||||
// Converts a TensorList (i.e. ArrayRef<Tensor> to vector of TensorImpl*)
|
||||
// NB: This is ONLY used by legacy TH bindings, and ONLY used by cat.
|
||||
|
@ -103,9 +103,7 @@ std::string get_cpu_capability() {
|
||||
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
|
||||
case native::CPUCapability::ZVECTOR:
|
||||
return "Z VECTOR";
|
||||
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
|
||||
case native::CPUCapability::SVE128:
|
||||
return "SVE128";
|
||||
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
|
||||
case native::CPUCapability::SVE256:
|
||||
return "SVE256";
|
||||
#else
|
||||
|
@ -148,7 +148,7 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_
|
||||
Banned functions
|
||||
*******************************/
|
||||
|
||||
static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional<Tensor>&, int64_t) {
|
||||
static Tensor binary_cross_entropy_banned(const Tensor & /*unused*/, const Tensor & /*unused*/, const std::optional<Tensor>& /*unused*/, int64_t /*unused*/) {
|
||||
TORCH_CHECK(false, "torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n"
|
||||
"Many models use a sigmoid layer right before the binary cross entropy layer.\n"
|
||||
"In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n"
|
||||
|
@ -27,11 +27,11 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
|
||||
HasNonWildcard
|
||||
};
|
||||
|
||||
explicit NamedTensorMeta(HAS_NON_WILDCARD, DimnameList names)
|
||||
explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, DimnameList names)
|
||||
: names_(names.vec()) {
|
||||
check_invariants();
|
||||
}
|
||||
explicit NamedTensorMeta(HAS_NON_WILDCARD, std::vector<Dimname>&& names)
|
||||
explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, std::vector<Dimname>&& names)
|
||||
: names_(std::move(names)) {
|
||||
check_invariants();
|
||||
}
|
||||
@ -52,13 +52,13 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
|
||||
std::any_of(names_.begin(), names_.end(), [](const Dimname& n) { return !n.isWildcard(); }));
|
||||
}
|
||||
|
||||
void set_names(HAS_NON_WILDCARD, DimnameList new_names) {
|
||||
void set_names(HAS_NON_WILDCARD /*unused*/, DimnameList new_names) {
|
||||
TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
|
||||
std::copy(new_names.begin(), new_names.end(), names_.begin());
|
||||
check_invariants();
|
||||
}
|
||||
|
||||
void set_names(HAS_NON_WILDCARD, std::vector<Dimname>&& new_names) {
|
||||
void set_names(HAS_NON_WILDCARD /*unused*/, std::vector<Dimname>&& new_names) {
|
||||
TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
|
||||
names_ = std::move(new_names);
|
||||
check_invariants();
|
||||
|
@ -13,7 +13,7 @@ class TORCH_API PythonOpRegistrationTrampoline final {
|
||||
public:
|
||||
// Returns true if you successfully registered yourself (that means
|
||||
// you are in the hot seat for doing the operator registrations!)
|
||||
static bool registerInterpreter(c10::impl::PyInterpreter*);
|
||||
static bool registerInterpreter(c10::impl::PyInterpreter* /*interp*/);
|
||||
|
||||
// Returns nullptr if no interpreter has been registered yet.
|
||||
static c10::impl::PyInterpreter* getInterpreter();
|
||||
|
@ -173,4 +173,12 @@ unsigned TensorBase::_register_hook(std::function<TensorBase(const TensorBase&)>
|
||||
return impl::GetVariableHooks()->_register_hook(*this, std::move(hook));
|
||||
}
|
||||
|
||||
std::optional<ScalarType> TensorBase::grad_dtype() const {
|
||||
return impl::GetVariableHooks()->grad_dtype(*this);
|
||||
}
|
||||
|
||||
void TensorBase::set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const {
|
||||
return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype);
|
||||
}
|
||||
|
||||
} // namespace at
|
||||
|
@ -100,7 +100,7 @@ class TORCH_API TensorBase {
|
||||
// Create a Tensor with a +0 reference count. Special care must be
|
||||
// taken to avoid decrementing this reference count at destruction
|
||||
// time. Intended to support MaybeOwnedTraits<Tensor>.
|
||||
explicit TensorBase(unsafe_borrow_t, const TensorBase& rhs)
|
||||
explicit TensorBase(unsafe_borrow_t /*unused*/, const TensorBase& rhs)
|
||||
: impl_(c10::intrusive_ptr<at::TensorImpl, UndefinedTensorImpl>(rhs.impl_.get(), c10::raw::DontIncreaseRefcount{})) {}
|
||||
friend MaybeOwnedTraits<TensorBase>;
|
||||
|
||||
@ -930,6 +930,10 @@ public:
|
||||
|
||||
const TensorBase& requires_grad_(bool _requires_grad=true) const;
|
||||
|
||||
std::optional<ScalarType> grad_dtype() const;
|
||||
|
||||
void set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const;
|
||||
|
||||
// View Variables
|
||||
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -950,7 +954,7 @@ protected:
|
||||
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
|
||||
|
||||
private:
|
||||
TensorBase __dispatch_contiguous(c10::MemoryFormat) const;
|
||||
TensorBase __dispatch_contiguous(c10::MemoryFormat /*memory_format*/) const;
|
||||
};
|
||||
|
||||
inline DeviceIndex get_device(const TensorBase& self) {
|
||||
|
@ -68,6 +68,8 @@ struct TORCH_API VariableHooksInterface {
|
||||
const c10::OperatorHandle& op,
|
||||
c10::DispatchKeySet dispatch_keys,
|
||||
torch::jit::Stack* stack) const = 0;
|
||||
virtual std::optional<c10::ScalarType> grad_dtype(const TensorBase&) const = 0;
|
||||
virtual void set_grad_dtype(const TensorBase&, const std::optional<c10::ScalarType>&) const = 0;
|
||||
};
|
||||
|
||||
TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);
|
||||
|
@ -18,10 +18,10 @@ class KernelFunction;
|
||||
// implementation notes; notably, this does NOT actually go through the
|
||||
// boxing/unboxing codepath.
|
||||
TORCH_API void fallthrough_kernel(
|
||||
OperatorKernel*,
|
||||
const OperatorHandle&,
|
||||
DispatchKeySet,
|
||||
Stack*);
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& /*unused*/,
|
||||
DispatchKeySet /*unused*/,
|
||||
Stack* /*unused*/);
|
||||
|
||||
// Note [Ambiguity in AutogradOther kernel]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -62,10 +62,10 @@ TORCH_API void fallthrough_kernel(
|
||||
// than arbitrarily pick one or the other, we just register a kernel that raises
|
||||
// an error and let the user decide how to proceed.
|
||||
TORCH_API void ambiguous_autogradother_kernel(
|
||||
OperatorKernel*,
|
||||
const OperatorHandle&,
|
||||
DispatchKeySet,
|
||||
Stack*);
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& /*op*/,
|
||||
DispatchKeySet /*unused*/,
|
||||
Stack* /*unused*/);
|
||||
|
||||
// Note [named_not_supported_kernel]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -75,10 +75,10 @@ TORCH_API void ambiguous_autogradother_kernel(
|
||||
// give a good error message in cases when boxing is not supported). When
|
||||
// boxing is universally supported this can be removed.
|
||||
[[noreturn]] TORCH_API void named_not_supported_kernel(
|
||||
OperatorKernel*,
|
||||
const OperatorHandle&,
|
||||
DispatchKeySet,
|
||||
Stack*);
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& /*op*/,
|
||||
DispatchKeySet /*unused*/,
|
||||
Stack* /*unused*/);
|
||||
|
||||
/**
|
||||
* BoxedKernel is similar to a std::function storing a boxed kernel.
|
||||
@ -185,16 +185,16 @@ class TORCH_API BoxedKernel final {
|
||||
|
||||
template <BoxedKernelFunction* func>
|
||||
static void make_boxed_function(
|
||||
OperatorKernel*,
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& opHandle,
|
||||
DispatchKeySet,
|
||||
DispatchKeySet /*unused*/,
|
||||
Stack* stack);
|
||||
|
||||
template <BoxedKernelFunction_withDispatchKeys* func>
|
||||
static void make_boxed_function(
|
||||
OperatorKernel*,
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& opHandle,
|
||||
DispatchKeySet,
|
||||
DispatchKeySet /*ks*/,
|
||||
Stack* stack);
|
||||
|
||||
explicit BoxedKernel(
|
||||
|
@ -11,9 +11,9 @@ inline BoxedKernel::BoxedKernel(
|
||||
|
||||
template <BoxedKernel::BoxedKernelFunction* func>
|
||||
inline void BoxedKernel::make_boxed_function(
|
||||
OperatorKernel*,
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& opHandle,
|
||||
DispatchKeySet,
|
||||
DispatchKeySet /*unused*/,
|
||||
Stack* stack) {
|
||||
// Note that we're dropping the DispatchKeySet argument.
|
||||
// See Note [Plumbing Keys Through The Dispatcher 2] for details.
|
||||
@ -22,7 +22,7 @@ inline void BoxedKernel::make_boxed_function(
|
||||
|
||||
template <BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
|
||||
inline void BoxedKernel::make_boxed_function(
|
||||
OperatorKernel*,
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& opHandle,
|
||||
DispatchKeySet ks,
|
||||
Stack* stack) {
|
||||
|
@ -10,7 +10,7 @@ namespace c10 {
|
||||
// be handled specially. Its semantics is that it redispatches to the
|
||||
// *next* dispatch key that would have been processed, skipping the current
|
||||
// one.
|
||||
void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*) {
|
||||
void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unused*/, DispatchKeySet /*unused*/, Stack* /*unused*/) {
|
||||
TORCH_INTERNAL_ASSERT(0,
|
||||
"fallthrough_kernel was executed but it should have been short-circuited by the dispatcher. "
|
||||
"This could occur if you registered a fallthrough kernel as a override for a specific operator "
|
||||
@ -19,7 +19,7 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet,
|
||||
"let us know in the bug tracker.");
|
||||
}
|
||||
|
||||
void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) {
|
||||
void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) {
|
||||
TORCH_INTERNAL_ASSERT(0,
|
||||
op.operator_name(), " has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. "
|
||||
"This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering "
|
||||
@ -32,7 +32,7 @@ void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, D
|
||||
"\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n");
|
||||
}
|
||||
|
||||
void named_not_supported_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) {
|
||||
void named_not_supported_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) {
|
||||
// DO NOT LOOK AT STACK, YOU HAVE SHORT CIRCUITED BOXING
|
||||
// See Note [named_not_supported_kernel]
|
||||
TORCH_CHECK(0,
|
||||
|
@ -229,7 +229,7 @@ class TORCH_API KernelFunction final {
|
||||
* &unboxed_func>();
|
||||
*/
|
||||
template <class FuncPtr, bool AllowLegacyTypes = false>
|
||||
static KernelFunction makeFromUnboxedFunction(FuncPtr);
|
||||
static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/);
|
||||
|
||||
/**
|
||||
* Create a KernelFunction from an unboxed function.
|
||||
@ -271,7 +271,7 @@ class TORCH_API KernelFunction final {
|
||||
|
||||
std::string dumpState() const;
|
||||
// For testing internal invariants only
|
||||
bool _equalsBoxedAndUnboxed(const KernelFunction&) const;
|
||||
bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const;
|
||||
|
||||
// Register a token to be invalidated when this KernelFunction is destroyed
|
||||
void registerToken(std::weak_ptr<KernelToken> token) const;
|
||||
|
@ -131,7 +131,7 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(
|
||||
new (dest++) IValue(options.pinned_memory());
|
||||
}
|
||||
|
||||
inline void boxArgsToStack(IValue*&) {}
|
||||
inline void boxArgsToStack(IValue*& /*unused*/) {}
|
||||
|
||||
template <typename T, typename... Args>
|
||||
C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack(
|
||||
@ -185,7 +185,7 @@ struct PopResult<std::tuple<Types...>> final {
|
||||
template <size_t... indices>
|
||||
static Result pop_to_tuple_impl(
|
||||
Stack& stack,
|
||||
std::index_sequence<indices...>) {
|
||||
std::index_sequence<indices...> /*unused*/) {
|
||||
return std::make_tuple((std::move(stack[indices]).template to<Types>())...);
|
||||
}
|
||||
};
|
||||
|
@ -561,7 +561,7 @@ struct wrap_kernel_functor_unboxed_<
|
||||
// doesn't use &&
|
||||
static ReturnType call(
|
||||
OperatorKernel* functor,
|
||||
DispatchKeySet,
|
||||
DispatchKeySet /*unused*/,
|
||||
ParameterTypes... args) {
|
||||
KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
|
||||
// Note [Plumbing Keys Through The Dispatcher 2]
|
||||
@ -629,8 +629,8 @@ call_functor_with_args_from_stack_(
|
||||
OperatorKernel* functor,
|
||||
DispatchKeySet dispatchKeySet,
|
||||
Stack* stack,
|
||||
std::index_sequence<ivalue_arg_indices...>,
|
||||
guts::typelist::typelist<ArgTypes...>*) {
|
||||
std::index_sequence<ivalue_arg_indices...> /*unused*/,
|
||||
guts::typelist::typelist<ArgTypes...>* /*unused*/) {
|
||||
(void)(stack); // when sizeof...(ivalue_arg_indices) == 0, this argument would
|
||||
// be unused and we have to silence the compiler warning.
|
||||
|
||||
@ -708,7 +708,7 @@ struct push_outputs<std::tuple<OutputTypes...>, AllowDeprecatedTypes> final {
|
||||
static void call_(
|
||||
std::tuple<OutputTypes...>&& output,
|
||||
Stack* stack,
|
||||
std::index_sequence<indices...>) {
|
||||
std::index_sequence<indices...> /*unused*/) {
|
||||
torch::jit::push(
|
||||
*stack,
|
||||
return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::call(
|
||||
@ -718,7 +718,7 @@ struct push_outputs<std::tuple<OutputTypes...>, AllowDeprecatedTypes> final {
|
||||
static void copy_(
|
||||
const std::tuple<OutputTypes...>& output,
|
||||
Stack* stack,
|
||||
std::index_sequence<indices...>) {
|
||||
std::index_sequence<indices...> /*unused*/) {
|
||||
torch::jit::push(
|
||||
*stack,
|
||||
return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::copy(
|
||||
@ -741,7 +741,7 @@ struct make_boxed_from_unboxed_functor final {
|
||||
|
||||
static void call(
|
||||
OperatorKernel* functor,
|
||||
const OperatorHandle&,
|
||||
const OperatorHandle& /*unused*/,
|
||||
DispatchKeySet dispatchKeySet,
|
||||
Stack* stack) {
|
||||
using ReturnType =
|
||||
|
@ -63,13 +63,13 @@ struct BuiltinOpFunction : public Function {
|
||||
|
||||
bool call(
|
||||
Stack& stack,
|
||||
std::optional<size_t>,
|
||||
c10::function_ref<void(const Code&)>) override {
|
||||
std::optional<size_t> /*unused*/,
|
||||
c10::function_ref<void(const Code&)> /*unused*/) override {
|
||||
run(stack);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)>)
|
||||
bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)> /*unused*/)
|
||||
override {
|
||||
run(stack);
|
||||
return false;
|
||||
|
@ -80,7 +80,8 @@ struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
|
||||
ts = ts | x.key_set();
|
||||
}
|
||||
}
|
||||
[[noreturn]] void operator()(at::ArrayRef<std::optional<at::Tensor>>) {
|
||||
[[noreturn]] void operator()(
|
||||
at::ArrayRef<std::optional<at::Tensor>> /*unused*/) {
|
||||
// Just checking that the handling of Tensor?[] didn't change.
|
||||
TORCH_INTERNAL_ASSERT(false);
|
||||
}
|
||||
@ -95,7 +96,7 @@ struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void operator()(const T&) {
|
||||
void operator()(const T& /*unused*/) {
|
||||
// do nothing
|
||||
}
|
||||
};
|
||||
|
@ -634,7 +634,7 @@ class TypedOperatorHandle<Return(Args...)> final : public OperatorHandle {
|
||||
|
||||
namespace detail {
|
||||
template <class... Args>
|
||||
inline void unused_arg_(const Args&...) {}
|
||||
inline void unused_arg_(const Args&... /*unused*/) {}
|
||||
|
||||
// CaptureKernelCall is intended to capture return values from Dispatcher
|
||||
// unboxed kernel calls. A record function may request to get outputs from the
|
||||
|
@ -105,7 +105,7 @@ class TORCH_API OperatorEntry final {
|
||||
// versa that is an error. (Refcounting for the registrations is
|
||||
// handled in the OperatorHandle in Dispatcher)
|
||||
void registerSchema(
|
||||
FunctionSchema&&,
|
||||
FunctionSchema&& /*schema*/,
|
||||
std::string&& debug,
|
||||
std::vector<at::Tag> tags = {});
|
||||
void deregisterSchema();
|
||||
|
@ -177,7 +177,7 @@ bool DynamicType::equals(const Type& rhs) const {
|
||||
return equals(*create(rhs));
|
||||
}
|
||||
|
||||
bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream*) const {
|
||||
bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream* /*why_not*/) const {
|
||||
auto other = create(rhs);
|
||||
if (tag_ == other->tag_) {
|
||||
if (equals(*other)) {
|
||||
@ -371,7 +371,7 @@ DynamicTypePtr ivalue::TupleTypeFactory<c10::DynamicType>::create(
|
||||
}
|
||||
|
||||
DynamicTypePtr ivalue::TupleTypeFactory<c10::DynamicType>::fallback(
|
||||
const Type&) {
|
||||
const Type& /*unused*/) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
|
||||
return nullptr;
|
||||
}
|
||||
|
@ -138,8 +138,8 @@ class DynamicType : public SharedType {
|
||||
|
||||
struct Arguments {
|
||||
Arguments() = default;
|
||||
Arguments(c10::ArrayRef<TypePtr>);
|
||||
Arguments(const std::vector<std::string_view>&, c10::ArrayRef<TypePtr>);
|
||||
Arguments(c10::ArrayRef<TypePtr> /*args*/);
|
||||
Arguments(const std::vector<std::string_view>& /*names*/, c10::ArrayRef<TypePtr> /*args*/);
|
||||
std::vector<LabeledDynamicType> elems;
|
||||
};
|
||||
|
||||
@ -156,15 +156,15 @@ class DynamicType : public SharedType {
|
||||
static const TypeKind Kind = TypeKind::DynamicType;
|
||||
static TORCH_API DynamicTypePtr create(Type& ty);
|
||||
|
||||
explicit DynamicType(Tag, Arguments);
|
||||
explicit DynamicType(Tag, std::string_view, Arguments);
|
||||
explicit DynamicType(Tag /*tag*/, Arguments /*arguments*/);
|
||||
explicit DynamicType(Tag /*tag*/, std::string_view /*name*/, Arguments /*arguments*/);
|
||||
|
||||
DynamicType(DynamicType&& other) = delete;
|
||||
DynamicType(const DynamicType&) = delete;
|
||||
DynamicType& operator=(const DynamicType&) = delete;
|
||||
DynamicType& operator=(DynamicType&&) = delete;
|
||||
|
||||
TypePtr containedType(size_t) const override;
|
||||
TypePtr containedType(size_t /*i*/) const override;
|
||||
size_t containedTypeSize() const override;
|
||||
Tag tag() const {
|
||||
return tag_;
|
||||
|
@ -96,15 +96,15 @@ struct TORCH_API Function {
|
||||
// Overload for server interpreter, a bailout size is needed for graph
|
||||
// executor.
|
||||
virtual bool call(
|
||||
Stack&,
|
||||
std::optional<size_t>,
|
||||
c10::function_ref<void(const Code&)>) {
|
||||
Stack& /*unused*/,
|
||||
std::optional<size_t> /*unused*/,
|
||||
c10::function_ref<void(const Code&)> /*unused*/) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Overload for mobile interpreter.
|
||||
virtual bool call(Stack&, c10::function_ref<void(const mobile::Code&)>) {
|
||||
virtual bool call(Stack& /*unused*/, c10::function_ref<void(const mobile::Code&)> /*unused*/) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
|
||||
return false;
|
||||
}
|
||||
|
@ -847,7 +847,7 @@ struct TORCH_API IValue final {
|
||||
IValue(std::optional<T> v);
|
||||
template <class T, enable_if_list_is_ivalue_constructible<T> = nullptr>
|
||||
IValue(c10::OptionalArrayRef<T> v);
|
||||
IValue(std::nullopt_t);
|
||||
IValue(std::nullopt_t /*unused*/);
|
||||
|
||||
// ClassType
|
||||
IValue(c10::intrusive_ptr<ivalue::Object> v);
|
||||
|
@ -660,7 +660,7 @@ struct TORCH_API TupleTypeFactory<TupleType> {
|
||||
template <>
|
||||
struct TORCH_API TupleTypeFactory<c10::DynamicType> {
|
||||
static DynamicTypePtr create(const std::vector<TypePtr>& elemTypes);
|
||||
static DynamicTypePtr fallback(const Type&);
|
||||
static DynamicTypePtr fallback(const Type& /*unused*/);
|
||||
};
|
||||
|
||||
struct TORCH_API Tuple : c10::intrusive_ptr_target {
|
||||
@ -1682,7 +1682,7 @@ struct ivalue::EnumHolder : c10::intrusive_ptr_target {
|
||||
namespace detail {
|
||||
|
||||
struct _guarded_unsigned_long_unique_dummy final {
|
||||
_guarded_unsigned_long_unique_dummy(int64_t){}
|
||||
_guarded_unsigned_long_unique_dummy(int64_t /*unused*/){}
|
||||
};
|
||||
using _guarded_unsigned_long = std::conditional_t<
|
||||
std::is_same_v<unsigned long, uint32_t> ||
|
||||
@ -1776,7 +1776,7 @@ template <class Elem>
|
||||
// native_functions.yaml still return std::vector.
|
||||
// C10_DEPRECATED_MESSAGE("IValues based on std::vector<T> are potentially slow
|
||||
// and deprecated. Please use torch::List<T> instead.")
|
||||
std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>>) {
|
||||
std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>> /*unused*/) {
|
||||
// We need to do a deep copy of the vector because there might be other
|
||||
// references to this same IValue that also use the list. We can't just
|
||||
// move the elements out.
|
||||
@ -1826,18 +1826,18 @@ c10::intrusive_ptr<T> IValue::toCustomClass() const& {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T generic_to(IValue ivalue, _fake_type<T>) {
|
||||
T generic_to(IValue ivalue, _fake_type<T> /*unused*/) {
|
||||
using ElemType = typename std::remove_pointer<T>::type::element_type;
|
||||
return std::move(ivalue).template toCustomClass<ElemType>();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
tagged_capsule<T> generic_to(IValue ivalue, _fake_type<tagged_capsule<T>>) {
|
||||
tagged_capsule<T> generic_to(IValue ivalue, _fake_type<tagged_capsule<T>> /*unused*/) {
|
||||
return tagged_capsule<T>{std::move(ivalue)};
|
||||
}
|
||||
|
||||
template <typename Elem>
|
||||
c10::List<Elem> generic_to(IValue ivalue, _fake_type<c10::List<Elem>>) {
|
||||
c10::List<Elem> generic_to(IValue ivalue, _fake_type<c10::List<Elem>> /*unused*/) {
|
||||
return impl::toTypedList<Elem>(std::move(ivalue).toList());
|
||||
}
|
||||
|
||||
@ -1867,7 +1867,7 @@ std::vector<T> createVectorFromList(const c10::List<T>& impl) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
OptionalArray<T> generic_to(IValue ivalue, _fake_type<OptionalArray<T>>) {
|
||||
OptionalArray<T> generic_to(IValue ivalue, _fake_type<OptionalArray<T>> /*unused*/) {
|
||||
if (ivalue.isNone()) {
|
||||
return {};
|
||||
}
|
||||
@ -1880,8 +1880,8 @@ namespace detail {
|
||||
template <typename Elem, size_t... I>
|
||||
std::array<Elem, sizeof...(I)> generic_to_array(
|
||||
IValue ivalue,
|
||||
_fake_type<std::array<Elem, sizeof...(I)>>,
|
||||
std::index_sequence<I...>) {
|
||||
_fake_type<std::array<Elem, sizeof...(I)>> /*unused*/,
|
||||
std::index_sequence<I...> /*unused*/) {
|
||||
// We need to do a deep copy of the array because there might be other
|
||||
// references to this same IValue that also use the list. We can't just
|
||||
// move the elements out.
|
||||
@ -1906,7 +1906,7 @@ std::array<Elem, N> generic_to(
|
||||
template <typename Key, typename Value>
|
||||
c10::Dict<Key, Value> generic_to(
|
||||
IValue ivalue,
|
||||
_fake_type<c10::Dict<Key, Value>>) {
|
||||
_fake_type<c10::Dict<Key, Value>> /*unused*/) {
|
||||
return impl::toTypedDict<Key, Value>(std::move(ivalue).toGenericDict());
|
||||
}
|
||||
|
||||
@ -1915,7 +1915,7 @@ C10_DEPRECATED_MESSAGE(
|
||||
"IValues based on std::unordered_map are slow and deprecated. Please use c10::Dict<K, V> instead.")
|
||||
std::unordered_map<K, V> generic_to(
|
||||
IValue ivalue,
|
||||
_fake_type<std::unordered_map<K, V>>) {
|
||||
_fake_type<std::unordered_map<K, V>> /*unused*/) {
|
||||
std::unordered_map<K, V> specialized_dict;
|
||||
|
||||
for (const auto& item : std::move(ivalue).toGenericDict()) {
|
||||
@ -1926,7 +1926,7 @@ std::unordered_map<K, V> generic_to(
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::optional<T> generic_to(IValue ivalue, _fake_type<std::optional<T>>) {
|
||||
std::optional<T> generic_to(IValue ivalue, _fake_type<std::optional<T>> /*unused*/) {
|
||||
if (ivalue.isNone()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
@ -1937,7 +1937,7 @@ namespace detail {
|
||||
template <typename Tuple, std::size_t... INDEX>
|
||||
Tuple generic_to_tuple_impl(
|
||||
const ivalue::TupleElements& t,
|
||||
std::index_sequence<INDEX...>) {
|
||||
std::index_sequence<INDEX...> /*unused*/) {
|
||||
return std::make_tuple(
|
||||
t[INDEX].to<typename std::tuple_element<INDEX, Tuple>::type>()...);
|
||||
}
|
||||
@ -1951,7 +1951,7 @@ template <
|
||||
std::is_lvalue_reference<Args>...,
|
||||
std::negation<std::is_constructible<IValue, Args>>...>,
|
||||
std::nullptr_t> = nullptr>
|
||||
std::tuple<Args...> generic_to(const IValue& ivalue, _fake_type<std::tuple<Args...>>) {
|
||||
std::tuple<Args...> generic_to(const IValue& ivalue, _fake_type<std::tuple<Args...>> /*unused*/) {
|
||||
const auto& vals = ivalue.toTupleRef().elements();
|
||||
TORCH_CHECK(vals.size() == sizeof...(Args));
|
||||
return detail::generic_to_tuple_impl<std::tuple<Args...>>(vals, Indices{});
|
||||
@ -2311,7 +2311,7 @@ inline IValue::IValue(std::optional<T> v) : IValue() {
|
||||
}
|
||||
}
|
||||
|
||||
inline IValue::IValue(std::nullopt_t) : IValue() {}
|
||||
inline IValue::IValue(std::nullopt_t /*unused*/) : IValue() {}
|
||||
|
||||
inline IValue::IValue(c10::intrusive_ptr<ivalue::Object> v)
|
||||
: tag(Tag::Object) {
|
||||
@ -2482,15 +2482,15 @@ namespace ivalue {
|
||||
namespace detail {
|
||||
|
||||
template <typename T>
|
||||
IValue from_(T&& x, std::true_type) {
|
||||
IValue from_(T&& x, std::true_type /*unused*/) {
|
||||
return IValue(std::forward<T>(x));
|
||||
}
|
||||
template <typename T>
|
||||
IValue from_(c10::intrusive_ptr<T> x, std::false_type) {
|
||||
IValue from_(c10::intrusive_ptr<T> x, std::false_type /*unused*/) {
|
||||
return IValue(std::move(x));
|
||||
}
|
||||
template <typename T>
|
||||
IValue from_(T&& /*x*/, std::false_type) {
|
||||
IValue from_(T&& /*x*/, std::false_type /*unused*/) {
|
||||
static_assert(
|
||||
guts::false_t<T>::value,
|
||||
"You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)");
|
||||
@ -2546,19 +2546,19 @@ struct MaybeOwnedTraits<IValue> {
|
||||
return &borrow;
|
||||
}
|
||||
|
||||
static bool debugBorrowIsValid(const borrow_type&) {
|
||||
static bool debugBorrowIsValid(const borrow_type& /*unused*/) {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct IValue::TagType<c10::Type> {
|
||||
static TORCH_API c10::TypePtr get(const IValue&);
|
||||
static TORCH_API c10::TypePtr get(const IValue& /*v*/);
|
||||
};
|
||||
|
||||
template <>
|
||||
struct IValue::TagType<c10::DynamicType> {
|
||||
static TORCH_API c10::TypePtr get(const IValue&);
|
||||
static TORCH_API c10::TypePtr get(const IValue& /*v*/);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
|
@ -44,7 +44,7 @@ constexpr int checkStaticTypes() {
|
||||
}
|
||||
|
||||
template <typename... Ts, size_t... Is>
|
||||
constexpr std::array<ArgumentDef, sizeof...(Ts)> createArgumentVectorFromTypes(std::index_sequence<Is...>) {
|
||||
constexpr std::array<ArgumentDef, sizeof...(Ts)> createArgumentVectorFromTypes(std::index_sequence<Is...> /*unused*/) {
|
||||
return (
|
||||
// Check types for common errors
|
||||
checkStaticTypes<Ts...>(),
|
||||
|
@ -83,7 +83,7 @@ inline bool operator!=(const OperatorName& lhs, const OperatorName& rhs) {
|
||||
}
|
||||
|
||||
TORCH_API std::string toString(const OperatorName& opName);
|
||||
TORCH_API std::ostream& operator<<(std::ostream&, const OperatorName&);
|
||||
TORCH_API std::ostream& operator<<(std::ostream& /*os*/, const OperatorName& /*opName*/);
|
||||
|
||||
} // namespace c10
|
||||
|
||||
|
@ -16,7 +16,7 @@ class SingletonTypePtr {
|
||||
/* implicit */ SingletonTypePtr(T* p) : repr_(p) {}
|
||||
|
||||
// We need this to satisfy Pybind11, but it shouldn't be hit.
|
||||
explicit SingletonTypePtr(std::shared_ptr<T>) { TORCH_CHECK(false); }
|
||||
explicit SingletonTypePtr(std::shared_ptr<T> /*unused*/) { TORCH_CHECK(false); }
|
||||
|
||||
using element_type = typename std::shared_ptr<T>::element_type;
|
||||
|
||||
|
@ -102,31 +102,8 @@ struct VecReduceAllSIMD<float, Op> {
|
||||
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
|
||||
// !defined(C10_MOBILE)
|
||||
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
#if defined(CPU_CAPABILITY_SVE256)
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
const Op& vec_fun,
|
||||
const Vectorized<float>& acc_vec) {
|
||||
using Vec = Vectorized<float>;
|
||||
Vec v = acc_vec;
|
||||
// 128-bit shuffle
|
||||
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
|
||||
Vec v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 64-bit shuffle
|
||||
ind = svdupq_n_u32(2, 3, 0, 1);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 32-bit shuffle
|
||||
ind = svdupq_n_u32(1, 0, 2, 3);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
return svlasta(svpfalse(), v);
|
||||
}
|
||||
};
|
||||
#else
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE)
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
@ -163,8 +140,35 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
|
||||
return vaddvq_f32(acc_vec);
|
||||
}
|
||||
};
|
||||
#endif // defined(CPU_CAPABILITY_SVE256)
|
||||
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
// && !defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
defined(CPU_CAPABILITY_SVE256)
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
const Op& vec_fun,
|
||||
const Vectorized<float>& acc_vec) {
|
||||
using Vec = Vectorized<float>;
|
||||
Vec v = acc_vec;
|
||||
// 128-bit shuffle
|
||||
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
|
||||
Vec v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 64-bit shuffle
|
||||
ind = svdupq_n_u32(2, 3, 0, 1);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 32-bit shuffle
|
||||
ind = svdupq_n_u32(1, 0, 2, 3);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
return svlasta(svpfalse(), v);
|
||||
}
|
||||
};
|
||||
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
// && defined(CPU_CAPABILITY_SVE256)
|
||||
|
||||
template <typename scalar_t, typename Op>
|
||||
inline scalar_t vec_reduce_all(
|
||||
|
@ -1,21 +1,9 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <cstdint>
|
||||
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#if defined(__aarch64__) && \
|
||||
(defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) || \
|
||||
defined(AT_BUILD_ARM_VECSVE_WITH_SLEEF))
|
||||
#define SLEEF_STATIC_LIBS
|
||||
#include <sleef.h>
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
|
||||
#else
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
|
||||
#endif
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
// Define the data type of VLS(vector-length specific).
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/sve/vec_common_sve.h>
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/util/bit_cast.h>
|
||||
@ -307,8 +308,8 @@ Vectorized<c10::BFloat16> inline operator/(
|
||||
}
|
||||
|
||||
inline Vectorized<BFloat16>::Vectorized() {
|
||||
const short zero = 0;
|
||||
values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero));
|
||||
auto vals_f = svdup_n_f32(0);
|
||||
values = convert_float_bfloat16(vals_f, vals_f);
|
||||
}
|
||||
|
||||
inline Vectorized<BFloat16>::Vectorized(int val) {
|
||||
|
@ -8,48 +8,13 @@
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_qint.h>
|
||||
|
||||
#elif defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#include <ATen/cpu/vec/sve/vec_bfloat16.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_double.h>
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
#include <ATen/cpu/vec/sve/vec_int.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_qint.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec256/vec256_half.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec256/vec256_convert.h>
|
||||
|
||||
#else // NEON
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec256/vec256_qint.h>
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE128)
|
||||
|
||||
#include <ATen/cpu/vec/functional.h>
|
||||
#endif
|
||||
|
||||
namespace at::vec {
|
||||
// Note [CPU_CAPABILITY namespace]
|
||||
@ -83,6 +48,12 @@ DEFINE_SVE_CAST(int32_t, s32, float, f32)
|
||||
DEFINE_SVE_CAST(int16_t, s16, float, f32)
|
||||
DEFINE_SVE_CAST(float, f32, double, f64)
|
||||
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
DEFINE_SVE_CAST(int64_t, s64, c10::BFloat16, bf16)
|
||||
DEFINE_SVE_CAST(int32_t, s32, c10::BFloat16, bf16)
|
||||
DEFINE_SVE_CAST(int16_t, s16, c10::BFloat16, bf16)
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
template <int64_t scale = 1>
|
||||
@ -202,11 +173,9 @@ std::pair<
|
||||
// group cols crossing lanes:
|
||||
// return {a0, b0, a1, b1, a2, b2, a3, b3}
|
||||
// {a4, b4, a5, b5, a6, b6, a7, b7}
|
||||
svbfloat16_t aReg = a;
|
||||
svbfloat16_t bReg = b;
|
||||
Vectorized<c10::BFloat16> c = svzip1_bf16(aReg, bReg);
|
||||
Vectorized<c10::BFloat16> d = svzip2_bf16(aReg, bReg);
|
||||
return std::make_pair(c, d);
|
||||
return std::make_pair(
|
||||
Vectorized<c10::BFloat16>(svzip1_bf16(a, b)),
|
||||
Vectorized<c10::BFloat16>(svzip2_bf16(a, b)));
|
||||
}
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
@ -255,27 +224,12 @@ std::pair<
|
||||
// swap lanes:
|
||||
// return {a0, a1, a2, a3, a4, a5, a6, a7}
|
||||
// {b0, b1, b2, b3, b4, b5, b6, b7}
|
||||
svbfloat16_t aReg = a;
|
||||
svbfloat16_t bReg = b;
|
||||
Vectorized<c10::BFloat16> c = svuzp1_bf16(aReg, bReg);
|
||||
Vectorized<c10::BFloat16> d = svuzp2_bf16(aReg, bReg);
|
||||
return std::make_pair(c, d);
|
||||
return std::make_pair(
|
||||
Vectorized<c10::BFloat16>(svuzp1_bf16((svbfloat16_t)a, (svbfloat16_t)b)),
|
||||
Vectorized<c10::BFloat16>(svuzp2_bf16((svbfloat16_t)a, (svbfloat16_t)b)));
|
||||
}
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
#define DEFINE_FLIP_FUNC(type, sve_func) \
|
||||
inline Vectorized<type> flip(const Vectorized<type>& v) { \
|
||||
return Vectorized<type>(sve_func(v)); \
|
||||
}
|
||||
// Use the macro to define the flip functions
|
||||
DEFINE_FLIP_FUNC(float, svrev_f32)
|
||||
DEFINE_FLIP_FUNC(double, svrev_f64)
|
||||
DEFINE_FLIP_FUNC(int64_t, svrev_s64)
|
||||
DEFINE_FLIP_FUNC(int32_t, svrev_s32)
|
||||
DEFINE_FLIP_FUNC(int16_t, svrev_s16)
|
||||
DEFINE_FLIP_FUNC(int8_t, svrev_s8)
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
@ -1,8 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#if defined(__aarch64__)
|
||||
#include <ATen/cpu/vec/vec_common_aarch64.h>
|
||||
#elif defined(CPU_CAPABILITY_AVX512)
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
#include <ATen/cpu/vec/vec512/vec512.h>
|
||||
#else
|
||||
#include <ATen/cpu/vec/vec128/vec128.h>
|
||||
@ -13,34 +11,6 @@ namespace at::vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
|
||||
stream << val.val_;
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
|
||||
stream << static_cast<int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
|
||||
stream << static_cast<unsigned int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
T buf[Vectorized<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vectorized<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
inline Vectorized<bool> convert_to_bool(Vectorized<int8_t> x) {
|
||||
__at_align__ bool buffer[x.size()];
|
||||
x.ne(Vectorized<int8_t>(0)).store(buffer);
|
||||
|
@ -2,7 +2,6 @@
|
||||
|
||||
// DO NOT DEFINE STATIC DATA IN THIS HEADER!
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
@ -263,13 +262,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
c10::bit_cast<at_bfloat16_t>(val6.x),
|
||||
c10::bit_cast<at_bfloat16_t>(val7.x)}) {}
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
Vectorized(svbfloat16_t v) : Vectorized16(svget_neonq(v)) {}
|
||||
operator svbfloat16_t() const {
|
||||
return svset_neonq(svundef_bf16(), values);
|
||||
}
|
||||
#endif
|
||||
|
||||
static Vectorized<c10::BFloat16> blendv(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
@ -382,23 +374,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
Vectorized ge(const Vectorized& other) const;
|
||||
Vectorized lt(const Vectorized& other) const;
|
||||
Vectorized le(const Vectorized& other) const;
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
|
||||
template <typename step_t>
|
||||
static Vectorized<BFloat16> arange(
|
||||
BFloat16 base = 0.f,
|
||||
step_t step = static_cast<step_t>(1)) {
|
||||
__at_align__ BFloat16 buffer[size()];
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
buffer[i] = base + i * step;
|
||||
}
|
||||
return svget_neonq(
|
||||
svld1_bf16(ptrue, reinterpret_cast<bfloat16_t*>(buffer)));
|
||||
}
|
||||
|
||||
#endif // CPU_CAPABILITY_SVE128
|
||||
|
||||
}; // Vectorized<c10::BFloat16>
|
||||
|
||||
inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
|
||||
@ -422,24 +397,6 @@ inline Vectorized<c10::BFloat16> convert_float_bfloat16(
|
||||
return Vectorized<c10::BFloat16>(at_vcombine_bf16(x1, x2));
|
||||
}
|
||||
|
||||
inline void load_fp32_from_bf16(const BFloat16* data, Vectorized<float>& out) {
|
||||
__at_align__ float values[Vectorized<float>::size()];
|
||||
for (const auto k : c10::irange(Vectorized<float>::size())) {
|
||||
values[k] = data[k];
|
||||
}
|
||||
out = Vectorized<float>::loadu(values);
|
||||
}
|
||||
|
||||
inline void load_fp32_from_bf16(
|
||||
const BFloat16* data,
|
||||
Vectorized<float>& out1,
|
||||
Vectorized<float>& out2) {
|
||||
Vectorized<BFloat16> bf16_vec = Vectorized<BFloat16>::loadu(data);
|
||||
auto floats = convert_bfloat16_float(bf16_vec);
|
||||
out1 = std::get<0>(floats);
|
||||
out2 = std::get<1>(floats);
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
Vectorized<c10::BFloat16> binary_operator_via_float(
|
||||
Op op,
|
||||
@ -622,12 +579,6 @@ Vectorized<c10::BFloat16> inline fnmsub(
|
||||
return -a * b - c;
|
||||
}
|
||||
|
||||
#else //
|
||||
|
||||
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
|
||||
|
||||
LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16)
|
||||
|
||||
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
namespace at::vec {
|
||||
inline namespace CPU_CAPABILITY {
|
||||
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
|
||||
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
|
||||
template <typename src_t>
|
||||
struct VecConvert<
|
||||
float,
|
||||
@ -60,7 +60,6 @@ struct VecConvert<float, 1, BFloat16, 1> {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // defined(__aarch64__) && (!defined(CPU_CAPABILITY_SVE) ||
|
||||
// defined(CPU_CAPABILITY_SVE128))
|
||||
#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
@ -4,10 +4,13 @@
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
#include <sleef.h>
|
||||
#endif
|
||||
|
||||
// Sleef offers vectorized versions of some transcedentals
|
||||
// such as sin, cos, tan etc..
|
||||
// However for now opting for STL, since we are not building
|
||||
@ -32,6 +35,12 @@ inline namespace CPU_CAPABILITY {
|
||||
#error "Big endian is not supported."
|
||||
#endif
|
||||
|
||||
#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
|
||||
#else
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
|
||||
#endif
|
||||
|
||||
template <int index, bool mask_val>
|
||||
struct BlendRegs {
|
||||
static float32x4_t impl(
|
||||
@ -85,12 +94,6 @@ class Vectorized<float> {
|
||||
operator float32x4_t() const {
|
||||
return values;
|
||||
}
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
Vectorized(svfloat32_t v) : values(svget_neonq(v)) {}
|
||||
operator svfloat32_t() const {
|
||||
return svset_neonq(svundef_f32(), values);
|
||||
}
|
||||
#endif
|
||||
template <int64_t mask>
|
||||
static Vectorized<float> blend(
|
||||
const Vectorized<float>& a,
|
||||
|
@ -4,6 +4,7 @@
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
@ -24,6 +25,7 @@ inline namespace CPU_CAPABILITY {
|
||||
// https://bugs.llvm.org/show_bug.cgi?id=45824
|
||||
// Most likely we will do aarch32 support with inline asm.
|
||||
#if !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
#ifdef __BIG_ENDIAN__
|
||||
#error "Big endian is not supported."
|
||||
#endif
|
||||
@ -419,24 +421,6 @@ Vectorized<c10::Half> inline operator+(
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void load_fp32_from_fp16(const c10::Half* data, Vectorized<float>& out) {
|
||||
__at_align__ float values[Vectorized<float>::size()];
|
||||
for (const auto k : c10::irange(Vectorized<float>::size())) {
|
||||
values[k] = data[k];
|
||||
}
|
||||
out = Vectorized<float>::loadu(values);
|
||||
}
|
||||
|
||||
inline void load_fp32_from_fp16(
|
||||
const c10::Half* data,
|
||||
Vectorized<float>& out1,
|
||||
Vectorized<float>& out2) {
|
||||
Vectorized<c10::Half> f16_vec = Vectorized<c10::Half>::loadu(data);
|
||||
auto floats = convert_half_float(f16_vec);
|
||||
out1 = std::get<0>(floats);
|
||||
out2 = std::get<1>(floats);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::Half> inline operator-(
|
||||
const Vectorized<c10::Half>& a,
|
||||
@ -672,53 +656,6 @@ Vectorized<c10::Half> inline fnmsub(
|
||||
return -a * b - c;
|
||||
#endif
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define CONVERT_NON_VECTORIZED_INIT(type, name) \
|
||||
inline std::tuple<Vectorized<float>, Vectorized<float>> \
|
||||
convert_##name##_float(const Vectorized<type>& a) { \
|
||||
constexpr int64_t K = Vectorized<type>::size(); \
|
||||
__at_align__ float arr[K]; \
|
||||
__at_align__ type arr2[K]; \
|
||||
a.store(arr2); \
|
||||
convert(arr2, arr, K); \
|
||||
return std::make_tuple( \
|
||||
Vectorized<float>::loadu(arr), \
|
||||
Vectorized<float>::loadu(arr + Vectorized<float>::size())); \
|
||||
} \
|
||||
inline Vectorized<type> convert_float_##name( \
|
||||
const Vectorized<float>& a, const Vectorized<float>& b) { \
|
||||
constexpr int64_t K = Vectorized<type>::size(); \
|
||||
__at_align__ float arr[K]; \
|
||||
__at_align__ type arr2[K]; \
|
||||
a.store(arr); \
|
||||
b.store(arr + Vectorized<float>::size()); \
|
||||
convert(arr, arr2, K); \
|
||||
return Vectorized<type>::loadu(arr2); \
|
||||
}
|
||||
|
||||
#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
|
||||
inline void load_fp32_from_##name( \
|
||||
const type* data, Vectorized<float>& out) { \
|
||||
__at_align__ float values[Vectorized<float>::size()]; \
|
||||
for (const auto k : c10::irange(Vectorized<float>::size())) { \
|
||||
values[k] = data[k]; \
|
||||
} \
|
||||
out = Vectorized<float>::loadu(values); \
|
||||
} \
|
||||
\
|
||||
inline void load_fp32_from_##name( \
|
||||
const type* data, Vectorized<float>& out1, Vectorized<float>& out2) { \
|
||||
load_fp32_from_##name(data, out1); \
|
||||
data += Vectorized<float>::size(); \
|
||||
load_fp32_from_##name(data, out2); \
|
||||
}
|
||||
|
||||
CONVERT_NON_VECTORIZED_INIT(Half, half)
|
||||
|
||||
LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16)
|
||||
|
||||
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
@ -9,16 +9,21 @@
|
||||
#if !( \
|
||||
defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || \
|
||||
defined(CPU_CAPABILITY_ZVECTOR))
|
||||
#include <ATen/cpu/vec/vec256/vec256_double.h>
|
||||
#if defined(CPU_CAPABILITY_SVE256)
|
||||
#include <ATen/cpu/vec/sve/vec_common_sve.h>
|
||||
#else
|
||||
// clang-format off
|
||||
#include <ATen/cpu/vec/vec256/vec256_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_double.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_int.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_qint.h>
|
||||
#endif
|
||||
#if !defined(CPU_CAPABILITY_SVE256) || !defined(__ARM_FEATURE_BF16)
|
||||
#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
|
||||
#endif
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_half.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
|
||||
// clang-format on
|
||||
#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX)
|
||||
#include <ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h>
|
||||
@ -51,6 +56,34 @@ namespace at::vec {
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
|
||||
stream << val.val_;
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
|
||||
stream << static_cast<int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
|
||||
stream << static_cast<unsigned int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
T buf[Vectorized<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vectorized<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
#if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -268,7 +268,9 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16)
|
||||
|
||||
#else // defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if !(defined(__aarch64__))
|
||||
#if !( \
|
||||
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE256))
|
||||
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
|
||||
#endif
|
||||
|
||||
|
@ -342,19 +342,19 @@ class Vectorized<c10::complex<double>> {
|
||||
return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ);
|
||||
}
|
||||
Vectorized<c10::complex<double>> operator<(
|
||||
const Vectorized<c10::complex<double>>&) const {
|
||||
const Vectorized<c10::complex<double>>& /*unused*/) const {
|
||||
TORCH_CHECK(false, "not supported for complex numbers");
|
||||
}
|
||||
Vectorized<c10::complex<double>> operator<=(
|
||||
const Vectorized<c10::complex<double>>&) const {
|
||||
const Vectorized<c10::complex<double>>& /*unused*/) const {
|
||||
TORCH_CHECK(false, "not supported for complex numbers");
|
||||
}
|
||||
Vectorized<c10::complex<double>> operator>(
|
||||
const Vectorized<c10::complex<double>>&) const {
|
||||
const Vectorized<c10::complex<double>>& /*unused*/) const {
|
||||
TORCH_CHECK(false, "not supported for complex numbers");
|
||||
}
|
||||
Vectorized<c10::complex<double>> operator>=(
|
||||
const Vectorized<c10::complex<double>>&) const {
|
||||
const Vectorized<c10::complex<double>>& /*unused*/) const {
|
||||
TORCH_CHECK(false, "not supported for complex numbers");
|
||||
}
|
||||
|
||||
|
@ -268,7 +268,9 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16)
|
||||
|
||||
#else // defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if !defined(__aarch64__) || defined(CPU_CAPABILITY_SVE256)
|
||||
#if !( \
|
||||
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE256))
|
||||
CONVERT_NON_VECTORIZED_INIT(Half, half)
|
||||
#endif
|
||||
|
||||
|
@ -5,13 +5,6 @@
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#ifdef __aarch64__
|
||||
#if defined(CPU_CAPABILITY_SVE128) || !defined(CPU_CAPABILITY_SVE)
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <ATen/native/quantized/AffineQuantizerBase.h>
|
||||
|
||||
#include <c10/util/irange.h>
|
||||
@ -922,7 +915,7 @@ Vectorized<c10::quint8> inline maximum(
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
#else
|
||||
#elif !defined(CPU_CAPABILITY_SVE256)
|
||||
|
||||
// NOTE: These are low-performance implementations that we fall back on
|
||||
// if we are not building with AVX2. This may not be an issue, because
|
||||
@ -1379,18 +1372,12 @@ Vectorized<c10::quint8> inline maximum(
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
#if defined(__aarch64__) && \
|
||||
(defined(CPU_CAPABILITY_SVE128) || !defined(CPU_CAPABILITY_SVE))
|
||||
#endif // if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
|
||||
std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
|
||||
at::vec::Vectorized<int8_t> src) {
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE
|
||||
svint8_t x = src;
|
||||
auto s8x8 = vget_low_s8(svget_neonq(x));
|
||||
#else
|
||||
auto s8x8 = vld1_s8(src.operator const int8_t*());
|
||||
#endif
|
||||
|
||||
auto s16x8 = vmovl_s8(s8x8);
|
||||
|
||||
auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
|
||||
@ -1415,14 +1402,7 @@ std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
|
||||
|
||||
Vectorized<float> inline convert_int8_half_register_to_float(
|
||||
at::vec::Vectorized<int8_t> src) {
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE
|
||||
svint8_t x = src;
|
||||
auto s8x8 = vget_low_s8(svget_neonq(x));
|
||||
#else
|
||||
auto s8x8 = vld1_s8(src.operator const int8_t*());
|
||||
#endif
|
||||
|
||||
auto s16x8 = vmovl_s8(s8x8);
|
||||
|
||||
auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));
|
||||
@ -1440,8 +1420,5 @@ Vectorized<float> inline convert_int8_half_register_to_float(
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif // if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
@ -31,6 +31,34 @@ namespace vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
|
||||
stream << val.val_;
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
|
||||
stream << static_cast<int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
|
||||
stream << static_cast<unsigned int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
T buf[Vectorized<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vectorized<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512)
|
||||
|
@ -67,7 +67,18 @@ Windows llvm will not have this definition.
|
||||
#endif
|
||||
#define VECTOR_WIDTH 64
|
||||
#define int_vector __m512i
|
||||
#elif defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_SVE256)
|
||||
#elif defined(__aarch64__) && \
|
||||
!defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512
|
||||
// SVE code expects 256-vectors; leave that set for SVE?
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(16)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(16))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 16
|
||||
#else // CPU_CAPABILITY_AVX512
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(32)))
|
||||
#elif defined(_WIN32)
|
||||
@ -77,27 +88,7 @@ Windows llvm will not have this definition.
|
||||
#endif
|
||||
#define VECTOR_WIDTH 32
|
||||
#define int_vector __m256i
|
||||
#elif defined(__aarch64__)
|
||||
// Define alignment and vector width for SVE128/Default (e.g., NEON)
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(16)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(16))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 16
|
||||
#else
|
||||
// Fallback: define default alignment and vector width
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(32)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(32))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 32
|
||||
#endif
|
||||
#endif // CPU_CAPABILITY_AVX512
|
||||
|
||||
namespace at::vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
|
@ -422,18 +422,34 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
|
||||
abType = CUDA_R_16F;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
|
||||
#ifndef USE_ROCM
|
||||
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
|
||||
if (fp16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
fp16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
}
|
||||
#endif
|
||||
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
|
||||
abType = CUDA_R_16BF;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
|
||||
#ifndef USE_ROCM
|
||||
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
|
||||
if (bf16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
bf16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
@ -1120,8 +1136,15 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
|
||||
}
|
||||
if (prop->major >= 5) {
|
||||
cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
|
||||
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
|
||||
cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
|
||||
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
|
||||
TORCH_CHECK(fp16_reduction !=
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
|
||||
"torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
|
||||
"..., allow_splitk=False) requires the cuBLASLt backend");
|
||||
if (fp16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
cublas_flags = static_cast<cublasMath_t>(
|
||||
cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
|
||||
}
|
||||
// Disallow fp16 reductions that could lead to unexpected overflow issues.
|
||||
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));
|
||||
@ -1180,8 +1203,15 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT
|
||||
GEMM_CHECK_ARGVALUES(at::BFloat16);
|
||||
#ifndef USE_ROCM
|
||||
cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
|
||||
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
|
||||
cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
|
||||
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
|
||||
TORCH_CHECK(bf16_reduction !=
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
|
||||
"torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
|
||||
"..., allow_splitk=False) requires the cuBLASLt backend");
|
||||
if (bf16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
cublas_flags = static_cast<cublasMath_t>(
|
||||
cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
|
||||
}
|
||||
#endif
|
||||
#if defined(USE_ROCM)
|
||||
@ -1577,18 +1607,34 @@ bool gemm_and_bias(
|
||||
abType = CUDA_R_16F;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
|
||||
#ifndef USE_ROCM
|
||||
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
|
||||
if (fp16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
fp16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
}
|
||||
#endif
|
||||
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
|
||||
abType = CUDA_R_16BF;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
|
||||
#ifndef USE_ROCM
|
||||
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
|
||||
if (bf16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
bf16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -1815,6 +1861,8 @@ template bool gemm_and_bias(
|
||||
int64_t result_ld,
|
||||
GEMMAndBiasActivationEpilogue activation);
|
||||
|
||||
using at::blas::ScalingType;
|
||||
|
||||
int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fast_accum) {
|
||||
switch (scaling_type) {
|
||||
case ScalingType::BlockWise1x32:
|
||||
|
@ -14,6 +14,7 @@
|
||||
*/
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <ATen/BlasBackend.h>
|
||||
#include <ATen/OpMathType.h>
|
||||
|
||||
namespace at::cuda::blas {
|
||||
@ -136,15 +137,6 @@ void int8_gemm(
|
||||
int32_t* result_ptr,
|
||||
int64_t result_ld);
|
||||
|
||||
enum class ScalingType : std::uint8_t {
|
||||
TensorWise, // fp32 scales
|
||||
RowWise, // fp32 scales
|
||||
BlockWise1x16, // fp8_e4m3fn scales
|
||||
BlockWise1x32, // fp8_e8m0fnu scales
|
||||
BlockWise1x128, // fp32 scales
|
||||
BlockWise128x128, // fp32 scales
|
||||
};
|
||||
|
||||
void scaled_gemm(
|
||||
char transa,
|
||||
char transb,
|
||||
@ -156,13 +148,13 @@ void scaled_gemm(
|
||||
int64_t mat1_ld,
|
||||
ScalarType mat1_dtype,
|
||||
ScalarType mat1_scale_dtype,
|
||||
ScalingType mat1_scaling_type,
|
||||
at::blas::ScalingType mat1_scaling_type,
|
||||
const void* mat2_ptr,
|
||||
const void* mat2_scale_ptr,
|
||||
int64_t mat2_ld,
|
||||
ScalarType mat2_dtype,
|
||||
ScalarType mat2_scale_dtype,
|
||||
ScalingType mat2_scaling_type,
|
||||
at::blas::ScalingType mat2_scaling_type,
|
||||
const void* bias_ptr,
|
||||
ScalarType bias_dtype,
|
||||
void* result_ptr,
|
||||
|
@ -326,6 +326,23 @@ bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
|
||||
#endif
|
||||
}
|
||||
|
||||
bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const {
|
||||
#if AT_CUDNN_ENABLED() && (CUDNN_VERSION >= 91300)
|
||||
if (!hasCUDA()) {
|
||||
return false;
|
||||
}
|
||||
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
|
||||
// Check for Volta cores
|
||||
if (prop->major >= 8) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
long CUDAHooks::versionCuDNN() const {
|
||||
#if AT_CUDNN_ENABLED()
|
||||
return CUDNN_VERSION;
|
||||
|
@ -17,7 +17,7 @@ TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)());
|
||||
|
||||
// The real implementation of CUDAHooksInterface
|
||||
struct CUDAHooks : public at::CUDAHooksInterface {
|
||||
CUDAHooks(at::CUDAHooksArgs) {}
|
||||
CUDAHooks(at::CUDAHooksArgs /*unused*/) {}
|
||||
void init() const override;
|
||||
Device getDeviceFromPtr(void* data) const override;
|
||||
bool isPinnedPtr(const void* data) const override;
|
||||
@ -45,6 +45,7 @@ struct CUDAHooks : public at::CUDAHooksInterface {
|
||||
bool supportsDilatedConvolutionWithCuDNN() const override;
|
||||
bool supportsDepthwiseConvolutionWithCuDNN() const override;
|
||||
bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
|
||||
bool supportsBFloat16RNNWithCuDNN() const override;
|
||||
bool hasCUDART() const override;
|
||||
long versionCUDART() const override;
|
||||
long versionCuDNN() const override;
|
||||
|
@ -29,7 +29,7 @@
|
||||
|
||||
namespace at::cuda::tunable {
|
||||
|
||||
using at::cuda::blas::ScalingType;
|
||||
using at::blas::ScalingType;
|
||||
|
||||
enum class BlasOp {
|
||||
N = 0,
|
||||
|
@ -29,7 +29,7 @@ template <typename ParamsT>
|
||||
class Callable {
|
||||
public:
|
||||
virtual ~Callable() = default;
|
||||
virtual TuningStatus Call(const ParamsT*) {
|
||||
virtual TuningStatus Call(const ParamsT* /*unused*/) {
|
||||
return FAIL;
|
||||
}
|
||||
virtual TuningStatus IsSupported(const ParamsT* params) {
|
||||
|
@ -166,6 +166,10 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual bool supportsBFloat16RNNWithCuDNN() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual long versionCuDNN() const {
|
||||
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
|
||||
}
|
||||
|
@ -25,7 +25,7 @@ struct TORCH_API HPUHooksInterface : AcceleratorHooksInterface {
|
||||
false, "Cannot get device of pointer on HPU without HPU backend");
|
||||
}
|
||||
|
||||
bool isPinnedPtr(const void*) const override {
|
||||
bool isPinnedPtr(const void* /*data*/) const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -410,7 +410,7 @@ struct ExistingBdimBatchRuleHelper<F, Func, c10::guts::typelist::typelist<A, T..
|
||||
|
||||
|
||||
template <typename F, F Method, typename... ExtraArgs>
|
||||
Tensor& unary_inplace_batch_rule(Tensor& self, std::optional<int64_t>, ExtraArgs... extra_args) {
|
||||
Tensor& unary_inplace_batch_rule(Tensor& self, std::optional<int64_t> /*unused*/, ExtraArgs... extra_args) {
|
||||
INVOKE(self, Method)(std::forward<ExtraArgs>(extra_args)...);
|
||||
return self;
|
||||
}
|
||||
|
@ -18,7 +18,7 @@ extern std::atomic<const MetalInterface*> g_metal_impl_registry;
|
||||
|
||||
class MetalImplRegistrar {
|
||||
public:
|
||||
explicit MetalImplRegistrar(MetalInterface*);
|
||||
explicit MetalImplRegistrar(MetalInterface* /*impl*/);
|
||||
};
|
||||
|
||||
at::Tensor& metal_copy_(at::Tensor& self, const at::Tensor& src);
|
||||
|
@ -12,7 +12,7 @@
|
||||
|
||||
#define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled"
|
||||
#define MPS_ERROR_RUNTIME_TOO_LOW \
|
||||
"The MPS backend is supported on MacOS 13.0+.", \
|
||||
"The MPS backend is supported on MacOS 14.0+. ", \
|
||||
"Current OS version can be queried using `sw_vers`"
|
||||
#define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \
|
||||
"as the MPS framework doesn't support float64. Please use float32 instead."
|
||||
|
@ -2060,7 +2060,7 @@ std::tuple<Tensor, Tensor> linalg_lu_factor(const Tensor& A, bool pivot) {
|
||||
}
|
||||
|
||||
// TODO Deprecate this function in favour of linalg_lu_factor_ex
|
||||
std::tuple<Tensor, Tensor, Tensor> _lu_with_info(const Tensor& self, bool compute_pivots, bool) {
|
||||
std::tuple<Tensor, Tensor, Tensor> _lu_with_info(const Tensor& self, bool compute_pivots, bool /*unused*/) {
|
||||
TORCH_WARN_ONCE(
|
||||
"torch.lu is deprecated in favor of torch.linalg.lu_factor / torch.linalg.lu_factor_ex and will be ",
|
||||
"removed in a future PyTorch release.\n",
|
||||
|
@ -1157,103 +1157,103 @@ REGISTER_AVX512_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_AVX2_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_VSX_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_SVE_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_SVE256_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(cholesky_inverse_stub, DEFAULT, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_AVX512_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_AVX2_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_VSX_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_ZVECTOR_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_SVE_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_SVE256_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(linalg_eig_stub, DEFAULT, &linalg_eig_kernel)
|
||||
REGISTER_AVX512_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_AVX2_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_VSX_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_SVE_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_SVE256_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(linalg_eigh_stub, DEFAULT, &linalg_eigh_kernel)
|
||||
REGISTER_AVX512_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_AVX2_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_VSX_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_SVE_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_SVE256_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(geqrf_stub, DEFAULT, &geqrf_kernel)
|
||||
REGISTER_AVX512_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_AVX2_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_VSX_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_SVE_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_SVE256_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(orgqr_stub, DEFAULT, &orgqr_kernel_impl)
|
||||
REGISTER_AVX512_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_AVX2_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_VSX_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_ZVECTOR_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_SVE_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_SVE256_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ormqr_stub, DEFAULT, &ormqr_kernel)
|
||||
REGISTER_AVX512_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_AVX2_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_VSX_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_SVE_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_SVE256_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lstsq_stub, DEFAULT, &lstsq_kernel)
|
||||
REGISTER_AVX512_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_AVX2_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_VSX_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_SVE_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_SVE256_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(triangular_solve_stub, DEFAULT, &triangular_solve_kernel)
|
||||
REGISTER_AVX512_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_AVX2_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_VSX_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_SVE_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_SVE256_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lu_factor_stub, DEFAULT, &lu_factor_kernel)
|
||||
REGISTER_AVX512_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_AVX2_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_VSX_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_SVE_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_SVE256_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ldl_factor_stub, DEFAULT, &ldl_factor_kernel)
|
||||
REGISTER_AVX512_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_AVX2_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_VSX_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_SVE_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_SVE256_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ldl_solve_stub, DEFAULT, &ldl_solve_kernel)
|
||||
REGISTER_AVX512_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_AVX2_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_VSX_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_SVE_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_SVE256_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lu_solve_stub, DEFAULT, &lu_solve_kernel)
|
||||
REGISTER_AVX512_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_AVX2_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_VSX_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_SVE_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_SVE256_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(svd_stub, DEFAULT, &svd_kernel)
|
||||
REGISTER_AVX512_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_AVX2_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_VSX_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_SVE_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_SVE256_DISPATCH(svd_stub, &svd_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(unpack_pivots_stub, DEFAULT, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_AVX512_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_AVX2_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
} // namespace at::native
|
||||
|
@ -39,21 +39,19 @@ static CPUCapability compute_cpu_capability() {
|
||||
}
|
||||
#elif defined(HAVE_SVE_CPU_DEFINITION)
|
||||
int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW.
|
||||
if (envar == "sve") {
|
||||
// Select SVE capability based on the maximum SVE VL supported by the HW.
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
if (envar == "sve256") {
|
||||
if (sve_vl == 256) {
|
||||
#ifdef HAVE_ARM_BF16_CPU_DEFINITION
|
||||
if (cpuinfo_has_arm_bf16()) {
|
||||
return CPUCapability::SVE256;
|
||||
}
|
||||
} else if (sve_vl == 128) {
|
||||
if (cpuinfo_has_arm_bf16()) {
|
||||
return CPUCapability::SVE128;
|
||||
}
|
||||
} else {
|
||||
TORCH_WARN("SVE capability not available on hardware. Falling back to DEFAULT");
|
||||
return CPUCapability::DEFAULT;
|
||||
#endif
|
||||
}
|
||||
TORCH_WARN("SVE256 capability not available on hardware. Falling back to DEFAULT");
|
||||
return CPUCapability::DEFAULT;
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#ifdef HAVE_AVX512_CPU_DEFINITION
|
||||
if (envar == "avx512") {
|
||||
@ -115,11 +113,6 @@ static CPUCapability compute_cpu_capability() {
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
if (sve_vl == 128) { // Check for SVE128
|
||||
return CPUCapability::SVE128;
|
||||
}
|
||||
#endif
|
||||
// Return the default CPU capability.
|
||||
return CPUCapability::DEFAULT;
|
||||
}
|
||||
@ -154,9 +147,6 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
) {
|
||||
constexpr auto supported_devices = c10::array_of<c10::DeviceType>(
|
||||
c10::DeviceType::CPU,
|
||||
@ -194,9 +184,6 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, SVE128
|
||||
#endif
|
||||
);
|
||||
if (!std::holds_alternative<ErrorType>(result)) {
|
||||
@ -255,9 +242,6 @@ void* DispatchStubImpl::get_call_ptr(
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
) {
|
||||
|
||||
auto result = try_get_call_ptr(
|
||||
@ -282,10 +266,6 @@ void* DispatchStubImpl::get_call_ptr(
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
,
|
||||
SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
,
|
||||
SVE128
|
||||
#endif
|
||||
);
|
||||
if (std::holds_alternative<ErrorType>(result)) {
|
||||
@ -320,9 +300,6 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
){
|
||||
|
||||
@ -365,16 +342,6 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
|
||||
return DispatchResult(SVE256);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
if (capability >= static_cast<int>(CPUCapability::SVE128)) {
|
||||
if (C10_UNLIKELY(!SVE128)) {
|
||||
// dispatch to DEFAULT, since the SVE kernel is missing
|
||||
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
|
||||
} else {
|
||||
return DispatchResult(SVE128);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
|
||||
}
|
||||
@ -396,9 +363,6 @@ void* DispatchStubImpl::choose_cpu_impl(
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
) {
|
||||
auto capability = static_cast<int>(get_cpu_capability());
|
||||
(void)capability;
|
||||
@ -444,17 +408,6 @@ void* DispatchStubImpl::choose_cpu_impl(
|
||||
return SVE256;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
if (capability >= static_cast<int>(CPUCapability::SVE128)) {
|
||||
if (C10_UNLIKELY(!SVE128)) {
|
||||
// dispatch to DEFAULT, since the SVE kernel is missing
|
||||
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
|
||||
return DEFAULT;
|
||||
} else {
|
||||
return SVE128;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
|
||||
return DEFAULT;
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user