mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-03 15:35:04 +08:00
Compare commits
17 Commits
varlen-api
...
mlazos/use
| Author | SHA1 | Date | |
|---|---|---|---|
| f70a6ac1a6 | |||
| 020bd1f830 | |||
| c84e73df6e | |||
| 6120d39fdb | |||
| 17ed117a90 | |||
| 3e941aebb7 | |||
| e403203714 | |||
| 1e80b1ad7d | |||
| 3dbe758856 | |||
| 7faa4842f8 | |||
| 45200769d1 | |||
| 38695fbfb4 | |||
| 0c01bec755 | |||
| 02cbfe3469 | |||
| 3e8fcf18ab | |||
| 3e5b122936 | |||
| 422233fde2 |
@ -344,7 +344,7 @@ docker build \
|
||||
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
|
||||
--build-arg "KATEX=${KATEX:-}" \
|
||||
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
|
||||
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx1100}" \
|
||||
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
|
||||
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
|
||||
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
|
||||
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
|
||||
|
||||
@ -1 +1 @@
|
||||
deb42f2a8e48f5032b4a98ee781a15fa87a157cf
|
||||
e0dda9059d082537cee36be6c5e4fe3b18c880c0
|
||||
|
||||
@ -1 +1 @@
|
||||
7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
|
||||
27664085f804afc83df26f740bb46c365854f2c4
|
||||
|
||||
@ -46,9 +46,9 @@ case ${DOCKER_TAG_PREFIX} in
|
||||
BASE_TARGET=rocm
|
||||
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
|
||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||
# add gfx950, gfx115x conditionally starting in ROCm 7.0
|
||||
# add gfx950 conditionally starting in ROCm 7.0
|
||||
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
|
||||
fi
|
||||
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
|
||||
;;
|
||||
|
||||
@ -115,9 +115,6 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
|
||||
# cmake-3.28.0 from pip for onnxruntime
|
||||
RUN python3 -mpip install cmake==3.28.0
|
||||
|
||||
ADD ./common/patch_libstdc.sh patch_libstdc.sh
|
||||
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
|
||||
|
||||
# build onnxruntime 1.21.0 from sources.
|
||||
# it is not possible to build it from sources using pip,
|
||||
# so just build it from upstream repository.
|
||||
|
||||
@ -84,9 +84,9 @@ case ${image} in
|
||||
DEVTOOLSET_VERSION="11"
|
||||
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
|
||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||
# add gfx950, gfx115x conditionally starting in ROCm 7.0
|
||||
# add gfx950 conditionally starting in ROCm 7.0
|
||||
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
|
||||
fi
|
||||
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
|
||||
;;
|
||||
|
||||
@ -10,6 +10,11 @@ BAD_SSL = "https://self-signed.badssl.com"
|
||||
|
||||
print("Testing SSL certificate checking for Python:", sys.version)
|
||||
|
||||
if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
|
||||
print("This version never checks SSL certs; skipping tests")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
EXC = OSError
|
||||
|
||||
print(f"Connecting to {GOOD_SSL} should work")
|
||||
|
||||
@ -67,7 +67,7 @@ fi
|
||||
# wheels with cxx11-abi
|
||||
|
||||
echo "Checking that the gcc ABI is what we expect"
|
||||
if [[ "$(uname)" != 'Darwin' ]]; then
|
||||
if [[ "$(uname)" != 'Darwin' && "$(uname -m)" != "s390x" ]]; then
|
||||
# We also check that there are cxx11 symbols in libtorch
|
||||
#
|
||||
echo "Checking that symbols in libtorch.so have the right gcc abi"
|
||||
|
||||
@ -256,7 +256,7 @@ test_torchbench_smoketest() {
|
||||
local device=mps
|
||||
local dtypes=(undefined float16 bfloat16 notset)
|
||||
local dtype=${dtypes[$1]}
|
||||
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
|
||||
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
|
||||
|
||||
for backend in eager inductor; do
|
||||
|
||||
@ -319,7 +319,7 @@ test_aoti_torchbench_smoketest() {
|
||||
local device=mps
|
||||
local dtypes=(undefined float16 bfloat16 notset)
|
||||
local dtype=${dtypes[$1]}
|
||||
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
|
||||
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
|
||||
|
||||
echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
|
||||
local dtype_arg="--${dtype}"
|
||||
|
||||
@ -838,7 +838,7 @@ test_dynamo_benchmark() {
|
||||
elif [[ "${suite}" == "timm_models" ]]; then
|
||||
export TORCHBENCH_ONLY_MODELS="inception_v3"
|
||||
elif [[ "${suite}" == "torchbench" ]]; then
|
||||
export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
|
||||
export TORCHBENCH_ONLY_MODELS="hf_Bert"
|
||||
fi
|
||||
fi
|
||||
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
|
||||
@ -869,13 +869,13 @@ test_inductor_torchbench_smoketest_perf() {
|
||||
mkdir -p "$TEST_REPORTS_DIR"
|
||||
|
||||
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
|
||||
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \
|
||||
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
|
||||
--output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
|
||||
# The threshold value needs to be actively maintained to make this check useful
|
||||
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
|
||||
|
||||
# Check memory compression ratio for a few models
|
||||
for test in BERT_pytorch yolov3; do
|
||||
for test in hf_Albert timm_vision_transformer; do
|
||||
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
|
||||
--disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
|
||||
--only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
|
||||
@ -886,7 +886,7 @@ test_inductor_torchbench_smoketest_perf() {
|
||||
done
|
||||
|
||||
# Perform some "warm-start" runs for a few huggingface models.
|
||||
for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
|
||||
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
|
||||
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
|
||||
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
|
||||
python benchmarks/dynamo/check_accuracy.py \
|
||||
|
||||
@ -71,7 +71,14 @@ export PYTORCH_BUILD_NUMBER=1
|
||||
|
||||
# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
|
||||
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux'"
|
||||
|
||||
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
|
||||
# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
|
||||
if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux'"
|
||||
fi
|
||||
|
||||
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
|
||||
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
|
||||
|
||||
35
.github/actions/setup-linux/action.yml
vendored
35
.github/actions/setup-linux/action.yml
vendored
@ -28,10 +28,6 @@ runs:
|
||||
echo "instance-type: $(get_ec2_metadata instance-type)"
|
||||
echo "system info $(uname -a)"
|
||||
|
||||
- name: Print GPU info (if present)
|
||||
shell: bash
|
||||
run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi
|
||||
|
||||
- name: Check if in a container runner
|
||||
shell: bash
|
||||
id: check_container_runner
|
||||
@ -86,6 +82,37 @@ runs:
|
||||
# Prune all of the docker images
|
||||
docker system prune -af
|
||||
|
||||
- name: Manually resolve download.pytorch.org
|
||||
shell: bash
|
||||
continue-on-error: true
|
||||
run: |
|
||||
set +e
|
||||
set -x
|
||||
|
||||
PT_DOMAIN=download.pytorch.org
|
||||
# TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400,
|
||||
# cleaning this up once the issue is fixed. There are more than one resolved IP here, the last
|
||||
# one is returned at random
|
||||
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)
|
||||
|
||||
if [ -z "${RESOLVED_IP}" ]; then
|
||||
echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
|
||||
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)
|
||||
|
||||
if [ -z "${RESOLVED_IP}" ]; then
|
||||
echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if grep -r "${PT_DOMAIN}" /etc/hosts; then
|
||||
# Clean up any old records first
|
||||
sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts
|
||||
fi
|
||||
|
||||
echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
|
||||
cat /etc/hosts
|
||||
|
||||
- name: Check that the docker daemon is running
|
||||
shell: bash
|
||||
continue-on-error: true
|
||||
|
||||
BIN
.github/scripts/drci_mocks.json.gz
vendored
BIN
.github/scripts/drci_mocks.json.gz
vendored
Binary file not shown.
1
.github/scripts/github_utils.py
vendored
1
.github/scripts/github_utils.py
vendored
@ -18,7 +18,6 @@ class GitHubComment:
|
||||
body_text: str
|
||||
created_at: str
|
||||
author_login: str
|
||||
author_url: Optional[str]
|
||||
author_association: str
|
||||
editor_login: Optional[str]
|
||||
database_id: int
|
||||
|
||||
BIN
.github/scripts/gql_mocks.json.gz
vendored
BIN
.github/scripts/gql_mocks.json.gz
vendored
Binary file not shown.
2
.github/scripts/test_check_labels.py
vendored
2
.github/scripts/test_check_labels.py
vendored
@ -38,7 +38,6 @@ def mock_get_comments() -> list[GitHubComment]:
|
||||
body_text="mock_body_text",
|
||||
created_at="",
|
||||
author_login="",
|
||||
author_url=None,
|
||||
author_association="",
|
||||
editor_login=None,
|
||||
database_id=1,
|
||||
@ -49,7 +48,6 @@ def mock_get_comments() -> list[GitHubComment]:
|
||||
body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
|
||||
created_at="",
|
||||
author_login=BOT_AUTHORS[1],
|
||||
author_url=None,
|
||||
author_association="",
|
||||
editor_login=None,
|
||||
database_id=2,
|
||||
|
||||
18
.github/scripts/test_trymerge.py
vendored
18
.github/scripts/test_trymerge.py
vendored
@ -32,7 +32,6 @@ from trymerge import (
|
||||
main as trymerge_main,
|
||||
MandatoryChecksMissingError,
|
||||
MergeRule,
|
||||
PostCommentError,
|
||||
RE_GHSTACK_DESC,
|
||||
read_merge_rules,
|
||||
remove_job_name_suffix,
|
||||
@ -589,23 +588,6 @@ class TestTryMerge(TestCase):
|
||||
self.assertEqual(mock_merge_base, pr.get_merge_base())
|
||||
mocked_gh_fetch_merge_base.assert_called_once()
|
||||
|
||||
def test_app_can_revert(self, *args: Any) -> None:
|
||||
pr = GitHubPR("pytorch", "pytorch", 164660)
|
||||
repo = DummyGitRepo()
|
||||
app_comment_id, impostor_comment_id = 3375785595, 3377647892
|
||||
# Check that app can revert
|
||||
self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id))
|
||||
# But impostor can not
|
||||
self.assertRaises(
|
||||
PostCommentError,
|
||||
lambda: validate_revert(repo, pr, comment_id=impostor_comment_id),
|
||||
)
|
||||
# Despite it's name being the name of the bot
|
||||
self.assertEqual(
|
||||
pr.get_comment_by_id(impostor_comment_id).author_login,
|
||||
"pytorch-auto-revert",
|
||||
)
|
||||
|
||||
|
||||
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
|
||||
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
|
||||
|
||||
7
.github/scripts/trymerge.py
vendored
7
.github/scripts/trymerge.py
vendored
@ -234,7 +234,6 @@ query ($owner: String!, $name: String!, $number: Int!) {
|
||||
createdAt
|
||||
author {
|
||||
login
|
||||
url
|
||||
}
|
||||
authorAssociation
|
||||
editor {
|
||||
@ -1094,7 +1093,6 @@ class GitHubPR:
|
||||
body_text=node["bodyText"],
|
||||
created_at=node["createdAt"] if "createdAt" in node else "",
|
||||
author_login=node["author"]["login"],
|
||||
author_url=node["author"].get("url", None),
|
||||
author_association=node["authorAssociation"],
|
||||
editor_login=editor["login"] if editor else None,
|
||||
database_id=node["databaseId"],
|
||||
@ -2031,11 +2029,6 @@ def validate_revert(
|
||||
# For some reason, one can not be a member of private repo, only CONTRIBUTOR
|
||||
if pr.is_base_repo_private():
|
||||
allowed_reverters.append("CONTRIBUTOR")
|
||||
# Special case the pytorch-auto-revert app, whose does not have association
|
||||
# But should be able to issue revert command
|
||||
if comment.author_url == "https://github.com/apps/pytorch-auto-revert":
|
||||
allowed_reverters.append("NONE")
|
||||
|
||||
if author_association not in allowed_reverters:
|
||||
raise PostCommentError(
|
||||
f"Will not revert as @{author_login} is not one of "
|
||||
|
||||
2
.github/workflows/h100-distributed.yml
vendored
2
.github/workflows/h100-distributed.yml
vendored
@ -37,7 +37,7 @@ jobs:
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: "linux.c7i.12xlarge"
|
||||
runner: "linux.12xlarge"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '9.0'
|
||||
|
||||
@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: 15 0 * * 1-6
|
||||
- cron: 15 0,12 * * 1-6
|
||||
- cron: 0 7 * * 0
|
||||
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
|
||||
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
|
||||
|
||||
@ -63,7 +63,6 @@ jobs:
|
||||
# Same as the build job
|
||||
python-version: 3.12.7
|
||||
test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
|
||||
timeout-minutes: 300
|
||||
disable-monitor: false
|
||||
monitor-log-interval: 15
|
||||
monitor-data-collect-interval: 4
|
||||
|
||||
26
.github/workflows/rocm.yml
vendored
26
.github/workflows/rocm.yml
vendored
@ -59,29 +59,3 @@ jobs:
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-gfx1100-test:
|
||||
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3_10-gfx1100
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
]}
|
||||
tests-to-include: >
|
||||
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
|
||||
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
|
||||
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
|
||||
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
|
||||
inductor/test_flex_attention inductor/test_max_autotune
|
||||
secrets: inherit
|
||||
|
||||
4
.github/workflows/xpu.yml
vendored
4
.github/workflows/xpu.yml
vendored
@ -35,7 +35,7 @@ jobs:
|
||||
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
|
||||
build-environment: linux-jammy-xpu-n-1-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
|
||||
runner: linux.c7i.12xlarge
|
||||
runner: linux.12xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
@ -56,7 +56,7 @@ jobs:
|
||||
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
|
||||
build-environment: linux-jammy-xpu-n-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
|
||||
runner: linux.c7i.12xlarge
|
||||
runner: linux.12xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@ -88,7 +88,7 @@ torch_compile_debug/
|
||||
# Listed manually because some files in this directory are not generated
|
||||
torch/testing/_internal/generated/annotated_fn_args.py
|
||||
torch/testing/_internal/data/*.pt
|
||||
torch/headeronly/version.h
|
||||
torch/csrc/api/include/torch/version.h
|
||||
torch/csrc/cudnn/cuDNN.cpp
|
||||
torch/csrc/generated
|
||||
torch/csrc/generic/TensorMethods.cpp
|
||||
|
||||
@ -28,7 +28,7 @@ exclude_patterns = [
|
||||
'torch/lib/**',
|
||||
'venv/**',
|
||||
'**/*.pyi',
|
||||
"tools/experimental/torchfuzz/**",
|
||||
"tools/experimental/dynamic_shapes/torchfuzz/**",
|
||||
'tools/test/test_selective_build.py',
|
||||
]
|
||||
command = [
|
||||
@ -198,7 +198,7 @@ exclude_patterns = [
|
||||
'tools/test/gen_operators_yaml_test.py',
|
||||
'tools/test/gen_oplist_test.py',
|
||||
'tools/test/test_selective_build.py',
|
||||
'tools/experimental/torchfuzz/**',
|
||||
'tools/experimental/dynamic_shapes/torchfuzz/**',
|
||||
]
|
||||
command = [
|
||||
'python3',
|
||||
|
||||
@ -13,9 +13,6 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt
|
||||
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
|
||||
load("//:tools/bazel.bzl", "rules")
|
||||
|
||||
# Export files for use by torch/headeronly (where version.h generation now lives)
|
||||
exports_files(["version.txt"])
|
||||
|
||||
define_targets(rules = rules)
|
||||
|
||||
COMMON_COPTS = [
|
||||
@ -693,9 +690,7 @@ cc_library(
|
||||
"torch/csrc/*/generated/*.h",
|
||||
"torch/csrc/jit/serialization/mobile_bytecode_generated.h",
|
||||
] + torch_cuda_headers,
|
||||
) + GENERATED_AUTOGRAD_CPP + [
|
||||
"//torch/headeronly:version_h",
|
||||
],
|
||||
) + GENERATED_AUTOGRAD_CPP + [":version_h"],
|
||||
includes = [
|
||||
"third_party/kineto/libkineto/include",
|
||||
"torch/csrc",
|
||||
|
||||
@ -28,19 +28,4 @@ inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) {
|
||||
return stream << BlasBackendToString(backend);
|
||||
}
|
||||
|
||||
namespace blas {
|
||||
|
||||
enum class ScalingType : std::uint8_t {
|
||||
TensorWise, // fp32 scales
|
||||
RowWise, // fp32 scales
|
||||
BlockWise1x16, // fp8_e4m3fn scales
|
||||
BlockWise1x32, // fp8_e8m0fnu scales
|
||||
BlockWise1x128, // fp32 scales
|
||||
BlockWise128x128, // fp32 scales
|
||||
};
|
||||
|
||||
enum class SwizzleType : std::uint8_t { NO_SWIZZLE = 0, SWIZZLE_32_4_4 = 1 };
|
||||
|
||||
} // namespace blas
|
||||
|
||||
} // namespace at
|
||||
|
||||
@ -144,7 +144,8 @@ inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
|
||||
inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
|
||||
checkDeviceType("CPU_tensor_apply", tensors, kCPU);
|
||||
checkLayout("CPU_tensor_apply", tensors, kStrided);
|
||||
TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors));
|
||||
if (!_all_equal_numel(tensors))
|
||||
TORCH_CHECK(false, _all_equal_numel_error(tensors));
|
||||
// An empty tensor has no elements
|
||||
for (auto& t : tensors)
|
||||
if (t.numel() == 0)
|
||||
|
||||
@ -587,33 +587,20 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
|
||||
rocm_fa_preferred_backend = b;
|
||||
}
|
||||
|
||||
CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const {
|
||||
bool Context::allowFP16ReductionCuBLAS() const {
|
||||
return allow_fp16_reduction_cublas;
|
||||
}
|
||||
|
||||
CuBLASReductionOption inline get_reduction_option(bool allow_reduced_precision, bool allow_splitk) {
|
||||
TORCH_CHECK(
|
||||
!(allow_reduced_precision && !allow_splitk),
|
||||
"allow_splitk=False is not supported when reduced precision reductions are enabled");
|
||||
if (allow_reduced_precision) {
|
||||
return CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
|
||||
} else if (allow_splitk) {
|
||||
return CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK;
|
||||
} else {
|
||||
return CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK;
|
||||
}
|
||||
void Context::setAllowFP16ReductionCuBLAS(bool b) {
|
||||
allow_fp16_reduction_cublas = b;
|
||||
}
|
||||
|
||||
void Context::setAllowFP16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
|
||||
allow_fp16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
|
||||
}
|
||||
|
||||
CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const {
|
||||
bool Context::allowBF16ReductionCuBLAS() const {
|
||||
return allow_bf16_reduction_cublas;
|
||||
}
|
||||
|
||||
void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
|
||||
allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
|
||||
void Context::setAllowBF16ReductionCuBLAS(bool b) {
|
||||
allow_bf16_reduction_cublas = b;
|
||||
}
|
||||
|
||||
bool Context::allowFP16AccumulationCuBLAS() const {
|
||||
|
||||
@ -38,12 +38,6 @@ namespace at {
|
||||
class Tensor;
|
||||
|
||||
enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };
|
||||
|
||||
enum class CuBLASReductionOption : uint8_t {
|
||||
AllowReducedPrecisionWithSplitK = 0,
|
||||
DisallowReducedPrecisionAllowSplitK = 1,
|
||||
DisallowReducedPrecisionDisallowSplitK = 2,
|
||||
};
|
||||
enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN };
|
||||
enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL };
|
||||
enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 };
|
||||
@ -226,15 +220,15 @@ class TORCH_API Context {
|
||||
bool userEnabledMkldnn() const;
|
||||
void setUserEnabledMkldnn(bool e);
|
||||
bool benchmarkCuDNN() const;
|
||||
void setBenchmarkCuDNN(bool /*b*/);
|
||||
void setBenchmarkCuDNN(bool);
|
||||
int benchmarkLimitCuDNN() const;
|
||||
void setBenchmarkLimitCuDNN(int /*b*/);
|
||||
void setBenchmarkLimitCuDNN(int);
|
||||
bool immediateMiopen() const;
|
||||
void setImmediateMiopen(bool /*b*/);
|
||||
void setImmediateMiopen(bool);
|
||||
bool deterministicCuDNN() const;
|
||||
void setDeterministicCuDNN(bool /*b*/);
|
||||
void setDeterministicCuDNN(bool);
|
||||
bool deterministicMkldnn() const;
|
||||
void setDeterministicMkldnn(bool /*b*/);
|
||||
void setDeterministicMkldnn(bool);
|
||||
bool userEnabledNNPACK() const;
|
||||
void setUserEnabledNNPACK(bool e);
|
||||
|
||||
@ -252,32 +246,32 @@ class TORCH_API Context {
|
||||
void setSDPPriorityOrder(const std::vector<int64_t>& order);
|
||||
std::array<at::SDPBackend, at::num_sdp_backends> sDPPriorityOrder();
|
||||
|
||||
void setSDPUseFlash(bool /*e*/);
|
||||
void setSDPUseFlash(bool);
|
||||
bool userEnabledFlashSDP() const;
|
||||
|
||||
void setSDPUseMemEfficient(bool /*e*/);
|
||||
void setSDPUseMemEfficient(bool);
|
||||
bool userEnabledMemEfficientSDP() const;
|
||||
|
||||
void setSDPUseMath(bool /*e*/);
|
||||
void setSDPUseMath(bool);
|
||||
bool userEnabledMathSDP() const;
|
||||
|
||||
void setSDPUseCuDNN(bool /*e*/);
|
||||
void setSDPUseCuDNN(bool);
|
||||
bool userEnabledCuDNNSDP() const;
|
||||
|
||||
void setAllowFP16BF16ReductionMathSDP(bool /*e*/);
|
||||
void setAllowFP16BF16ReductionMathSDP(bool);
|
||||
bool allowFP16BF16ReductionMathSDP() const;
|
||||
|
||||
void setSDPUseOverrideable(bool /*e*/);
|
||||
void setSDPUseOverrideable(bool);
|
||||
bool userEnabledOverrideableSDP() const;
|
||||
|
||||
at::LinalgBackend linalgPreferredBackend() const;
|
||||
void setLinalgPreferredBackend(at::LinalgBackend /*b*/);
|
||||
void setLinalgPreferredBackend(at::LinalgBackend);
|
||||
|
||||
at::BlasBackend blasPreferredBackend();
|
||||
void setBlasPreferredBackend(at::BlasBackend /*b*/);
|
||||
void setBlasPreferredBackend(at::BlasBackend);
|
||||
|
||||
at::ROCmFABackend getROCmFAPreferredBackend();
|
||||
void setROCmFAPreferredBackend(at::ROCmFABackend /*b*/);
|
||||
void setROCmFAPreferredBackend(at::ROCmFABackend);
|
||||
|
||||
// Note [Enabling Deterministic Operations]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -310,9 +304,9 @@ class TORCH_API Context {
|
||||
|
||||
bool deterministicAlgorithms() const;
|
||||
bool deterministicAlgorithmsWarnOnly() const;
|
||||
void setDeterministicAlgorithms(bool /*b*/, bool /*warn_only*/);
|
||||
void setDeterministicAlgorithms(bool, bool);
|
||||
bool deterministicFillUninitializedMemory() const;
|
||||
void setDeterministicFillUninitializedMemory(bool /*b*/);
|
||||
void setDeterministicFillUninitializedMemory(bool);
|
||||
|
||||
// Note [Writing Nondeterministic Operations]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -356,23 +350,19 @@ class TORCH_API Context {
|
||||
Float32Op op,
|
||||
Float32Precision p);
|
||||
bool allowTF32CuDNN(std::optional<Float32Op> op = std::nullopt) const;
|
||||
void setAllowTF32CuDNN(bool /*b*/);
|
||||
void setAllowTF32CuDNN(bool);
|
||||
bool allowTF32OneDNN() const;
|
||||
void setAllowTF32OneDNN(bool /*b*/);
|
||||
void setAllowTF32OneDNN(bool);
|
||||
bool allowTF32CuBLAS() const;
|
||||
void setAllowTF32CuBLAS(bool /*b*/);
|
||||
void setAllowTF32CuBLAS(bool);
|
||||
Float32MatmulPrecision float32MatmulPrecision() const;
|
||||
Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
|
||||
CuBLASReductionOption allowFP16ReductionCuBLAS() const;
|
||||
void setAllowFP16ReductionCuBLAS(
|
||||
bool allow_reduced_precision,
|
||||
bool allow_splitk = true);
|
||||
CuBLASReductionOption allowBF16ReductionCuBLAS() const;
|
||||
void setAllowBF16ReductionCuBLAS(
|
||||
bool allow_reduced_precision,
|
||||
bool allow_splitk = true);
|
||||
bool allowFP16ReductionCuBLAS() const;
|
||||
void setAllowFP16ReductionCuBLAS(bool);
|
||||
bool allowBF16ReductionCuBLAS() const;
|
||||
void setAllowBF16ReductionCuBLAS(bool);
|
||||
bool allowFP16AccumulationCuBLAS() const;
|
||||
void setAllowFP16AccumulationCuBLAS(bool /*b*/);
|
||||
void setAllowFP16AccumulationCuBLAS(bool);
|
||||
|
||||
// Matmuls can use a so-called "persistent" kernel which launches one CUDA
|
||||
// block for each SM on the GPU, and each block then iterates over multiple
|
||||
@ -384,7 +374,7 @@ class TORCH_API Context {
|
||||
// to make matmuls target only a subset of the SMs, so they can fully schedule
|
||||
// even next to a comms kernel, and only be a few percent slower.
|
||||
std::optional<int32_t> _SMCarveout_EXPERIMENTAL() const;
|
||||
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t> /*c*/);
|
||||
void _setSMCarveout_EXPERIMENTAL(std::optional<int32_t>);
|
||||
|
||||
at::QEngine qEngine() const;
|
||||
void setQEngine(at::QEngine e);
|
||||
@ -405,7 +395,7 @@ class TORCH_API Context {
|
||||
void setDefaultMobileCPUAllocator();
|
||||
void unsetDefaultMobileCPUAllocator();
|
||||
bool allowFP16ReductionCPU() const;
|
||||
void setAllowFP16ReductionCPU(bool /*b*/);
|
||||
void setAllowFP16ReductionCPU(bool);
|
||||
|
||||
// Preserved for BC
|
||||
void lazyInitCUDA() {
|
||||
@ -462,10 +452,8 @@ class TORCH_API Context {
|
||||
: at::Float32MatmulPrecision::HIGHEST;
|
||||
int benchmark_limit_cudnn = 10;
|
||||
bool allow_tf32_cudnn = true;
|
||||
CuBLASReductionOption allow_fp16_reduction_cublas =
|
||||
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
|
||||
CuBLASReductionOption allow_bf16_reduction_cublas =
|
||||
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
|
||||
bool allow_fp16_reduction_cublas = true;
|
||||
bool allow_bf16_reduction_cublas = true;
|
||||
bool allow_fp16_accumulation_cublas = false;
|
||||
std::optional<int32_t> sm_carveout = std::nullopt;
|
||||
bool enabled_mkldnn = true;
|
||||
|
||||
@ -62,7 +62,7 @@ constexpr const char* unknown_eventname = "eventname not specified";
|
||||
#endif
|
||||
} // namespace (anonymous)
|
||||
|
||||
MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, int flags, size_t size)
|
||||
MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags, size_t size)
|
||||
: filename_(filename.empty() ? unknown_filename : filename)
|
||||
, size_(0) // to be filled later
|
||||
#ifdef _WIN32
|
||||
@ -494,7 +494,7 @@ RefcountedMapAllocator::RefcountedMapAllocator(const char *filename, int flags,
|
||||
|
||||
initializeAlloc();
|
||||
}
|
||||
RefcountedMapAllocator::RefcountedMapAllocator(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size)
|
||||
RefcountedMapAllocator::RefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size)
|
||||
: RefcountedMapAllocatorArgCheck(flags)
|
||||
, MapAllocator(WITH_FD, filename, flags, fd, size + map_alloc_alignment) {
|
||||
|
||||
@ -614,7 +614,7 @@ at::DataPtr MapAllocator::makeDataPtr(std::string_view filename, int flags, size
|
||||
return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU};
|
||||
}
|
||||
|
||||
at::DataPtr MapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
|
||||
at::DataPtr MapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
|
||||
auto* context = new MapAllocator(WITH_FD, filename, fd, flags, size);
|
||||
if (actual_size_out) *actual_size_out = context->size();
|
||||
return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU};
|
||||
@ -626,7 +626,7 @@ at::DataPtr RefcountedMapAllocator::makeDataPtr(const char *filename, int flags,
|
||||
return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU};
|
||||
}
|
||||
|
||||
at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
|
||||
at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) {
|
||||
auto* context = new RefcountedMapAllocator(WITH_FD, filename, fd, flags, size);
|
||||
if (actual_size_out) *actual_size_out = context->size() - map_alloc_alignment;
|
||||
return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU};
|
||||
|
||||
@ -25,7 +25,7 @@ class TORCH_API MapAllocator {
|
||||
public:
|
||||
MapAllocator(std::string_view filename, int flags, size_t size);
|
||||
MapAllocator(
|
||||
WithFd /*unused*/,
|
||||
WithFd,
|
||||
std::string_view filename,
|
||||
int fd,
|
||||
int flags,
|
||||
@ -59,14 +59,14 @@ class TORCH_API MapAllocator {
|
||||
return flags_;
|
||||
}
|
||||
|
||||
static MapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/);
|
||||
static MapAllocator* fromDataPtr(const at::DataPtr&);
|
||||
static at::DataPtr makeDataPtr(
|
||||
std::string_view filename,
|
||||
int flags,
|
||||
size_t size,
|
||||
size_t* actual_size_out);
|
||||
static at::DataPtr makeDataPtr(
|
||||
WithFd /*unused*/,
|
||||
WithFd,
|
||||
const char* filename,
|
||||
int fd,
|
||||
int flags,
|
||||
@ -105,13 +105,13 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
|
||||
public:
|
||||
RefcountedMapAllocator(const char* filename, int flags, size_t size);
|
||||
RefcountedMapAllocator(
|
||||
WithFd /*unused*/,
|
||||
WithFd,
|
||||
const char* filename,
|
||||
int fd,
|
||||
int flags,
|
||||
size_t size);
|
||||
|
||||
static RefcountedMapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/);
|
||||
static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&);
|
||||
RefcountedMapAllocator(const RefcountedMapAllocator&) = delete;
|
||||
RefcountedMapAllocator(RefcountedMapAllocator&&) = delete;
|
||||
RefcountedMapAllocator& operator=(const RefcountedMapAllocator&) = delete;
|
||||
@ -122,7 +122,7 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
|
||||
size_t size,
|
||||
size_t* actual_size_out);
|
||||
static at::DataPtr makeDataPtr(
|
||||
WithFd /*unused*/,
|
||||
WithFd,
|
||||
const char* filename,
|
||||
int fd,
|
||||
int flags,
|
||||
|
||||
@ -273,7 +273,7 @@ c10::SymInt NestedTensorImpl::sym_numel_custom() const {
|
||||
return NestedTensorImpl::numel_custom();
|
||||
}
|
||||
|
||||
c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const {
|
||||
c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat) const {
|
||||
return nested_tensor_impl_is_contiguous(this);
|
||||
}
|
||||
IntArrayRef NestedTensorImpl::sizes_custom() const {
|
||||
|
||||
@ -115,8 +115,7 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
|
||||
// with real implementations
|
||||
int64_t numel_custom() const override;
|
||||
c10::SymInt sym_numel_custom() const override;
|
||||
c10::SymBool sym_is_contiguous_custom(
|
||||
MemoryFormat /*memory_format*/) const override;
|
||||
c10::SymBool sym_is_contiguous_custom(MemoryFormat) const override;
|
||||
int64_t size_custom(int64_t d) const override {
|
||||
return this->size(d);
|
||||
}
|
||||
|
||||
@ -14,7 +14,7 @@ inline int64_t divup(int64_t x, int64_t y) {
|
||||
TORCH_API void init_num_threads();
|
||||
|
||||
// Sets the number of threads to be used in parallel region
|
||||
TORCH_API void set_num_threads(int /*nthreads*/);
|
||||
TORCH_API void set_num_threads(int);
|
||||
|
||||
// Returns the maximum number of threads that may be used in a parallel region
|
||||
TORCH_API int get_num_threads();
|
||||
@ -37,7 +37,7 @@ inline void lazy_init_num_threads() {
|
||||
}
|
||||
}
|
||||
|
||||
TORCH_API void set_thread_num(int /*id*/);
|
||||
TORCH_API void set_thread_num(int);
|
||||
|
||||
class TORCH_API ThreadIdGuard {
|
||||
public:
|
||||
@ -130,7 +130,7 @@ inline scalar_t parallel_reduce(
|
||||
TORCH_API std::string get_parallel_info();
|
||||
|
||||
// Sets number of threads used for inter-op parallelism
|
||||
TORCH_API void set_num_interop_threads(int /*nthreads*/);
|
||||
TORCH_API void set_num_interop_threads(int);
|
||||
|
||||
// Returns the number of threads used for inter-op parallelism
|
||||
TORCH_API size_t get_num_interop_threads();
|
||||
|
||||
@ -252,7 +252,7 @@ void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) {
|
||||
void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) {
|
||||
TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset.");
|
||||
}
|
||||
c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const {
|
||||
c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat) const {
|
||||
TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous");
|
||||
}
|
||||
} // namespace at
|
||||
|
||||
@ -32,10 +32,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
|
||||
|
||||
public:
|
||||
explicit SparseCsrTensorImpl(
|
||||
at::DispatchKeySet /*key_set*/,
|
||||
at::DispatchKeySet,
|
||||
at::Device device,
|
||||
Layout layout,
|
||||
const caffe2::TypeMeta /*data_type*/);
|
||||
const caffe2::TypeMeta);
|
||||
|
||||
void resize_(int64_t nnz, IntArrayRef size);
|
||||
void resize_and_clear_(
|
||||
@ -86,8 +86,7 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
|
||||
protected:
|
||||
IntArrayRef strides_custom() const override;
|
||||
SymIntArrayRef sym_strides_custom() const override;
|
||||
SymBool sym_is_contiguous_custom(
|
||||
MemoryFormat /*memory_format*/) const override;
|
||||
SymBool sym_is_contiguous_custom(MemoryFormat) const override;
|
||||
|
||||
public:
|
||||
void set_size(int64_t dim, int64_t new_size) override;
|
||||
|
||||
@ -46,9 +46,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
|
||||
|
||||
public:
|
||||
// Public for now...
|
||||
explicit SparseTensorImpl(
|
||||
at::DispatchKeySet /*key_set*/,
|
||||
const caffe2::TypeMeta /*data_type*/);
|
||||
explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta);
|
||||
|
||||
void release_resources() override;
|
||||
|
||||
@ -231,14 +229,14 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
|
||||
}
|
||||
|
||||
void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef<int64_t> size) {
|
||||
_resize_(sparse_dim, dense_dim, size);
|
||||
return _resize_(sparse_dim, dense_dim, size);
|
||||
}
|
||||
|
||||
void resize_(
|
||||
int64_t sparse_dim,
|
||||
int64_t dense_dim,
|
||||
ArrayRef<c10::SymInt> size) {
|
||||
_resize_(sparse_dim, dense_dim, size);
|
||||
return _resize_(sparse_dim, dense_dim, size);
|
||||
}
|
||||
|
||||
// NOTE: this function will resize the sparse tensor and also set `indices`
|
||||
@ -386,8 +384,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
|
||||
|
||||
private:
|
||||
explicit SparseTensorImpl(
|
||||
at::DispatchKeySet /*key_set*/,
|
||||
const caffe2::TypeMeta /*data_type*/,
|
||||
at::DispatchKeySet,
|
||||
const caffe2::TypeMeta,
|
||||
at::Tensor indices,
|
||||
at::Tensor values);
|
||||
|
||||
|
||||
@ -59,7 +59,7 @@ static inline void set_item(const Tensor& self, ArrayRef<TensorIndex> indices, c
|
||||
}
|
||||
}
|
||||
|
||||
set_item(self, indices, value);
|
||||
return set_item(self, indices, value);
|
||||
}
|
||||
|
||||
} // namespace indexing
|
||||
|
||||
@ -112,10 +112,10 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice);
|
||||
// `torch.tensor([1, 2])`) | `torch::tensor({1, 2})`
|
||||
struct TORCH_API TensorIndex final {
|
||||
// Case 1: `at::indexing::None`
|
||||
TensorIndex(std::nullopt_t /*unused*/) : type_(TensorIndexType::None) {}
|
||||
TensorIndex(std::nullopt_t) : type_(TensorIndexType::None) {}
|
||||
|
||||
// Case 2: "..." / `at::indexing::Ellipsis`
|
||||
TensorIndex(at::indexing::EllipsisIndexType /*unused*/)
|
||||
TensorIndex(at::indexing::EllipsisIndexType)
|
||||
: type_(TensorIndexType::Ellipsis) {}
|
||||
TensorIndex(const char* str) : TensorIndex(at::indexing::Ellipsis) {
|
||||
TORCH_CHECK_VALUE(
|
||||
|
||||
@ -765,8 +765,7 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) {
|
||||
if (numel == 0) {
|
||||
return;
|
||||
} else if (numel < grain_size || at::get_num_threads() == 1) {
|
||||
serial_for_each(loop, {0, numel});
|
||||
return;
|
||||
return serial_for_each(loop, {0, numel});
|
||||
} else {
|
||||
at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
|
||||
serial_for_each(loop, {begin, end});
|
||||
|
||||
@ -250,7 +250,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
|
||||
using PtrVector = SmallVector<char*, 4>;
|
||||
using StrideVector = SmallVector<int64_t, 6>;
|
||||
|
||||
void build(TensorIteratorConfig& /*config*/);
|
||||
void build(TensorIteratorConfig&);
|
||||
|
||||
// The inner-loop function operates on the fastest moving dimension. It
|
||||
// implements element-wise operations in terms of 1-d strided tensors.
|
||||
@ -618,20 +618,20 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
|
||||
#undef TORCH_DISALLOW_TEMPORARIES
|
||||
protected:
|
||||
// Mutable reference as it moves tensors out of TensorIteratorConfig
|
||||
void populate_operands(TensorIteratorConfig& /*config*/);
|
||||
void populate_operands(TensorIteratorConfig&);
|
||||
void mark_outputs();
|
||||
void mark_resize_outputs(const TensorIteratorConfig& /*config*/);
|
||||
void compute_mem_overlaps(const TensorIteratorConfig& /*config*/);
|
||||
void compute_shape(const TensorIteratorConfig& /*config*/);
|
||||
void compute_strides(const TensorIteratorConfig& /*config*/);
|
||||
void mark_resize_outputs(const TensorIteratorConfig&);
|
||||
void compute_mem_overlaps(const TensorIteratorConfig&);
|
||||
void compute_shape(const TensorIteratorConfig&);
|
||||
void compute_strides(const TensorIteratorConfig&);
|
||||
void reorder_dimensions();
|
||||
void permute_dimensions(IntArrayRef perm);
|
||||
void compute_types(const TensorIteratorConfig& /*config*/);
|
||||
void compute_types(const TensorIteratorConfig&);
|
||||
ScalarType compute_common_dtype();
|
||||
void allocate_or_resize_outputs();
|
||||
bool fast_set_up(const TensorIteratorConfig& /*config*/);
|
||||
FastSetupType compute_fast_setup_type(const TensorIteratorConfig& /*config*/);
|
||||
void compute_names(const TensorIteratorConfig& /*config*/);
|
||||
bool fast_set_up(const TensorIteratorConfig&);
|
||||
FastSetupType compute_fast_setup_type(const TensorIteratorConfig&);
|
||||
void compute_names(const TensorIteratorConfig&);
|
||||
void propagate_names_to_outputs();
|
||||
void coalesce_dimensions();
|
||||
|
||||
|
||||
@ -20,7 +20,7 @@
|
||||
|
||||
namespace at {
|
||||
|
||||
TORCH_API int _crash_if_asan(int /*arg*/);
|
||||
TORCH_API int _crash_if_asan(int);
|
||||
|
||||
// Converts a TensorList (i.e. ArrayRef<Tensor> to vector of TensorImpl*)
|
||||
// NB: This is ONLY used by legacy TH bindings, and ONLY used by cat.
|
||||
|
||||
@ -148,7 +148,7 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_
|
||||
Banned functions
|
||||
*******************************/
|
||||
|
||||
static Tensor binary_cross_entropy_banned(const Tensor & /*unused*/, const Tensor & /*unused*/, const std::optional<Tensor>& /*unused*/, int64_t /*unused*/) {
|
||||
static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional<Tensor>&, int64_t) {
|
||||
TORCH_CHECK(false, "torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n"
|
||||
"Many models use a sigmoid layer right before the binary cross entropy layer.\n"
|
||||
"In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n"
|
||||
|
||||
@ -49,7 +49,7 @@ static void check_unique_names(DimnameList names) {
|
||||
}
|
||||
|
||||
void check_names_valid_for(const TensorBase& tensor, DimnameList names) {
|
||||
impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
|
||||
return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
|
||||
}
|
||||
|
||||
void check_names_valid_for(size_t tensor_dim, DimnameList names) {
|
||||
|
||||
@ -27,11 +27,11 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
|
||||
HasNonWildcard
|
||||
};
|
||||
|
||||
explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, DimnameList names)
|
||||
explicit NamedTensorMeta(HAS_NON_WILDCARD, DimnameList names)
|
||||
: names_(names.vec()) {
|
||||
check_invariants();
|
||||
}
|
||||
explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, std::vector<Dimname>&& names)
|
||||
explicit NamedTensorMeta(HAS_NON_WILDCARD, std::vector<Dimname>&& names)
|
||||
: names_(std::move(names)) {
|
||||
check_invariants();
|
||||
}
|
||||
@ -52,13 +52,13 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
|
||||
std::any_of(names_.begin(), names_.end(), [](const Dimname& n) { return !n.isWildcard(); }));
|
||||
}
|
||||
|
||||
void set_names(HAS_NON_WILDCARD /*unused*/, DimnameList new_names) {
|
||||
void set_names(HAS_NON_WILDCARD, DimnameList new_names) {
|
||||
TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
|
||||
std::copy(new_names.begin(), new_names.end(), names_.begin());
|
||||
check_invariants();
|
||||
}
|
||||
|
||||
void set_names(HAS_NON_WILDCARD /*unused*/, std::vector<Dimname>&& new_names) {
|
||||
void set_names(HAS_NON_WILDCARD, std::vector<Dimname>&& new_names) {
|
||||
TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
|
||||
names_ = std::move(new_names);
|
||||
check_invariants();
|
||||
|
||||
@ -13,7 +13,7 @@ class TORCH_API PythonOpRegistrationTrampoline final {
|
||||
public:
|
||||
// Returns true if you successfully registered yourself (that means
|
||||
// you are in the hot seat for doing the operator registrations!)
|
||||
static bool registerInterpreter(c10::impl::PyInterpreter* /*interp*/);
|
||||
static bool registerInterpreter(c10::impl::PyInterpreter*);
|
||||
|
||||
// Returns nullptr if no interpreter has been registered yet.
|
||||
static c10::impl::PyInterpreter* getInterpreter();
|
||||
|
||||
@ -138,7 +138,7 @@ void Tensor::_backward(TensorList inputs,
|
||||
const std::optional<Tensor>& gradient,
|
||||
std::optional<bool> keep_graph,
|
||||
bool create_graph) const {
|
||||
impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
|
||||
return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
|
||||
}
|
||||
|
||||
const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const {
|
||||
@ -173,12 +173,4 @@ unsigned TensorBase::_register_hook(std::function<TensorBase(const TensorBase&)>
|
||||
return impl::GetVariableHooks()->_register_hook(*this, std::move(hook));
|
||||
}
|
||||
|
||||
std::optional<ScalarType> TensorBase::grad_dtype() const {
|
||||
return impl::GetVariableHooks()->grad_dtype(*this);
|
||||
}
|
||||
|
||||
void TensorBase::set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const {
|
||||
return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype);
|
||||
}
|
||||
|
||||
} // namespace at
|
||||
|
||||
@ -100,7 +100,7 @@ class TORCH_API TensorBase {
|
||||
// Create a Tensor with a +0 reference count. Special care must be
|
||||
// taken to avoid decrementing this reference count at destruction
|
||||
// time. Intended to support MaybeOwnedTraits<Tensor>.
|
||||
explicit TensorBase(unsafe_borrow_t /*unused*/, const TensorBase& rhs)
|
||||
explicit TensorBase(unsafe_borrow_t, const TensorBase& rhs)
|
||||
: impl_(c10::intrusive_ptr<at::TensorImpl, UndefinedTensorImpl>(rhs.impl_.get(), c10::raw::DontIncreaseRefcount{})) {}
|
||||
friend MaybeOwnedTraits<TensorBase>;
|
||||
|
||||
@ -930,10 +930,6 @@ public:
|
||||
|
||||
const TensorBase& requires_grad_(bool _requires_grad=true) const;
|
||||
|
||||
std::optional<ScalarType> grad_dtype() const;
|
||||
|
||||
void set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const;
|
||||
|
||||
// View Variables
|
||||
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -954,7 +950,7 @@ protected:
|
||||
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
|
||||
|
||||
private:
|
||||
TensorBase __dispatch_contiguous(c10::MemoryFormat /*memory_format*/) const;
|
||||
TensorBase __dispatch_contiguous(c10::MemoryFormat) const;
|
||||
};
|
||||
|
||||
inline DeviceIndex get_device(const TensorBase& self) {
|
||||
|
||||
@ -68,8 +68,6 @@ struct TORCH_API VariableHooksInterface {
|
||||
const c10::OperatorHandle& op,
|
||||
c10::DispatchKeySet dispatch_keys,
|
||||
torch::jit::Stack* stack) const = 0;
|
||||
virtual std::optional<c10::ScalarType> grad_dtype(const TensorBase&) const = 0;
|
||||
virtual void set_grad_dtype(const TensorBase&, const std::optional<c10::ScalarType>&) const = 0;
|
||||
};
|
||||
|
||||
TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);
|
||||
|
||||
@ -18,10 +18,10 @@ class KernelFunction;
|
||||
// implementation notes; notably, this does NOT actually go through the
|
||||
// boxing/unboxing codepath.
|
||||
TORCH_API void fallthrough_kernel(
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& /*unused*/,
|
||||
DispatchKeySet /*unused*/,
|
||||
Stack* /*unused*/);
|
||||
OperatorKernel*,
|
||||
const OperatorHandle&,
|
||||
DispatchKeySet,
|
||||
Stack*);
|
||||
|
||||
// Note [Ambiguity in AutogradOther kernel]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -62,10 +62,10 @@ TORCH_API void fallthrough_kernel(
|
||||
// than arbitrarily pick one or the other, we just register a kernel that raises
|
||||
// an error and let the user decide how to proceed.
|
||||
TORCH_API void ambiguous_autogradother_kernel(
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& /*op*/,
|
||||
DispatchKeySet /*unused*/,
|
||||
Stack* /*unused*/);
|
||||
OperatorKernel*,
|
||||
const OperatorHandle&,
|
||||
DispatchKeySet,
|
||||
Stack*);
|
||||
|
||||
// Note [named_not_supported_kernel]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -75,10 +75,10 @@ TORCH_API void ambiguous_autogradother_kernel(
|
||||
// give a good error message in cases when boxing is not supported). When
|
||||
// boxing is universally supported this can be removed.
|
||||
[[noreturn]] TORCH_API void named_not_supported_kernel(
|
||||
OperatorKernel* /*unused*/,
|
||||
const OperatorHandle& /*op*/,
|
||||
DispatchKeySet /*unused*/,
|
||||
Stack* /*unused*/);
|
||||
OperatorKernel*,
|
||||
const OperatorHandle&,
|
||||
DispatchKeySet,
|
||||
Stack*);
|
||||
|
||||
/**
|
||||
* BoxedKernel is similar to a std::function storing a boxed kernel.
|
||||
@ -185,16 +185,16 @@ class TORCH_API BoxedKernel final {
|
||||
|
||||
template <BoxedKernelFunction* func>
|
||||
static void make_boxed_function(
|
||||
OperatorKernel* /*unused*/,
|
||||
OperatorKernel*,
|
||||
const OperatorHandle& opHandle,
|
||||
DispatchKeySet /*unused*/,
|
||||
DispatchKeySet,
|
||||
Stack* stack);
|
||||
|
||||
template <BoxedKernelFunction_withDispatchKeys* func>
|
||||
static void make_boxed_function(
|
||||
OperatorKernel* /*unused*/,
|
||||
OperatorKernel*,
|
||||
const OperatorHandle& opHandle,
|
||||
DispatchKeySet /*ks*/,
|
||||
DispatchKeySet,
|
||||
Stack* stack);
|
||||
|
||||
explicit BoxedKernel(
|
||||
|
||||
@ -11,9 +11,9 @@ inline BoxedKernel::BoxedKernel(
|
||||
|
||||
template <BoxedKernel::BoxedKernelFunction* func>
|
||||
inline void BoxedKernel::make_boxed_function(
|
||||
OperatorKernel* /*unused*/,
|
||||
OperatorKernel*,
|
||||
const OperatorHandle& opHandle,
|
||||
DispatchKeySet /*unused*/,
|
||||
DispatchKeySet,
|
||||
Stack* stack) {
|
||||
// Note that we're dropping the DispatchKeySet argument.
|
||||
// See Note [Plumbing Keys Through The Dispatcher 2] for details.
|
||||
@ -22,7 +22,7 @@ inline void BoxedKernel::make_boxed_function(
|
||||
|
||||
template <BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
|
||||
inline void BoxedKernel::make_boxed_function(
|
||||
OperatorKernel* /*unused*/,
|
||||
OperatorKernel*,
|
||||
const OperatorHandle& opHandle,
|
||||
DispatchKeySet ks,
|
||||
Stack* stack) {
|
||||
|
||||
@ -10,7 +10,7 @@ namespace c10 {
|
||||
// be handled specially. Its semantics is that it redispatches to the
|
||||
// *next* dispatch key that would have been processed, skipping the current
|
||||
// one.
|
||||
void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unused*/, DispatchKeySet /*unused*/, Stack* /*unused*/) {
|
||||
void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*) {
|
||||
TORCH_INTERNAL_ASSERT(0,
|
||||
"fallthrough_kernel was executed but it should have been short-circuited by the dispatcher. "
|
||||
"This could occur if you registered a fallthrough kernel as a override for a specific operator "
|
||||
@ -19,7 +19,7 @@ void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unus
|
||||
"let us know in the bug tracker.");
|
||||
}
|
||||
|
||||
void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) {
|
||||
void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) {
|
||||
TORCH_INTERNAL_ASSERT(0,
|
||||
op.operator_name(), " has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. "
|
||||
"This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering "
|
||||
@ -32,7 +32,7 @@ void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHa
|
||||
"\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n");
|
||||
}
|
||||
|
||||
void named_not_supported_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) {
|
||||
void named_not_supported_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) {
|
||||
// DO NOT LOOK AT STACK, YOU HAVE SHORT CIRCUITED BOXING
|
||||
// See Note [named_not_supported_kernel]
|
||||
TORCH_CHECK(0,
|
||||
|
||||
@ -229,7 +229,7 @@ class TORCH_API KernelFunction final {
|
||||
* &unboxed_func>();
|
||||
*/
|
||||
template <class FuncPtr, bool AllowLegacyTypes = false>
|
||||
static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/);
|
||||
static KernelFunction makeFromUnboxedFunction(FuncPtr);
|
||||
|
||||
/**
|
||||
* Create a KernelFunction from an unboxed function.
|
||||
@ -271,7 +271,7 @@ class TORCH_API KernelFunction final {
|
||||
|
||||
std::string dumpState() const;
|
||||
// For testing internal invariants only
|
||||
bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const;
|
||||
bool _equalsBoxedAndUnboxed(const KernelFunction&) const;
|
||||
|
||||
// Register a token to be invalidated when this KernelFunction is destroyed
|
||||
void registerToken(std::weak_ptr<KernelToken> token) const;
|
||||
|
||||
@ -131,7 +131,7 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(
|
||||
new (dest++) IValue(options.pinned_memory());
|
||||
}
|
||||
|
||||
inline void boxArgsToStack(IValue*& /*unused*/) {}
|
||||
inline void boxArgsToStack(IValue*&) {}
|
||||
|
||||
template <typename T, typename... Args>
|
||||
C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack(
|
||||
@ -185,7 +185,7 @@ struct PopResult<std::tuple<Types...>> final {
|
||||
template <size_t... indices>
|
||||
static Result pop_to_tuple_impl(
|
||||
Stack& stack,
|
||||
std::index_sequence<indices...> /*unused*/) {
|
||||
std::index_sequence<indices...>) {
|
||||
return std::make_tuple((std::move(stack[indices]).template to<Types>())...);
|
||||
}
|
||||
};
|
||||
|
||||
@ -561,7 +561,7 @@ struct wrap_kernel_functor_unboxed_<
|
||||
// doesn't use &&
|
||||
static ReturnType call(
|
||||
OperatorKernel* functor,
|
||||
DispatchKeySet /*unused*/,
|
||||
DispatchKeySet,
|
||||
ParameterTypes... args) {
|
||||
KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
|
||||
// Note [Plumbing Keys Through The Dispatcher 2]
|
||||
@ -629,8 +629,8 @@ call_functor_with_args_from_stack_(
|
||||
OperatorKernel* functor,
|
||||
DispatchKeySet dispatchKeySet,
|
||||
Stack* stack,
|
||||
std::index_sequence<ivalue_arg_indices...> /*unused*/,
|
||||
guts::typelist::typelist<ArgTypes...>* /*unused*/) {
|
||||
std::index_sequence<ivalue_arg_indices...>,
|
||||
guts::typelist::typelist<ArgTypes...>*) {
|
||||
(void)(stack); // when sizeof...(ivalue_arg_indices) == 0, this argument would
|
||||
// be unused and we have to silence the compiler warning.
|
||||
|
||||
@ -708,7 +708,7 @@ struct push_outputs<std::tuple<OutputTypes...>, AllowDeprecatedTypes> final {
|
||||
static void call_(
|
||||
std::tuple<OutputTypes...>&& output,
|
||||
Stack* stack,
|
||||
std::index_sequence<indices...> /*unused*/) {
|
||||
std::index_sequence<indices...>) {
|
||||
torch::jit::push(
|
||||
*stack,
|
||||
return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::call(
|
||||
@ -718,7 +718,7 @@ struct push_outputs<std::tuple<OutputTypes...>, AllowDeprecatedTypes> final {
|
||||
static void copy_(
|
||||
const std::tuple<OutputTypes...>& output,
|
||||
Stack* stack,
|
||||
std::index_sequence<indices...> /*unused*/) {
|
||||
std::index_sequence<indices...>) {
|
||||
torch::jit::push(
|
||||
*stack,
|
||||
return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::copy(
|
||||
@ -741,7 +741,7 @@ struct make_boxed_from_unboxed_functor final {
|
||||
|
||||
static void call(
|
||||
OperatorKernel* functor,
|
||||
const OperatorHandle& /*unused*/,
|
||||
const OperatorHandle&,
|
||||
DispatchKeySet dispatchKeySet,
|
||||
Stack* stack) {
|
||||
using ReturnType =
|
||||
|
||||
@ -63,13 +63,13 @@ struct BuiltinOpFunction : public Function {
|
||||
|
||||
bool call(
|
||||
Stack& stack,
|
||||
std::optional<size_t> /*unused*/,
|
||||
c10::function_ref<void(const Code&)> /*unused*/) override {
|
||||
std::optional<size_t>,
|
||||
c10::function_ref<void(const Code&)>) override {
|
||||
run(stack);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)> /*unused*/)
|
||||
bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)>)
|
||||
override {
|
||||
run(stack);
|
||||
return false;
|
||||
|
||||
@ -80,8 +80,7 @@ struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
|
||||
ts = ts | x.key_set();
|
||||
}
|
||||
}
|
||||
[[noreturn]] void operator()(
|
||||
at::ArrayRef<std::optional<at::Tensor>> /*unused*/) {
|
||||
[[noreturn]] void operator()(at::ArrayRef<std::optional<at::Tensor>>) {
|
||||
// Just checking that the handling of Tensor?[] didn't change.
|
||||
TORCH_INTERNAL_ASSERT(false);
|
||||
}
|
||||
@ -96,7 +95,7 @@ struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void operator()(const T& /*unused*/) {
|
||||
void operator()(const T&) {
|
||||
// do nothing
|
||||
}
|
||||
};
|
||||
|
||||
@ -496,7 +496,7 @@ class TORCH_API OperatorHandle {
|
||||
}
|
||||
|
||||
void checkInvariants() const {
|
||||
operatorDef_->op.checkInvariants();
|
||||
return operatorDef_->op.checkInvariants();
|
||||
}
|
||||
|
||||
c10::ArrayRef<at::Tag> getTags() const {
|
||||
@ -633,7 +633,7 @@ class TypedOperatorHandle<Return(Args...)> final : public OperatorHandle {
|
||||
|
||||
namespace detail {
|
||||
template <class... Args>
|
||||
inline void unused_arg_(const Args&... /*unused*/) {}
|
||||
inline void unused_arg_(const Args&...) {}
|
||||
|
||||
// CaptureKernelCall is intended to capture return values from Dispatcher
|
||||
// unboxed kernel calls. A record function may request to get outputs from the
|
||||
@ -932,7 +932,7 @@ inline void Dispatcher::redispatchBoxed(
|
||||
}
|
||||
#endif
|
||||
const auto& kernel = entry.lookup(dispatchKeySet);
|
||||
kernel.callBoxed(op, dispatchKeySet, stack);
|
||||
return kernel.callBoxed(op, dispatchKeySet, stack);
|
||||
}
|
||||
|
||||
} // namespace c10
|
||||
|
||||
@ -105,7 +105,7 @@ class TORCH_API OperatorEntry final {
|
||||
// versa that is an error. (Refcounting for the registrations is
|
||||
// handled in the OperatorHandle in Dispatcher)
|
||||
void registerSchema(
|
||||
FunctionSchema&& /*schema*/,
|
||||
FunctionSchema&&,
|
||||
std::string&& debug,
|
||||
std::vector<at::Tag> tags = {});
|
||||
void deregisterSchema();
|
||||
|
||||
@ -177,7 +177,7 @@ bool DynamicType::equals(const Type& rhs) const {
|
||||
return equals(*create(rhs));
|
||||
}
|
||||
|
||||
bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream* /*why_not*/) const {
|
||||
bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream*) const {
|
||||
auto other = create(rhs);
|
||||
if (tag_ == other->tag_) {
|
||||
if (equals(*other)) {
|
||||
@ -371,7 +371,7 @@ DynamicTypePtr ivalue::TupleTypeFactory<c10::DynamicType>::create(
|
||||
}
|
||||
|
||||
DynamicTypePtr ivalue::TupleTypeFactory<c10::DynamicType>::fallback(
|
||||
const Type& /*unused*/) {
|
||||
const Type&) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@ -138,8 +138,8 @@ class DynamicType : public SharedType {
|
||||
|
||||
struct Arguments {
|
||||
Arguments() = default;
|
||||
Arguments(c10::ArrayRef<TypePtr> /*args*/);
|
||||
Arguments(const std::vector<std::string_view>& /*names*/, c10::ArrayRef<TypePtr> /*args*/);
|
||||
Arguments(c10::ArrayRef<TypePtr>);
|
||||
Arguments(const std::vector<std::string_view>&, c10::ArrayRef<TypePtr>);
|
||||
std::vector<LabeledDynamicType> elems;
|
||||
};
|
||||
|
||||
@ -156,15 +156,15 @@ class DynamicType : public SharedType {
|
||||
static const TypeKind Kind = TypeKind::DynamicType;
|
||||
static TORCH_API DynamicTypePtr create(Type& ty);
|
||||
|
||||
explicit DynamicType(Tag /*tag*/, Arguments /*arguments*/);
|
||||
explicit DynamicType(Tag /*tag*/, std::string_view /*name*/, Arguments /*arguments*/);
|
||||
explicit DynamicType(Tag, Arguments);
|
||||
explicit DynamicType(Tag, std::string_view, Arguments);
|
||||
|
||||
DynamicType(DynamicType&& other) = delete;
|
||||
DynamicType(const DynamicType&) = delete;
|
||||
DynamicType& operator=(const DynamicType&) = delete;
|
||||
DynamicType& operator=(DynamicType&&) = delete;
|
||||
|
||||
TypePtr containedType(size_t /*i*/) const override;
|
||||
TypePtr containedType(size_t) const override;
|
||||
size_t containedTypeSize() const override;
|
||||
Tag tag() const {
|
||||
return tag_;
|
||||
|
||||
@ -96,15 +96,15 @@ struct TORCH_API Function {
|
||||
// Overload for server interpreter, a bailout size is needed for graph
|
||||
// executor.
|
||||
virtual bool call(
|
||||
Stack& /*unused*/,
|
||||
std::optional<size_t> /*unused*/,
|
||||
c10::function_ref<void(const Code&)> /*unused*/) {
|
||||
Stack&,
|
||||
std::optional<size_t>,
|
||||
c10::function_ref<void(const Code&)>) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Overload for mobile interpreter.
|
||||
virtual bool call(Stack& /*unused*/, c10::function_ref<void(const mobile::Code&)> /*unused*/) {
|
||||
virtual bool call(Stack&, c10::function_ref<void(const mobile::Code&)>) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -847,7 +847,7 @@ struct TORCH_API IValue final {
|
||||
IValue(std::optional<T> v);
|
||||
template <class T, enable_if_list_is_ivalue_constructible<T> = nullptr>
|
||||
IValue(c10::OptionalArrayRef<T> v);
|
||||
IValue(std::nullopt_t /*unused*/);
|
||||
IValue(std::nullopt_t);
|
||||
|
||||
// ClassType
|
||||
IValue(c10::intrusive_ptr<ivalue::Object> v);
|
||||
|
||||
@ -660,7 +660,7 @@ struct TORCH_API TupleTypeFactory<TupleType> {
|
||||
template <>
|
||||
struct TORCH_API TupleTypeFactory<c10::DynamicType> {
|
||||
static DynamicTypePtr create(const std::vector<TypePtr>& elemTypes);
|
||||
static DynamicTypePtr fallback(const Type& /*unused*/);
|
||||
static DynamicTypePtr fallback(const Type&);
|
||||
};
|
||||
|
||||
struct TORCH_API Tuple : c10::intrusive_ptr_target {
|
||||
@ -1682,7 +1682,7 @@ struct ivalue::EnumHolder : c10::intrusive_ptr_target {
|
||||
namespace detail {
|
||||
|
||||
struct _guarded_unsigned_long_unique_dummy final {
|
||||
_guarded_unsigned_long_unique_dummy(int64_t /*unused*/){}
|
||||
_guarded_unsigned_long_unique_dummy(int64_t){}
|
||||
};
|
||||
using _guarded_unsigned_long = std::conditional_t<
|
||||
std::is_same_v<unsigned long, uint32_t> ||
|
||||
@ -1776,7 +1776,7 @@ template <class Elem>
|
||||
// native_functions.yaml still return std::vector.
|
||||
// C10_DEPRECATED_MESSAGE("IValues based on std::vector<T> are potentially slow
|
||||
// and deprecated. Please use torch::List<T> instead.")
|
||||
std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>> /*unused*/) {
|
||||
std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>>) {
|
||||
// We need to do a deep copy of the vector because there might be other
|
||||
// references to this same IValue that also use the list. We can't just
|
||||
// move the elements out.
|
||||
@ -1826,18 +1826,18 @@ c10::intrusive_ptr<T> IValue::toCustomClass() const& {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T generic_to(IValue ivalue, _fake_type<T> /*unused*/) {
|
||||
T generic_to(IValue ivalue, _fake_type<T>) {
|
||||
using ElemType = typename std::remove_pointer<T>::type::element_type;
|
||||
return std::move(ivalue).template toCustomClass<ElemType>();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
tagged_capsule<T> generic_to(IValue ivalue, _fake_type<tagged_capsule<T>> /*unused*/) {
|
||||
tagged_capsule<T> generic_to(IValue ivalue, _fake_type<tagged_capsule<T>>) {
|
||||
return tagged_capsule<T>{std::move(ivalue)};
|
||||
}
|
||||
|
||||
template <typename Elem>
|
||||
c10::List<Elem> generic_to(IValue ivalue, _fake_type<c10::List<Elem>> /*unused*/) {
|
||||
c10::List<Elem> generic_to(IValue ivalue, _fake_type<c10::List<Elem>>) {
|
||||
return impl::toTypedList<Elem>(std::move(ivalue).toList());
|
||||
}
|
||||
|
||||
@ -1867,7 +1867,7 @@ std::vector<T> createVectorFromList(const c10::List<T>& impl) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
OptionalArray<T> generic_to(IValue ivalue, _fake_type<OptionalArray<T>> /*unused*/) {
|
||||
OptionalArray<T> generic_to(IValue ivalue, _fake_type<OptionalArray<T>>) {
|
||||
if (ivalue.isNone()) {
|
||||
return {};
|
||||
}
|
||||
@ -1880,8 +1880,8 @@ namespace detail {
|
||||
template <typename Elem, size_t... I>
|
||||
std::array<Elem, sizeof...(I)> generic_to_array(
|
||||
IValue ivalue,
|
||||
_fake_type<std::array<Elem, sizeof...(I)>> /*unused*/,
|
||||
std::index_sequence<I...> /*unused*/) {
|
||||
_fake_type<std::array<Elem, sizeof...(I)>>,
|
||||
std::index_sequence<I...>) {
|
||||
// We need to do a deep copy of the array because there might be other
|
||||
// references to this same IValue that also use the list. We can't just
|
||||
// move the elements out.
|
||||
@ -1906,7 +1906,7 @@ std::array<Elem, N> generic_to(
|
||||
template <typename Key, typename Value>
|
||||
c10::Dict<Key, Value> generic_to(
|
||||
IValue ivalue,
|
||||
_fake_type<c10::Dict<Key, Value>> /*unused*/) {
|
||||
_fake_type<c10::Dict<Key, Value>>) {
|
||||
return impl::toTypedDict<Key, Value>(std::move(ivalue).toGenericDict());
|
||||
}
|
||||
|
||||
@ -1915,7 +1915,7 @@ C10_DEPRECATED_MESSAGE(
|
||||
"IValues based on std::unordered_map are slow and deprecated. Please use c10::Dict<K, V> instead.")
|
||||
std::unordered_map<K, V> generic_to(
|
||||
IValue ivalue,
|
||||
_fake_type<std::unordered_map<K, V>> /*unused*/) {
|
||||
_fake_type<std::unordered_map<K, V>>) {
|
||||
std::unordered_map<K, V> specialized_dict;
|
||||
|
||||
for (const auto& item : std::move(ivalue).toGenericDict()) {
|
||||
@ -1926,7 +1926,7 @@ std::unordered_map<K, V> generic_to(
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::optional<T> generic_to(IValue ivalue, _fake_type<std::optional<T>> /*unused*/) {
|
||||
std::optional<T> generic_to(IValue ivalue, _fake_type<std::optional<T>>) {
|
||||
if (ivalue.isNone()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
@ -1937,7 +1937,7 @@ namespace detail {
|
||||
template <typename Tuple, std::size_t... INDEX>
|
||||
Tuple generic_to_tuple_impl(
|
||||
const ivalue::TupleElements& t,
|
||||
std::index_sequence<INDEX...> /*unused*/) {
|
||||
std::index_sequence<INDEX...>) {
|
||||
return std::make_tuple(
|
||||
t[INDEX].to<typename std::tuple_element<INDEX, Tuple>::type>()...);
|
||||
}
|
||||
@ -1951,7 +1951,7 @@ template <
|
||||
std::is_lvalue_reference<Args>...,
|
||||
std::negation<std::is_constructible<IValue, Args>>...>,
|
||||
std::nullptr_t> = nullptr>
|
||||
std::tuple<Args...> generic_to(const IValue& ivalue, _fake_type<std::tuple<Args...>> /*unused*/) {
|
||||
std::tuple<Args...> generic_to(const IValue& ivalue, _fake_type<std::tuple<Args...>>) {
|
||||
const auto& vals = ivalue.toTupleRef().elements();
|
||||
TORCH_CHECK(vals.size() == sizeof...(Args));
|
||||
return detail::generic_to_tuple_impl<std::tuple<Args...>>(vals, Indices{});
|
||||
@ -2311,7 +2311,7 @@ inline IValue::IValue(std::optional<T> v) : IValue() {
|
||||
}
|
||||
}
|
||||
|
||||
inline IValue::IValue(std::nullopt_t /*unused*/) : IValue() {}
|
||||
inline IValue::IValue(std::nullopt_t) : IValue() {}
|
||||
|
||||
inline IValue::IValue(c10::intrusive_ptr<ivalue::Object> v)
|
||||
: tag(Tag::Object) {
|
||||
@ -2482,15 +2482,15 @@ namespace ivalue {
|
||||
namespace detail {
|
||||
|
||||
template <typename T>
|
||||
IValue from_(T&& x, std::true_type /*unused*/) {
|
||||
IValue from_(T&& x, std::true_type) {
|
||||
return IValue(std::forward<T>(x));
|
||||
}
|
||||
template <typename T>
|
||||
IValue from_(c10::intrusive_ptr<T> x, std::false_type /*unused*/) {
|
||||
IValue from_(c10::intrusive_ptr<T> x, std::false_type) {
|
||||
return IValue(std::move(x));
|
||||
}
|
||||
template <typename T>
|
||||
IValue from_(T&& /*x*/, std::false_type /*unused*/) {
|
||||
IValue from_(T&& /*x*/, std::false_type) {
|
||||
static_assert(
|
||||
guts::false_t<T>::value,
|
||||
"You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)");
|
||||
@ -2546,19 +2546,19 @@ struct MaybeOwnedTraits<IValue> {
|
||||
return &borrow;
|
||||
}
|
||||
|
||||
static bool debugBorrowIsValid(const borrow_type& /*unused*/) {
|
||||
static bool debugBorrowIsValid(const borrow_type&) {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct IValue::TagType<c10::Type> {
|
||||
static TORCH_API c10::TypePtr get(const IValue& /*v*/);
|
||||
static TORCH_API c10::TypePtr get(const IValue&);
|
||||
};
|
||||
|
||||
template <>
|
||||
struct IValue::TagType<c10::DynamicType> {
|
||||
static TORCH_API c10::TypePtr get(const IValue& /*v*/);
|
||||
static TORCH_API c10::TypePtr get(const IValue&);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
|
||||
@ -44,7 +44,7 @@ constexpr int checkStaticTypes() {
|
||||
}
|
||||
|
||||
template <typename... Ts, size_t... Is>
|
||||
constexpr std::array<ArgumentDef, sizeof...(Ts)> createArgumentVectorFromTypes(std::index_sequence<Is...> /*unused*/) {
|
||||
constexpr std::array<ArgumentDef, sizeof...(Ts)> createArgumentVectorFromTypes(std::index_sequence<Is...>) {
|
||||
return (
|
||||
// Check types for common errors
|
||||
checkStaticTypes<Ts...>(),
|
||||
|
||||
@ -83,7 +83,7 @@ inline bool operator!=(const OperatorName& lhs, const OperatorName& rhs) {
|
||||
}
|
||||
|
||||
TORCH_API std::string toString(const OperatorName& opName);
|
||||
TORCH_API std::ostream& operator<<(std::ostream& /*os*/, const OperatorName& /*opName*/);
|
||||
TORCH_API std::ostream& operator<<(std::ostream&, const OperatorName&);
|
||||
|
||||
} // namespace c10
|
||||
|
||||
|
||||
@ -16,7 +16,7 @@ class SingletonTypePtr {
|
||||
/* implicit */ SingletonTypePtr(T* p) : repr_(p) {}
|
||||
|
||||
// We need this to satisfy Pybind11, but it shouldn't be hit.
|
||||
explicit SingletonTypePtr(std::shared_ptr<T> /*unused*/) { TORCH_CHECK(false); }
|
||||
explicit SingletonTypePtr(std::shared_ptr<T>) { TORCH_CHECK(false); }
|
||||
|
||||
using element_type = typename std::shared_ptr<T>::element_type;
|
||||
|
||||
|
||||
@ -342,19 +342,19 @@ class Vectorized<c10::complex<double>> {
|
||||
return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ);
|
||||
}
|
||||
Vectorized<c10::complex<double>> operator<(
|
||||
const Vectorized<c10::complex<double>>& /*unused*/) const {
|
||||
const Vectorized<c10::complex<double>>&) const {
|
||||
TORCH_CHECK(false, "not supported for complex numbers");
|
||||
}
|
||||
Vectorized<c10::complex<double>> operator<=(
|
||||
const Vectorized<c10::complex<double>>& /*unused*/) const {
|
||||
const Vectorized<c10::complex<double>>&) const {
|
||||
TORCH_CHECK(false, "not supported for complex numbers");
|
||||
}
|
||||
Vectorized<c10::complex<double>> operator>(
|
||||
const Vectorized<c10::complex<double>>& /*unused*/) const {
|
||||
const Vectorized<c10::complex<double>>&) const {
|
||||
TORCH_CHECK(false, "not supported for complex numbers");
|
||||
}
|
||||
Vectorized<c10::complex<double>> operator>=(
|
||||
const Vectorized<c10::complex<double>>& /*unused*/) const {
|
||||
const Vectorized<c10::complex<double>>&) const {
|
||||
TORCH_CHECK(false, "not supported for complex numbers");
|
||||
}
|
||||
|
||||
|
||||
@ -422,34 +422,18 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
|
||||
abType = CUDA_R_16F;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
|
||||
#ifndef USE_ROCM
|
||||
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
|
||||
if (fp16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
fp16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
}
|
||||
#endif
|
||||
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
|
||||
abType = CUDA_R_16BF;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
|
||||
#ifndef USE_ROCM
|
||||
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
|
||||
if (bf16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
bf16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
@ -1136,15 +1120,8 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
|
||||
}
|
||||
if (prop->major >= 5) {
|
||||
cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
|
||||
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
|
||||
TORCH_CHECK(fp16_reduction !=
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
|
||||
"torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
|
||||
"..., allow_splitk=False) requires the cuBLASLt backend");
|
||||
if (fp16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
cublas_flags = static_cast<cublasMath_t>(
|
||||
cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
|
||||
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
|
||||
cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
|
||||
}
|
||||
// Disallow fp16 reductions that could lead to unexpected overflow issues.
|
||||
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));
|
||||
@ -1203,15 +1180,8 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT
|
||||
GEMM_CHECK_ARGVALUES(at::BFloat16);
|
||||
#ifndef USE_ROCM
|
||||
cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
|
||||
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
|
||||
TORCH_CHECK(bf16_reduction !=
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
|
||||
"torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
|
||||
"..., allow_splitk=False) requires the cuBLASLt backend");
|
||||
if (bf16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
cublas_flags = static_cast<cublasMath_t>(
|
||||
cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
|
||||
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
|
||||
cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
|
||||
}
|
||||
#endif
|
||||
#if defined(USE_ROCM)
|
||||
@ -1607,34 +1577,18 @@ bool gemm_and_bias(
|
||||
abType = CUDA_R_16F;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
|
||||
#ifndef USE_ROCM
|
||||
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
|
||||
if (fp16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
fp16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
}
|
||||
#endif
|
||||
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
|
||||
abType = CUDA_R_16BF;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
|
||||
#ifndef USE_ROCM
|
||||
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
|
||||
if (bf16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
bf16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -1861,8 +1815,6 @@ template bool gemm_and_bias(
|
||||
int64_t result_ld,
|
||||
GEMMAndBiasActivationEpilogue activation);
|
||||
|
||||
using at::blas::ScalingType;
|
||||
|
||||
int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fast_accum) {
|
||||
switch (scaling_type) {
|
||||
case ScalingType::BlockWise1x32:
|
||||
|
||||
@ -14,7 +14,6 @@
|
||||
*/
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <ATen/BlasBackend.h>
|
||||
#include <ATen/OpMathType.h>
|
||||
|
||||
namespace at::cuda::blas {
|
||||
@ -137,6 +136,15 @@ void int8_gemm(
|
||||
int32_t* result_ptr,
|
||||
int64_t result_ld);
|
||||
|
||||
enum class ScalingType : std::uint8_t {
|
||||
TensorWise, // fp32 scales
|
||||
RowWise, // fp32 scales
|
||||
BlockWise1x16, // fp8_e4m3fn scales
|
||||
BlockWise1x32, // fp8_e8m0fnu scales
|
||||
BlockWise1x128, // fp32 scales
|
||||
BlockWise128x128, // fp32 scales
|
||||
};
|
||||
|
||||
void scaled_gemm(
|
||||
char transa,
|
||||
char transb,
|
||||
@ -148,13 +156,13 @@ void scaled_gemm(
|
||||
int64_t mat1_ld,
|
||||
ScalarType mat1_dtype,
|
||||
ScalarType mat1_scale_dtype,
|
||||
at::blas::ScalingType mat1_scaling_type,
|
||||
ScalingType mat1_scaling_type,
|
||||
const void* mat2_ptr,
|
||||
const void* mat2_scale_ptr,
|
||||
int64_t mat2_ld,
|
||||
ScalarType mat2_dtype,
|
||||
ScalarType mat2_scale_dtype,
|
||||
at::blas::ScalingType mat2_scaling_type,
|
||||
ScalingType mat2_scaling_type,
|
||||
const void* bias_ptr,
|
||||
ScalarType bias_dtype,
|
||||
void* result_ptr,
|
||||
|
||||
@ -326,23 +326,6 @@ bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
|
||||
#endif
|
||||
}
|
||||
|
||||
bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const {
|
||||
#if AT_CUDNN_ENABLED() && (CUDNN_VERSION >= 91300)
|
||||
if (!hasCUDA()) {
|
||||
return false;
|
||||
}
|
||||
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
|
||||
// Check for Volta cores
|
||||
if (prop->major >= 8) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
long CUDAHooks::versionCuDNN() const {
|
||||
#if AT_CUDNN_ENABLED()
|
||||
return CUDNN_VERSION;
|
||||
|
||||
@ -17,7 +17,7 @@ TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)());
|
||||
|
||||
// The real implementation of CUDAHooksInterface
|
||||
struct CUDAHooks : public at::CUDAHooksInterface {
|
||||
CUDAHooks(at::CUDAHooksArgs /*unused*/) {}
|
||||
CUDAHooks(at::CUDAHooksArgs) {}
|
||||
void init() const override;
|
||||
Device getDeviceFromPtr(void* data) const override;
|
||||
bool isPinnedPtr(const void* data) const override;
|
||||
@ -45,7 +45,6 @@ struct CUDAHooks : public at::CUDAHooksInterface {
|
||||
bool supportsDilatedConvolutionWithCuDNN() const override;
|
||||
bool supportsDepthwiseConvolutionWithCuDNN() const override;
|
||||
bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
|
||||
bool supportsBFloat16RNNWithCuDNN() const override;
|
||||
bool hasCUDART() const override;
|
||||
long versionCUDART() const override;
|
||||
long versionCuDNN() const override;
|
||||
|
||||
@ -29,7 +29,7 @@
|
||||
|
||||
namespace at::cuda::tunable {
|
||||
|
||||
using at::blas::ScalingType;
|
||||
using at::cuda::blas::ScalingType;
|
||||
|
||||
enum class BlasOp {
|
||||
N = 0,
|
||||
|
||||
@ -29,7 +29,7 @@ template <typename ParamsT>
|
||||
class Callable {
|
||||
public:
|
||||
virtual ~Callable() = default;
|
||||
virtual TuningStatus Call(const ParamsT* /*unused*/) {
|
||||
virtual TuningStatus Call(const ParamsT*) {
|
||||
return FAIL;
|
||||
}
|
||||
virtual TuningStatus IsSupported(const ParamsT* params) {
|
||||
|
||||
@ -166,10 +166,6 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual bool supportsBFloat16RNNWithCuDNN() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual long versionCuDNN() const {
|
||||
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
|
||||
}
|
||||
|
||||
@ -25,7 +25,7 @@ struct TORCH_API HPUHooksInterface : AcceleratorHooksInterface {
|
||||
false, "Cannot get device of pointer on HPU without HPU backend");
|
||||
}
|
||||
|
||||
bool isPinnedPtr(const void* /*data*/) const override {
|
||||
bool isPinnedPtr(const void*) const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@ -410,7 +410,7 @@ struct ExistingBdimBatchRuleHelper<F, Func, c10::guts::typelist::typelist<A, T..
|
||||
|
||||
|
||||
template <typename F, F Method, typename... ExtraArgs>
|
||||
Tensor& unary_inplace_batch_rule(Tensor& self, std::optional<int64_t> /*unused*/, ExtraArgs... extra_args) {
|
||||
Tensor& unary_inplace_batch_rule(Tensor& self, std::optional<int64_t>, ExtraArgs... extra_args) {
|
||||
INVOKE(self, Method)(std::forward<ExtraArgs>(extra_args)...);
|
||||
return self;
|
||||
}
|
||||
|
||||
@ -465,11 +465,11 @@ static void dynamicLayerBack(const c10::OperatorHandle& op, torch::jit::Stack* s
|
||||
|
||||
// used for functions that have aliasing operations but should be treated like they're out of place (i.e. lift_fresh)
|
||||
static void dynamicLayerBackGradSpecialCase(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
|
||||
dynamicLayerBack(op, stack, true);
|
||||
return dynamicLayerBack(op, stack, true);
|
||||
}
|
||||
|
||||
static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
|
||||
dynamicLayerBack(op, stack, false);
|
||||
return dynamicLayerBack(op, stack, false);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_IMPL(_, FuncTorchDynamicLayerFrontMode, m) {
|
||||
|
||||
@ -18,7 +18,7 @@ extern std::atomic<const MetalInterface*> g_metal_impl_registry;
|
||||
|
||||
class MetalImplRegistrar {
|
||||
public:
|
||||
explicit MetalImplRegistrar(MetalInterface* /*impl*/);
|
||||
explicit MetalImplRegistrar(MetalInterface*);
|
||||
};
|
||||
|
||||
at::Tensor& metal_copy_(at::Tensor& self, const at::Tensor& src);
|
||||
|
||||
@ -12,7 +12,7 @@
|
||||
|
||||
#define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled"
|
||||
#define MPS_ERROR_RUNTIME_TOO_LOW \
|
||||
"The MPS backend is supported on MacOS 14.0+. ", \
|
||||
"The MPS backend is supported on MacOS 13.0+.", \
|
||||
"Current OS version can be queried using `sw_vers`"
|
||||
#define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \
|
||||
"as the MPS framework doesn't support float64. Please use float32 instead."
|
||||
|
||||
@ -2060,7 +2060,7 @@ std::tuple<Tensor, Tensor> linalg_lu_factor(const Tensor& A, bool pivot) {
|
||||
}
|
||||
|
||||
// TODO Deprecate this function in favour of linalg_lu_factor_ex
|
||||
std::tuple<Tensor, Tensor, Tensor> _lu_with_info(const Tensor& self, bool compute_pivots, bool /*unused*/) {
|
||||
std::tuple<Tensor, Tensor, Tensor> _lu_with_info(const Tensor& self, bool compute_pivots, bool) {
|
||||
TORCH_WARN_ONCE(
|
||||
"torch.lu is deprecated in favor of torch.linalg.lu_factor / torch.linalg.lu_factor_ex and will be ",
|
||||
"removed in a future PyTorch release.\n",
|
||||
|
||||
@ -375,7 +375,7 @@ static void bf16_gemv_trans(
|
||||
const at::BFloat16 beta,
|
||||
at::BFloat16* y,
|
||||
const int incy) {
|
||||
bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
|
||||
return bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
|
||||
}
|
||||
|
||||
template <>
|
||||
|
||||
@ -70,7 +70,7 @@ inline void searchsorted_maybe_trim_input_tensors(
|
||||
const Tensor& raw_boundaries) {
|
||||
Tensor trimmed_sorter;
|
||||
Tensor raw_sorter;
|
||||
searchsorted_maybe_trim_input_tensors(
|
||||
return searchsorted_maybe_trim_input_tensors(
|
||||
trimmed_input,
|
||||
trimmed_boundaries,
|
||||
trimmed_sorter,
|
||||
|
||||
@ -93,12 +93,6 @@ inline bool cond_cudnn_grid_sampler(
|
||||
const TensorBase& input,
|
||||
const TensorBase& grid
|
||||
) {
|
||||
auto st = input.scalar_type();
|
||||
if (!(st == kDouble || st == kFloat || st == kHalf))
|
||||
return false;
|
||||
st = grid.scalar_type();
|
||||
if (!(st == kDouble || st == kFloat || st == kHalf))
|
||||
return false;
|
||||
return (
|
||||
at::native::cudnn_is_acceptable(input) &&
|
||||
at::native::cudnn_is_acceptable(grid) &&
|
||||
|
||||
@ -108,13 +108,6 @@ bool use_mkldnn(const Tensor& input, TensorList params, TensorList hx) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool use_cudnn(const Tensor& t) {
|
||||
bool acceptable = at::cudnn_is_acceptable(t);
|
||||
auto st = t.scalar_type();
|
||||
bool bfloat16_cond = st == kBFloat16 && at::detail::getCUDAHooks().supportsBFloat16RNNWithCuDNN();
|
||||
return acceptable && (bfloat16_cond || st == kDouble || st == kFloat || st == kHalf);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
using pair_of = std::pair<T, T>;
|
||||
|
||||
@ -1207,7 +1200,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
|
||||
bool train, \
|
||||
bool bidirectional, \
|
||||
bool batch_first) { \
|
||||
if (use_cudnn(_input)) { \
|
||||
if (at::cudnn_is_acceptable(_input)) { \
|
||||
Tensor output, hy; \
|
||||
NAME##_cudnn_stub( \
|
||||
_input.device().type(), \
|
||||
@ -1269,7 +1262,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
|
||||
double dropout_p, \
|
||||
bool train, \
|
||||
bool bidirectional) { \
|
||||
if (use_cudnn(data)) { \
|
||||
if (at::cudnn_is_acceptable(data)) { \
|
||||
Tensor output, hy; \
|
||||
NAME##_packed_cudnn_stub( \
|
||||
data.device().type(), \
|
||||
@ -1437,7 +1430,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
|
||||
TensorList _params, bool has_biases,
|
||||
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
|
||||
TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
|
||||
if (use_cudnn(_input)) {
|
||||
if (at::cudnn_is_acceptable(_input)) {
|
||||
Tensor output, hy, cy;
|
||||
lstm_cudnn_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases,
|
||||
num_layers, dropout_p, train, bidirectional, batch_first);
|
||||
@ -1498,7 +1491,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
|
||||
TensorList _params, bool has_biases,
|
||||
int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
|
||||
TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
|
||||
if (use_cudnn(data)) {
|
||||
if (at::cudnn_is_acceptable(data)) {
|
||||
Tensor output, hy, cy;
|
||||
lstm_packed_cudnn_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx,
|
||||
_params, has_biases, num_layers, dropout_p, train, bidirectional);
|
||||
|
||||
@ -15,11 +15,7 @@ namespace at::native {
|
||||
|
||||
Scalar item(const Tensor& self) {
|
||||
auto numel = self.sym_numel();
|
||||
TORCH_SYM_CHECK(
|
||||
numel.sym_eq(1),
|
||||
"a Tensor with ",
|
||||
numel,
|
||||
" elements cannot be converted to Scalar");
|
||||
TORCH_CHECK(numel == 1, "a Tensor with ", numel, " elements cannot be converted to Scalar");
|
||||
if (self.is_sparse()) {
|
||||
if (self._nnz() == 0) return Scalar(0);
|
||||
if (self.is_coalesced()) return at::_local_scalar_dense(self._values());
|
||||
|
||||
@ -346,17 +346,17 @@ template<typename acc_t>
|
||||
struct AbsSwitch {};
|
||||
|
||||
template<typename scalar_t, typename acc_t>
|
||||
inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch<acc_t> /*unused*/) {
|
||||
inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch<acc_t>) {
|
||||
return static_cast<acc_t>(data);
|
||||
}
|
||||
|
||||
template<typename scalar_t, typename acc_t>
|
||||
inline C10_DEVICE acc_t abs_if_complex(std::complex<scalar_t> data, AbsSwitch<acc_t> /*unused*/) {
|
||||
inline C10_DEVICE acc_t abs_if_complex(std::complex<scalar_t> data, AbsSwitch<acc_t>) {
|
||||
return static_cast<acc_t>(std::abs(data));
|
||||
}
|
||||
|
||||
template<typename scalar_t, typename acc_t>
|
||||
inline C10_DEVICE acc_t abs_if_complex(c10::complex<scalar_t> data, AbsSwitch<acc_t> /*unused*/) {
|
||||
inline C10_DEVICE acc_t abs_if_complex(c10::complex<scalar_t> data, AbsSwitch<acc_t>) {
|
||||
return static_cast<acc_t>(std::abs(at::opmath_type<c10::complex<scalar_t>>(data)));
|
||||
}
|
||||
|
||||
|
||||
@ -846,7 +846,7 @@ TORCH_IMPL_FUNC(clamp_Tensor_out)
|
||||
(const Tensor& self,
|
||||
const OptionalTensorRef min,
|
||||
const OptionalTensorRef max,
|
||||
const Tensor& /*unused*/) {
|
||||
const Tensor&) {
|
||||
if (min && max) {
|
||||
clamp_stub(device_type(), *this);
|
||||
} else if (min) {
|
||||
|
||||
@ -91,6 +91,9 @@ bool cudnn_is_acceptable(const TensorBase& self) {
|
||||
return false;
|
||||
if (!self.is_cuda())
|
||||
return false;
|
||||
auto st = self.scalar_type();
|
||||
if (!(st == kDouble || st == kFloat || st == kHalf))
|
||||
return false;
|
||||
if (!detail::getCUDAHooks().compiledWithCuDNN())
|
||||
return false;
|
||||
// cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty
|
||||
|
||||
@ -25,11 +25,11 @@
|
||||
namespace at::native {
|
||||
|
||||
void _backward(const Tensor& self, TensorList inputs, const std::optional<Tensor>& gradient_opt, std::optional<bool> keep_graph, bool create_graph) {
|
||||
self._backward(inputs, gradient_opt, keep_graph, create_graph);
|
||||
return self._backward(inputs, gradient_opt, keep_graph, create_graph);
|
||||
}
|
||||
|
||||
void set_data(Tensor& self, const Tensor& new_data) {
|
||||
self.set_data(new_data);
|
||||
return self.set_data(new_data);
|
||||
}
|
||||
|
||||
Tensor data(const Tensor& self) {
|
||||
@ -54,7 +54,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) {
|
||||
}
|
||||
|
||||
void retain_grad(Tensor& self) {
|
||||
self.retain_grad();
|
||||
return self.retain_grad();
|
||||
}
|
||||
|
||||
bool retains_grad(const Tensor& self) {
|
||||
|
||||
@ -300,8 +300,7 @@ void div_floor_kernel(TensorIteratorBase& iter) {
|
||||
// In the special case of unsigned integer division, floor division is
|
||||
// equivalent to truncation division (since the signs of the divisor and
|
||||
// dividend are always the same)
|
||||
div_trunc_kernel(iter);
|
||||
return;
|
||||
return div_trunc_kernel(iter);
|
||||
} else if (isIntegralType(dtype, /*includeBool*/ false)) {
|
||||
// There's no SIMD integer division, so don't try to vectorize it.
|
||||
AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cpu", [&]() {
|
||||
|
||||
@ -452,11 +452,11 @@ void convolution_depthwise3x3_winograd_impl(
|
||||
#else
|
||||
|
||||
void convolution_depthwise3x3_winograd_impl(
|
||||
const Arguments& /*unused*/,
|
||||
const float* const /*unused*/,
|
||||
const float* const /*unused*/,
|
||||
const float* const /*unused*/,
|
||||
float* const /*unused*/) {
|
||||
const Arguments&,
|
||||
const float* const,
|
||||
const float* const,
|
||||
const float* const,
|
||||
float* const) {
|
||||
}
|
||||
|
||||
#endif /* __ARM_NEON__ */
|
||||
|
||||
@ -749,29 +749,21 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
|
||||
// });
|
||||
|
||||
if (iter_dtype == kByte) {
|
||||
cpu_hflip_vec<uint8_t>(iter);
|
||||
return;
|
||||
return cpu_hflip_vec<uint8_t>(iter);
|
||||
} else if (iter_dtype == kChar) {
|
||||
cpu_hflip_vec<int8_t>(iter);
|
||||
return;
|
||||
return cpu_hflip_vec<int8_t>(iter);
|
||||
} else if (iter_dtype == kInt) {
|
||||
cpu_hflip_vec<int32_t>(iter);
|
||||
return;
|
||||
return cpu_hflip_vec<int32_t>(iter);
|
||||
} else if (iter_dtype == kLong) {
|
||||
cpu_hflip_vec<int64_t>(iter);
|
||||
return;
|
||||
return cpu_hflip_vec<int64_t>(iter);
|
||||
} else if (iter_dtype == kShort) {
|
||||
cpu_hflip_vec<int16_t>(iter);
|
||||
return;
|
||||
return cpu_hflip_vec<int16_t>(iter);
|
||||
} else if (iter_dtype == kBool) {
|
||||
cpu_hflip_vec<bool>(iter);
|
||||
return;
|
||||
return cpu_hflip_vec<bool>(iter);
|
||||
} else if (iter_dtype == kFloat) {
|
||||
cpu_hflip_vec<float>(iter);
|
||||
return;
|
||||
return cpu_hflip_vec<float>(iter);
|
||||
} else if (iter_dtype == kDouble) {
|
||||
cpu_hflip_vec<double>(iter);
|
||||
return;
|
||||
return cpu_hflip_vec<double>(iter);
|
||||
}
|
||||
}
|
||||
// other dtypes (float16, bfloat16, complex) are handled by cpu_kernel_vec (see below)
|
||||
@ -786,12 +778,10 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
|
||||
c == input_strides_2[1] &&
|
||||
c == iter.element_size(0) * iter.shape()[0] // checks if dim=1 is contiguous as well
|
||||
) {
|
||||
cpu_hflip_channels_last_vec(iter);
|
||||
return;
|
||||
return cpu_hflip_channels_last_vec(iter);
|
||||
}
|
||||
// Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec)
|
||||
cpu_vflip_memcpy(iter);
|
||||
return;
|
||||
return cpu_vflip_memcpy(iter);
|
||||
}
|
||||
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu",
|
||||
|
||||
@ -46,7 +46,7 @@ using namespace vec;
|
||||
template <typename traits, std::size_t... INDEX>
|
||||
typename traits::ArgsTuple
|
||||
dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i,
|
||||
std::index_sequence<INDEX...> /*unused*/) {
|
||||
std::index_sequence<INDEX...>) {
|
||||
return std::make_tuple(
|
||||
c10::load<typename traits::template arg<INDEX>::type>(
|
||||
data[INDEX] + i * strides[INDEX])...);
|
||||
@ -65,7 +65,7 @@ dereference_vec_impl(char* C10_RESTRICT data[],
|
||||
const typename traits::result_type& opt_scalar,
|
||||
size_t S,
|
||||
int64_t i,
|
||||
std::index_sequence<INDEX...> /*unused*/) {
|
||||
std::index_sequence<INDEX...>) {
|
||||
using Vec = typename traits::result_type;
|
||||
using scalar_t = typename Vec::value_type;
|
||||
return std::make_tuple(
|
||||
@ -231,7 +231,7 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve
|
||||
template <typename traits, typename cb_t>
|
||||
inline void unroll_contiguous_scalar_checks(
|
||||
const int64_t* /*strides*/,
|
||||
std::index_sequence<> /*unused*/,
|
||||
std::index_sequence<>,
|
||||
cb_t&& cb) {
|
||||
cb(0);
|
||||
}
|
||||
@ -239,7 +239,7 @@ inline void unroll_contiguous_scalar_checks(
|
||||
template <typename traits, typename cb_t, size_t INDEX0, size_t ...INDEX>
|
||||
inline void unroll_contiguous_scalar_checks(
|
||||
const int64_t* strides,
|
||||
std::index_sequence<INDEX0, INDEX...> /*unused*/,
|
||||
std::index_sequence<INDEX0, INDEX...>,
|
||||
cb_t&& cb) {
|
||||
if (is_contiguous_scalar<traits, INDEX0 + 1>(strides)) {
|
||||
cb(INDEX0 + 1);
|
||||
|
||||
@ -96,14 +96,11 @@ static void pow_tensor_scalar_kernel(
|
||||
dtype == kBFloat16 || isComplexType(dtype)) {
|
||||
// Dispatch to fast specialization for sqrt, rsqrt and reciprocal
|
||||
if (exp_scalar.equal(.5)) {
|
||||
sqrt_kernel(iter);
|
||||
return;
|
||||
return sqrt_kernel(iter);
|
||||
} else if (exp_scalar.equal(-0.5)) {
|
||||
rsqrt_kernel(iter);
|
||||
return;
|
||||
return rsqrt_kernel(iter);
|
||||
} else if (exp_scalar.equal(-1.0)) {
|
||||
reciprocal_kernel(iter);
|
||||
return;
|
||||
return reciprocal_kernel(iter);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -256,10 +256,10 @@ static void norm_kernel_tensor_iterator_impl(
|
||||
} else {
|
||||
if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) {
|
||||
// type promotion that does cast and reduction in a single kernel
|
||||
norm_kernel_cpu_impl<at::Half, float>(iter, val); return;
|
||||
return norm_kernel_cpu_impl<at::Half, float>(iter, val);
|
||||
} else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) {
|
||||
// type promotion that does cast and reduction in a single kernel
|
||||
norm_kernel_cpu_impl<at::BFloat16, float>(iter, val); return;
|
||||
return norm_kernel_cpu_impl<at::BFloat16, float>(iter, val);
|
||||
}
|
||||
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kComplexHalf, iter.input_dtype(), "norm_cpu", [&] {
|
||||
|
||||
@ -428,11 +428,10 @@ void fp16_gemv_trans(
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0);
|
||||
#if !defined(__aarch64__) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
|
||||
if (at::globalContext().allowFP16ReductionCPU()) {
|
||||
fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
|
||||
return;
|
||||
return fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
|
||||
}
|
||||
#endif
|
||||
fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
|
||||
return fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
|
||||
}
|
||||
|
||||
float bf16_dot_with_fp32_arith(const at::BFloat16* vec1, const at::BFloat16* vec2, int64_t len) {
|
||||
@ -466,7 +465,7 @@ void bf16_gemv_trans(
|
||||
at::BFloat16* y,
|
||||
const int incy) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0 && beta == 0.0);
|
||||
bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
|
||||
return bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
|
||||
}
|
||||
|
||||
float fp16_dot(
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
#include <c10/util/SmallVector.h>
|
||||
#include <c10/core/Scalar.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/core/NamedTensor.h>
|
||||
@ -106,8 +105,7 @@ c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, b
|
||||
}
|
||||
}
|
||||
|
||||
using at::blas::ScalingType;
|
||||
using at::blas::SwizzleType;
|
||||
using at::cuda::blas::ScalingType;
|
||||
|
||||
/**
|
||||
* @brief Prepares matrices for CUBLAS operation
|
||||
@ -1114,7 +1112,7 @@ namespace{
|
||||
* - Returns Error.
|
||||
*/
|
||||
|
||||
using at::blas::ScalingType;
|
||||
using at::cuda::blas::ScalingType;
|
||||
|
||||
bool is_tensorwise_scaling(const at::Tensor& t, const at::Tensor& scale) {
|
||||
return isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.numel() == 1;
|
||||
@ -1681,890 +1679,9 @@ _scaled_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,
|
||||
bool use_fast_accum) {
|
||||
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
|
||||
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
|
||||
|
||||
return _scaled_mm_out_cuda(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
|
||||
}
|
||||
|
||||
/**
|
||||
* Track concrete implementations available
|
||||
*/
|
||||
enum class ScaledGemmImplementation {
|
||||
NONE = 0,
|
||||
TENSORWISE_TENSORWISE = 1,
|
||||
ROWWISE_ROWWISE = 2,
|
||||
BLOCK_128x128_1x128 = 3,
|
||||
BLOCK_1x128_128x128 = 4,
|
||||
BLOCK_1x128_1x128 = 5,
|
||||
MXFP8_MXFP8 = 6,
|
||||
NVFP4_NVFP4 = 7,
|
||||
NVFP4_NVFP4_SINGLE_SCALE = 8,
|
||||
};
|
||||
|
||||
/**
|
||||
* Convert passed int (enum) from python back into a
|
||||
* strictly-typed enum
|
||||
*/
|
||||
template <class EnumType, class ArrayType>
|
||||
std::vector<EnumType> convert_int_to_enum(ArrayType& v) {
|
||||
std::vector<EnumType> converted;
|
||||
converted.reserve(v.size());
|
||||
|
||||
for (auto vi : v) {
|
||||
converted.push_back(static_cast<EnumType>(vi));
|
||||
}
|
||||
return converted;
|
||||
}
|
||||
|
||||
/**
|
||||
* Both inputs must be fp8,
|
||||
* Each needs a single scale, {Tensorwise (float)}
|
||||
*/
|
||||
bool check_tensorwise_recipe(c10::ScalarType type_a,
|
||||
std::vector<ScalingType>& recipe_a,
|
||||
ArrayRef<Tensor>& scales_a,
|
||||
c10::ScalarType type_b,
|
||||
std::vector<ScalingType>& recipe_b,
|
||||
ArrayRef<Tensor>& scales_b) {
|
||||
// both types must be fp8
|
||||
if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 1 scale each, {Tensorwise, float}
|
||||
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
|
||||
return false;
|
||||
}
|
||||
// Need {Blockwise_1x32, e8m0} for A & B
|
||||
if (recipe_a[0] != ScalingType::TensorWise) return false;
|
||||
if (scales_a[0].scalar_type() != ScalarType::Float) return false;
|
||||
if (recipe_b[0] != ScalingType::TensorWise) return false;
|
||||
if (scales_b[0].scalar_type() != ScalarType::Float) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Both inputs must be fp8,
|
||||
* Each needs scales, {Rowwise (float)}
|
||||
*/
|
||||
bool check_rowwise_recipe(c10::ScalarType type_a,
|
||||
std::vector<ScalingType>& recipe_a,
|
||||
ArrayRef<Tensor>& scales_a,
|
||||
c10::ScalarType type_b,
|
||||
std::vector<ScalingType>& recipe_b,
|
||||
ArrayRef<Tensor>& scales_b) {
|
||||
// both types must be fp8
|
||||
if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 1 scale each, {Tensorwise, float}
|
||||
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Need {RowWise, dp32} for A & B
|
||||
if (recipe_a[0] != ScalingType::RowWise) return false;
|
||||
if (scales_a[0].scalar_type() != ScalarType::Float) return false;
|
||||
if (recipe_b[0] != ScalingType::RowWise) return false;
|
||||
if (scales_b[0].scalar_type() != ScalarType::Float) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Two-level scaling, canonical NVFP4
|
||||
* Both inputs must be fp4
|
||||
* A, B need 2 scales, {Blockwise_1x16 (e4m3), Tensorwise (fp32)}
|
||||
*/
|
||||
bool check_nvfp4_recipe(c10::ScalarType type_a,
|
||||
std::vector<ScalingType>& recipe_a,
|
||||
ArrayRef<Tensor>& scales_a,
|
||||
c10::ScalarType type_b,
|
||||
std::vector<ScalingType>& recipe_b,
|
||||
ArrayRef<Tensor>& scales_b) {
|
||||
// both types must be fp4
|
||||
if (type_a != ScalarType::Float4_e2m1fn_x2 || type_b != ScalarType::Float4_e2m1fn_x2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 2 scales, 2 recipes for each input
|
||||
if (scales_a.size() != 2 || recipe_a.size() != 2 || scales_b.size() != 2 || recipe_b.size() != 2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Need {Blockwise_1x16, e4m3 for scale[0], Tensorwise, fp32 for scale[1]}
|
||||
if (recipe_a[0] != ScalingType::BlockWise1x16 || recipe_a[1] != ScalingType::TensorWise) return false;
|
||||
if (scales_a[0].scalar_type() != ScalarType::Float8_e4m3fn || scales_a[1].scalar_type() != ScalarType::Float) return false;
|
||||
if (recipe_b[0] != ScalingType::BlockWise1x16 || recipe_b[1] != ScalingType::TensorWise) return false;
|
||||
if (scales_b[0].scalar_type() != ScalarType::Float8_e4m3fn || scales_b[1].scalar_type() != ScalarType::Float) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Single-level scaling, what PyT currently understands
|
||||
* Both inputs must be fp4
|
||||
* A, B need 1 scale, {Blockwise_1x16 (e4m3)}
|
||||
*/
|
||||
bool check_nvfp4_recipe_single_scale
|
||||
(c10::ScalarType type_a,
|
||||
std::vector<ScalingType>& recipe_a,
|
||||
ArrayRef<Tensor>& scales_a,
|
||||
c10::ScalarType type_b,
|
||||
std::vector<ScalingType>& recipe_b,
|
||||
ArrayRef<Tensor>& scales_b) {
|
||||
// both types must be fp4
|
||||
if (type_a != ScalarType::Float4_e2m1fn_x2 || type_b != ScalarType::Float4_e2m1fn_x2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 2 scales, 2 recipes for each input
|
||||
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Need {Blockwise_1x16, e4m3 for scale[0], Tensorwise, fp32 for scale[1]}
|
||||
if (recipe_a[0] != ScalingType::BlockWise1x16) return false;
|
||||
if (scales_a[0].scalar_type() != ScalarType::Float8_e4m3fn) return false;
|
||||
if (recipe_b[0] != ScalingType::BlockWise1x16) return false;
|
||||
if (scales_b[0].scalar_type() != ScalarType::Float8_e4m3fn) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Both inputs must be fp8
|
||||
* A, B must only have 1 scale each, A: {Blockwise_1x128 (float), B: {Blockwise_128x128 (float)
|
||||
*/
|
||||
bool check_deepseek_recipe(ScalingType expected_recipe_a,
|
||||
ScalingType expected_recipe_b,
|
||||
c10::ScalarType type_a,
|
||||
std::vector<ScalingType>& recipe_a,
|
||||
ArrayRef<Tensor>& scales_a,
|
||||
c10::ScalarType type_b,
|
||||
std::vector<ScalingType>& recipe_b,
|
||||
ArrayRef<Tensor>& scales_b) {
|
||||
// both types must be fp8
|
||||
if (type_a != ScalarType::Float8_e4m3fn || type_b != ScalarType::Float8_e4m3fn) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 1 scales, 1 recipes for each input
|
||||
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Need {Blockwise_1x128, float} for A, {Blockwise_128x128, float} for B
|
||||
if (recipe_a[0] != expected_recipe_a) return false;
|
||||
if (scales_a[0].scalar_type() != ScalarType::Float) return false;
|
||||
if (recipe_b[0] != expected_recipe_b) return false;
|
||||
if (scales_b[0].scalar_type() != ScalarType::Float) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Both inputs must be fp8
|
||||
* A, B must have 1 scale each, {Blockwise_1x32, e8m0}
|
||||
*/
|
||||
bool check_mxfp8_recipe(c10::ScalarType type_a,
|
||||
std::vector<ScalingType>& recipe_a,
|
||||
ArrayRef<Tensor>& scales_a,
|
||||
c10::ScalarType type_b,
|
||||
std::vector<ScalingType>& recipe_b,
|
||||
ArrayRef<Tensor>& scales_b) {
|
||||
// both types must be fp8
|
||||
if (type_a != ScalarType::Float8_e4m3fn || type_b != ScalarType::Float8_e4m3fn) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 1 scales, 1 recipes for each input
|
||||
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Need {Blockwise_1x32, e8m0} for A & B
|
||||
if (recipe_a[0] != ScalingType::BlockWise1x32) return false;
|
||||
if (scales_a[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false;
|
||||
if (recipe_b[0] != ScalingType::BlockWise1x32) return false;
|
||||
if (scales_b[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;
|
||||
using namespace std::placeholders;
|
||||
|
||||
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 8> scale_kernel_dispatch = {{
|
||||
{ "tensorwise_tensorwise", check_tensorwise_recipe, ScaledGemmImplementation::TENSORWISE_TENSORWISE },
|
||||
{ "rowwise_rowwise", check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},
|
||||
{ "block_1x128_128x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise1x128, ScalingType::BlockWise128x128, _1, _2, _3, _4, _5, _6),
|
||||
ScaledGemmImplementation::BLOCK_1x128_128x128},
|
||||
{ "block_128x128_1x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise128x128, ScalingType::BlockWise1x128, _1, _2, _3, _4, _5, _6),
|
||||
ScaledGemmImplementation::BLOCK_128x128_1x128},
|
||||
{ "block_1x128_1x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise1x128, ScalingType::BlockWise1x128, _1, _2, _3, _4, _5, _6),
|
||||
ScaledGemmImplementation::BLOCK_1x128_1x128},
|
||||
{ "nvfp4_nvfp4", check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4},
|
||||
{ "nvfp4_nvfp4_single_scale", check_nvfp4_recipe_single_scale, ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE },
|
||||
{ "mxfp8_mxfp8", check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}};
|
||||
|
||||
Tensor&
|
||||
_cutlass_scaled_gemm(
|
||||
const Tensor& mat1, const Tensor& mat2,
|
||||
const Tensor& scale_a, const Tensor& scale_b,
|
||||
const ScalingType scaling_choice_a, const ScalingType scaling_choice_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, std::nullopt, scaling_choice_a, scaling_choice_b);
|
||||
const auto out_dtype_ = args.result->scalar_type();
|
||||
TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
|
||||
|
||||
#ifdef USE_ROCM
|
||||
auto tuning_ctx = at::cuda::tunable::getTuningContext();
|
||||
if (tuning_ctx->IsTunableOpEnabled()) {
|
||||
#define TUNABLE_DISPATCH(BLASOP_A, BLASOP_B) \
|
||||
if (mat1.scalar_type() == ScalarType::Float8_e4m3fnuz) { \
|
||||
if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \
|
||||
static at::cuda::tunable::ScaledGemmTunableOp< \
|
||||
at::Float8_e4m3fnuz, at::Float8_e4m3fnuz, scalar_t, \
|
||||
BLASOP_A, BLASOP_B> scaledgemm{}; \
|
||||
scaledgemm(¶ms); \
|
||||
} \
|
||||
else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \
|
||||
static at::cuda::tunable::ScaledGemmTunableOp< \
|
||||
at::Float8_e4m3fnuz, at::Float8_e5m2fnuz, scalar_t, \
|
||||
BLASOP_A, BLASOP_B> scaledgemm{}; \
|
||||
scaledgemm(¶ms); \
|
||||
} \
|
||||
} \
|
||||
else if (mat1.scalar_type() == ScalarType::Float8_e5m2fnuz) { \
|
||||
if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \
|
||||
static at::cuda::tunable::ScaledGemmTunableOp< \
|
||||
at::Float8_e5m2fnuz, at::Float8_e4m3fnuz, scalar_t, \
|
||||
BLASOP_A, BLASOP_B> scaledgemm{}; \
|
||||
scaledgemm(¶ms); \
|
||||
} \
|
||||
else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \
|
||||
static at::cuda::tunable::ScaledGemmTunableOp< \
|
||||
at::Float8_e5m2fnuz, at::Float8_e5m2fnuz, scalar_t, \
|
||||
BLASOP_A, BLASOP_B> scaledgemm{}; \
|
||||
scaledgemm(¶ms); \
|
||||
} \
|
||||
} \
|
||||
else if (mat1.scalar_type() == ScalarType::Float8_e4m3fn) { \
|
||||
if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \
|
||||
static at::cuda::tunable::ScaledGemmTunableOp< \
|
||||
at::Float8_e4m3fn, at::Float8_e4m3fn, scalar_t, \
|
||||
BLASOP_A, BLASOP_B> scaledgemm{}; \
|
||||
scaledgemm(¶ms); \
|
||||
} \
|
||||
else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \
|
||||
static at::cuda::tunable::ScaledGemmTunableOp< \
|
||||
at::Float8_e4m3fn, at::Float8_e5m2, scalar_t, \
|
||||
BLASOP_A, BLASOP_B> scaledgemm{}; \
|
||||
scaledgemm(¶ms); \
|
||||
} \
|
||||
} \
|
||||
else if (mat1.scalar_type() == ScalarType::Float8_e5m2) { \
|
||||
if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \
|
||||
static at::cuda::tunable::ScaledGemmTunableOp< \
|
||||
at::Float8_e5m2, at::Float8_e4m3fn, scalar_t, \
|
||||
BLASOP_A, BLASOP_B> scaledgemm{}; \
|
||||
scaledgemm(¶ms); \
|
||||
} \
|
||||
else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \
|
||||
static at::cuda::tunable::ScaledGemmTunableOp< \
|
||||
at::Float8_e5m2, at::Float8_e5m2, scalar_t, \
|
||||
BLASOP_A, BLASOP_B> scaledgemm{}; \
|
||||
scaledgemm(¶ms); \
|
||||
} \
|
||||
}
|
||||
AT_DISPATCH_V2(out_dtype_, "_tunable_scaled_gemm", AT_WRAP([&] {
|
||||
bool transa_ = ((args.transa != 'n') && (args.transa != 'N'));
|
||||
bool transb_ = ((args.transb != 'n') && (args.transb != 'N'));
|
||||
at::cuda::tunable::ScaledGemmParams<scalar_t> params;
|
||||
params.transa = args.transa;
|
||||
params.transb = args.transb;
|
||||
params.m = args.m;
|
||||
params.n = args.n;
|
||||
params.k = args.k;
|
||||
params.a = args.mata->data_ptr();
|
||||
params.a_scale_ptr = args.scale_mata_ptr;
|
||||
params.a_scale_dtype = args.scale_mata_dtype.value();
|
||||
params.lda = args.lda;
|
||||
params.a_dtype = args.mata->scalar_type();
|
||||
params.a_scale_dtype = args.scale_mata_dtype.value();
|
||||
params.a_scaling_type = args.scaling_mata_type.value();
|
||||
params.b = args.matb->data_ptr();
|
||||
params.b_scale_ptr = args.scale_matb_ptr;
|
||||
params.b_scale_dtype = args.scale_matb_dtype.value();
|
||||
params.ldb = args.ldb;
|
||||
params.b_dtype = args.matb->scalar_type();
|
||||
params.b_scale_dtype = args.scale_matb_dtype.value();
|
||||
params.b_scaling_type = args.scaling_matb_type.value();
|
||||
params.bias_ptr = bias ? bias->data_ptr(): nullptr;
|
||||
params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_;
|
||||
params.c = args.result->data_ptr();
|
||||
params.c_scale_ptr = args.scale_result_ptr;
|
||||
params.ldc = args.result_ld;
|
||||
params.c_dtype = out_dtype_;
|
||||
params.use_fast_accum = use_fast_accum;
|
||||
if (transa_ && transb_) {
|
||||
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T)
|
||||
}
|
||||
else if (transa_ && !transb_) {
|
||||
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::N)
|
||||
}
|
||||
else if (!transa_ && transb_) {
|
||||
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::T)
|
||||
}
|
||||
else if (!transa_ && !transb_) {
|
||||
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::N)
|
||||
}
|
||||
else {
|
||||
TORCH_CHECK(false, "unreachable");
|
||||
}
|
||||
}),
|
||||
kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_FLOATING_TYPES));
|
||||
#undef TUNABLE_DISPATCH
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
at::cuda::blas::scaled_gemm(
|
||||
args.transa,
|
||||
args.transb,
|
||||
args.m,
|
||||
args.n,
|
||||
args.k,
|
||||
args.mata->data_ptr(),
|
||||
args.scale_mata_ptr,
|
||||
args.lda,
|
||||
args.mata->scalar_type(),
|
||||
args.scale_mata_dtype.value(),
|
||||
args.scaling_mata_type.value(),
|
||||
args.matb->data_ptr(),
|
||||
args.scale_matb_ptr,
|
||||
args.ldb,
|
||||
args.matb->scalar_type(),
|
||||
args.scale_matb_dtype.value(),
|
||||
args.scaling_matb_type.value(),
|
||||
bias ? bias->data_ptr(): nullptr,
|
||||
bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_,
|
||||
args.result->data_ptr(),
|
||||
args.scale_result_ptr,
|
||||
args.result_ld,
|
||||
out_dtype_,
|
||||
use_fast_accum);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
Tensor&
|
||||
_scaled_tensorwise_tensorwise(
|
||||
const Tensor& mat_a, const Tensor& mat_b,
|
||||
const Tensor& scale_a, const Tensor& scale_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are fp32
|
||||
//
|
||||
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(), mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(scale_a.numel() == 1 && scale_a.scalar_type() == kFloat, "scale_a must have 1 Float element")
|
||||
TORCH_CHECK_VALUE(scale_b.numel() == 1 && scale_b.scalar_type() == kFloat, "scale_b must have 1 Float element")
|
||||
|
||||
auto scaling_choice_a = ScalingType::TensorWise;
|
||||
auto scaling_choice_b = ScalingType::TensorWise;
|
||||
|
||||
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
Tensor&
|
||||
_scaled_rowwise_rowwise(
|
||||
const Tensor& mat_a, const Tensor& mat_b,
|
||||
const Tensor& scale_a, const Tensor& scale_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are fp32, shape M/N for A/B
|
||||
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(), mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1, "scale_a must have shape [", mat_a.size(0), ", 1], got [", scale_a.sizes(), "]");
|
||||
TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel())
|
||||
TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel())
|
||||
|
||||
TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1));
|
||||
TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
|
||||
|
||||
auto scaling_choice_a = ScalingType::RowWise;
|
||||
auto scaling_choice_b = ScalingType::RowWise;
|
||||
//
|
||||
// NVIDIA's cuBLAS only started supporting row-wise scaling in version 12.9,
|
||||
// and only for compute capability 9.0+. In other cases we use CUTLASS.
|
||||
#ifndef USE_ROCM
|
||||
// We are doing row-wise scaling
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
if (((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
|
||||
// cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
|
||||
|| (dprops->major == 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
|
||||
TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
|
||||
at::cuda::detail::f8f8bf16_rowwise(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a,
|
||||
scale_b,
|
||||
bias,
|
||||
use_fast_accum,
|
||||
out);
|
||||
return out;
|
||||
}
|
||||
#else
|
||||
|
||||
// For ROCm, match behavior of f8f8bf16_rowwise type checking, for unit test purposes.
|
||||
//Tensor b = mat_b;
|
||||
if (_scaled_mm_is_fnuz()) {
|
||||
TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fnuz, "expected mat_b.dtype() to be at::kFloat8_e4m3fnuz, but got ", mat_b.dtype());
|
||||
}
|
||||
else {
|
||||
TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fn, "expected mat_b.dtype() to be at::kFloat8_e4m3fn, but got ", mat_b.dtype());
|
||||
}
|
||||
// Until more than bf16 is supported.
|
||||
TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16,
|
||||
"hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
|
||||
#endif
|
||||
|
||||
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
Tensor&
|
||||
_scaled_block1x128_block1x128(
|
||||
const Tensor& mat_a, const Tensor& mat_b,
|
||||
const Tensor& scale_a, const Tensor& scale_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
const bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are fp32, shape K//128
|
||||
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(), mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
|
||||
"scale_a must have shape ", mat_a.sizes()[0], " x ", mat_a.sizes()[1] / 128, " Float elements, got ", scale_a.sizes())
|
||||
TORCH_CHECK_VALUE(scale_b.sizes()[0] == ceil_div<int64_t>(mat_b.sizes()[0], 128) && scale_b.sizes()[1] == mat_b.sizes()[1] && scale_b.scalar_type() == kFloat,
|
||||
"scale_b must have shape ", ceil_div<int64_t>(mat_b.sizes()[0], 128), " x ", mat_b.sizes()[1], " Float elements, got ", scale_b.sizes())
|
||||
|
||||
TORCH_CHECK(scale_a.stride(0) == 1, "expected scale_a.stride(0) to be 1, but got ", scale_a.stride(0));
|
||||
TORCH_CHECK(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
|
||||
TORCH_CHECK(scale_b.stride(0) == scale_b.size(1),
|
||||
"expected scale_b.stride(0) to be ", scale_b.size(1), ", but got ", scale_b.size(1));
|
||||
|
||||
auto scaling_choice_a = ScalingType::BlockWise1x128;
|
||||
auto scaling_choice_b = ScalingType::BlockWise1x128;
|
||||
|
||||
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
Tensor&
|
||||
_scaled_block128x128_block1x128(
|
||||
const Tensor& mat_a, const Tensor& mat_b,
|
||||
const Tensor& scale_a, const Tensor& scale_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
const bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are fp32, shape K//128
|
||||
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(), mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(scale_a.sizes()[0] == ceil_div<int64_t>(mat_a.sizes()[0], 128) && scale_a.sizes()[1] == ceil_div<int64_t>(mat_a.sizes()[1], 128) && scale_a.scalar_type() == kFloat,
|
||||
"scale_a must have shape ", ceil_div<int64_t>(mat_a.sizes()[0], 128), " x ", ceil_div<int64_t>(mat_a.sizes()[1], 128), " Float elements, got ", scale_a.sizes())
|
||||
TORCH_CHECK_VALUE(scale_b.sizes()[0] == ceil_div<int64_t>(mat_b.sizes()[0], 128) && scale_b.sizes()[1] == mat_b.sizes()[1] && scale_b.scalar_type() == kFloat,
|
||||
"scale_b must have shape ", ceil_div<int64_t>(mat_b.sizes()[0], 128), " x ", mat_b.sizes()[1], " Float elements, got ", scale_b.sizes())
|
||||
|
||||
TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1));
|
||||
TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
|
||||
TORCH_CHECK_VALUE(scale_b.stride(0) == scale_b.size(1),
|
||||
"expected scale_b.stride(0) to be ", scale_b.size(1), ", but got ", scale_b.stride(0));
|
||||
|
||||
auto scaling_choice_a = ScalingType::BlockWise128x128;
|
||||
auto scaling_choice_b = ScalingType::BlockWise1x128;
|
||||
|
||||
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
Tensor&
|
||||
_scaled_block1x128_block128x128(
|
||||
const Tensor& mat_a, const Tensor& mat_b,
|
||||
const Tensor& scale_a, const Tensor& scale_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
const bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are fp32, A: shape K//128, B: K//128, N//128
|
||||
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(), mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
|
||||
"scale_a must have shape ", mat_a.sizes()[0], " x ", mat_a.sizes()[1] / 128, " Float elements, got ", scale_a.sizes())
|
||||
TORCH_CHECK_VALUE(scale_b.sizes()[0] == mat_b.sizes()[0] / 128 && scale_b.sizes()[1] == mat_b.sizes()[1] / 128 && scale_b.scalar_type() == kFloat,
|
||||
"scale_b must have shape ", mat_b.sizes()[0] / 128, " x ", mat_b.sizes()[1] / 128, " Float elements, got ", scale_b.sizes())
|
||||
|
||||
TORCH_CHECK_VALUE(scale_a.stride(0) == 1, "expected scale_a.stride(0) to be 1, but got ", scale_a.stride(0));
|
||||
TORCH_CHECK_VALUE(scale_b.stride(0) == 1, "expected scale_b.stride(0) to be 1, but got ", scale_b.stride(0));
|
||||
TORCH_CHECK_VALUE(scale_b.stride(1) == scale_b.size(0),
|
||||
"expected scale_b.stride(1) to be ", scale_b.size(0), ", but got ", scale_b.stride(1));
|
||||
|
||||
auto scaling_choice_a = ScalingType::BlockWise1x128;
|
||||
auto scaling_choice_b = ScalingType::BlockWise128x128;
|
||||
|
||||
_cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
Tensor&
|
||||
_scaled_mxfp8_mxfp8(
|
||||
const Tensor& mat_a, const Tensor& mat_b,
|
||||
const Tensor& scale_a, const SwizzleType swizzle_a,
|
||||
const Tensor& scale_b, const SwizzleType swizzle_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
Tensor& out) {
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are e8m0, A: shape K//32, B: K, N//32
|
||||
// Scales must be swizzled
|
||||
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(), mat_b.scalar_type());
|
||||
|
||||
auto scale_a_elems = round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_a.size(1), 32), 4);
|
||||
auto scale_b_elems = round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_b.size(0), 32), 4);
|
||||
TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(),
|
||||
"For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel());
|
||||
TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(),
|
||||
"For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel());
|
||||
|
||||
TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format");
|
||||
TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format");
|
||||
|
||||
TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(),
|
||||
"For Blockwise scaling both scales should be contiguous");
|
||||
|
||||
TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out_dtype);
|
||||
|
||||
auto scaling_choice_a = ScalingType::BlockWise1x32;
|
||||
auto scaling_choice_b = ScalingType::BlockWise1x32;
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#if ROCM_VERSION >= 70000
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
|
||||
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
|
||||
|
||||
TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 &&
|
||||
mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0,
|
||||
"Matrix dimensions must be multiples of 32 for block-wise scaling");
|
||||
|
||||
TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 ||
|
||||
out.scalar_type() == ScalarType::Half,
|
||||
"Block-wise scaling only supports BFloat16 or Half output types");
|
||||
#else
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return _cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
|
||||
}
|
||||
|
||||
Tensor&
|
||||
_scaled_nvfp4_nvfp4(
|
||||
const Tensor& mat_a, const Tensor& mat_b,
|
||||
const Tensor& scale_a, const SwizzleType swizzle_a,
|
||||
const Tensor& scale_b, const SwizzleType swizzle_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
const bool single_scale,
|
||||
Tensor& out) {
|
||||
#ifdef USE_ROCM
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "NVFP4 scaling not supported on ROCM");
|
||||
#endif
|
||||
TORCH_CHECK_VALUE(single_scale, "Only single-scaled NVFP4 currently supported");
|
||||
// Restrictions:
|
||||
// A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
|
||||
// Scales must be swizzled
|
||||
TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",
|
||||
mat_a.scalar_type(), mat_b.scalar_type());
|
||||
// Note: fp4x2 format, need to double the K dimension for checking purposes.
|
||||
auto scale_a_elems = round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_a.size(1) * 2, 16), 4);
|
||||
auto scale_b_elems = round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_b.size(0) * 2, 16), 4);
|
||||
TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(),
|
||||
"For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel());
|
||||
TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(),
|
||||
"For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel());
|
||||
|
||||
TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format");
|
||||
TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format");
|
||||
|
||||
TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(),
|
||||
"For Blockwise scaling both scales should be contiguous");
|
||||
|
||||
auto scaling_choice_a = ScalingType::BlockWise1x16;
|
||||
auto scaling_choice_b = ScalingType::BlockWise1x16;
|
||||
return _cutlass_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
|
||||
}
|
||||
|
||||
|
||||
// V2: Computes matrix multiply + bias while applying scaling to input and output matrices
|
||||
// Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
|
||||
// If output matrix type is 16 or 32-bit type, scale_result is not applied.
|
||||
// Known limitations:
|
||||
// - Only works if mat1 is row-major and mat2 is column-major
|
||||
// - Only works if matrices sizes are divisible by 32
|
||||
// - If 1-dimensional tensors are used then scale_a should be size = mat1.size(0)
|
||||
// and scale_b should have size = to mat2.size(1)
|
||||
// Arguments:
|
||||
// - `mat1`: the first operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2`
|
||||
// - `mat2`: the second operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2`
|
||||
// - `scale_a`: a tensor with the inverse scale of `mat1`, whose shape/strides/dtype depend on the scaling scheme
|
||||
// - `scale_recipe_a`: An integer corresponding to an enum describing the scaling scheme used for `scale_a`
|
||||
// - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing the swizzling scheme for `scale_a`
|
||||
// - `scale_b`: a tensor with the inverse scale of `mat2`, whose shape/strides/dtype depend on the scaling scheme
|
||||
// - `scale_recipe_b`: An integer corresponding to an enum describing the scaling scheme used for `scale_b`
|
||||
// - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing the swizzling scheme for `scale_b`
|
||||
// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
|
||||
// - `out_dtype`: the output dtype, can either be a float8 or a higher precision floating point type
|
||||
// - `use_fast_accum`: if true, enables fast float8 accumulation. Backends may ignore this option if not applicable.
|
||||
// - `out`: a reference to the output tensor
|
||||
Tensor&
|
||||
_scaled_mm_cuda_v2_out(
|
||||
const Tensor& mat_a, const Tensor& mat_b,
|
||||
ArrayRef<Tensor> scale_a,
|
||||
IntArrayRef scale_recipe_a,
|
||||
IntArrayRef swizzle_a,
|
||||
ArrayRef<Tensor> scale_b,
|
||||
IntArrayRef scale_recipe_b,
|
||||
IntArrayRef swizzle_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const std::optional<c10::ScalarType> out_dtype,
|
||||
IntArrayRef contraction_dim,
|
||||
bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
// Check sizes
|
||||
bool allowed_device = _scaled_mm_allowed_device();
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(allowed_device,
|
||||
"torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+");
|
||||
TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix");
|
||||
TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix");
|
||||
|
||||
// If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm kernels
|
||||
// do not support this case).
|
||||
if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) {
|
||||
// `out` was created with `at::empty`. In the case where we are multiplying
|
||||
// MxK by KxN and K is the zero dim, we need to initialize here to properly
|
||||
// return a tensor of zeros.
|
||||
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
|
||||
if (mat_a.size(1) == 0) {
|
||||
out.zero_();
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
// Check if the input matrix sizes can be multiplied
|
||||
// - if optional contraction dims are provided, use those
|
||||
// -- mostly for < 1B formats (i.e. nvfp4x2) where cheap .t() is not available.
|
||||
if (contraction_dim.size() > 0) {
|
||||
TORCH_CHECK_VALUE(contraction_dim.size() == 2, "contraction_dim must have exactly 2 elements");
|
||||
auto mat_a_dim = contraction_dim[0];
|
||||
auto mat_b_dim = contraction_dim[1];
|
||||
TORCH_CHECK_VALUE(
|
||||
mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim), "mat_a and mat_b shapes cannot be multiplied (",
|
||||
mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ") ",
|
||||
"with contraction dims mat_a: ", mat_a_dim, ", mat_b: ", mat_b_dim);
|
||||
} else {
|
||||
TORCH_CHECK_VALUE(
|
||||
mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied (",
|
||||
mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ")");
|
||||
}
|
||||
|
||||
TORCH_CHECK_VALUE(!bias || bias->numel() == mat_b.sizes()[1], "Bias must be size ", mat_b.sizes()[1],
|
||||
" but got ", bias->numel());
|
||||
TORCH_CHECK_VALUE(
|
||||
mat_a.sizes()[1] % 16 == 0,
|
||||
"Expected trailing dimension of mat1 to be divisible by 16 ",
|
||||
"but got mat1 shape: (",
|
||||
mat_a.sizes()[0],
|
||||
"x",
|
||||
mat_a.sizes()[1],
|
||||
").");
|
||||
TORCH_CHECK_VALUE(mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x",
|
||||
mat_b.sizes()[1], ") must be divisible by 16");
|
||||
|
||||
// TODO(slayton): Existing checks, not sure if they should really be here.
|
||||
TORCH_CHECK_VALUE(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
|
||||
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) || mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2,
|
||||
"Expected mat_a to be Float8 or Float4_x2 matrix got ", mat_a.scalar_type());
|
||||
TORCH_CHECK_VALUE(isFloat8Type(mat_b.scalar_type()) || mat_b.scalar_type() == ScalarType::Float4_e2m1fn_x2,
|
||||
"Expected mat_b to be Float8 or Float4_x2 matrix got ", mat_b.scalar_type());
|
||||
#ifndef USE_ROCM
|
||||
// Type restrictions imposed by CuBLASLt as of CUDA-12.1
|
||||
TORCH_CHECK_VALUE(mat_a.scalar_type() != ScalarType::Float8_e5m2 || mat_b.scalar_type() != ScalarType::Float8_e5m2,
|
||||
"Multiplication of two Float8_e5m2 matrices is not supported");
|
||||
#endif
|
||||
if (use_fast_accum) {
|
||||
TORCH_CHECK_VALUE(mat_a.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat_b.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat_a` or `mat_b` tensors have the `Float4_e2m1fn_x2` dtype.");
|
||||
}
|
||||
#ifdef USE_ROCM
|
||||
if (mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2 || mat_b.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 70000,
|
||||
"Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above");
|
||||
}
|
||||
if (mat_a.scalar_type() == ScalarType::Float8_e5m2 || mat_b.scalar_type() == ScalarType::Float8_e5m2) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 60500,
|
||||
"Float8_e5m2 is only supported for ROCm 6.5 and above");
|
||||
}
|
||||
if (mat_a.scalar_type() == ScalarType::Float8_e4m3fn || mat_b.scalar_type() == ScalarType::Float8_e4m3fn) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 60500,
|
||||
"Float8_e4m3fn is only supported for ROCm 6.5 and above");
|
||||
}
|
||||
#endif
|
||||
if (bias) {
|
||||
TORCH_CHECK_VALUE(out.scalar_type() != kFloat,
|
||||
"Bias is not supported when out_dtype is set to Float32");
|
||||
|
||||
TORCH_CHECK_VALUE(bias->scalar_type() == ScalarType::BFloat16 ||
|
||||
bias->scalar_type() == ScalarType::Half,
|
||||
"Bias must be BFloat16 or Half, but got ", bias->scalar_type());
|
||||
|
||||
TORCH_CHECK_VALUE((out.scalar_type() != kFloat &&
|
||||
out.scalar_type() != ScalarType::BFloat16) ||
|
||||
bias->scalar_type() == ScalarType::BFloat16,
|
||||
"Bias must be BFloat16 to compute ", out.scalar_type(),
|
||||
" output, but got ", bias->scalar_type());
|
||||
|
||||
TORCH_CHECK_VALUE(out.scalar_type() != ScalarType::Half ||
|
||||
bias->scalar_type() == ScalarType::Half,
|
||||
"Bias must be Float16 to compute ", out.scalar_type(),
|
||||
" output, but got ", bias->scalar_type());
|
||||
}
|
||||
{
|
||||
auto bias_ = bias.value_or(Tensor());
|
||||
|
||||
// NOLINTNEXTLINE(*c-array*)
|
||||
TensorArg targs[]{{out, "out", 0}, {mat_a, "mat_a", 1}, {mat_b, "mat_b", 2},
|
||||
{bias_, "bias", 3}, {scale_a[0], "scale_a", 4}, {scale_b[0], "scale_b", 5}};
|
||||
checkAllSameGPU(__func__, targs);
|
||||
}
|
||||
|
||||
auto out_dtype_ = out_dtype.value_or(at::ScalarType::BFloat16);
|
||||
|
||||
// Conversion of implicitly-defined enums to explicit
|
||||
auto scale_recipe_a_enum = convert_int_to_enum<ScalingType>(scale_recipe_a);
|
||||
auto swizzle_a_enum = convert_int_to_enum<SwizzleType>(swizzle_a);
|
||||
auto scale_recipe_b_enum = convert_int_to_enum<ScalingType>(scale_recipe_b);
|
||||
auto swizzle_b_enum = convert_int_to_enum<SwizzleType>(swizzle_b);
|
||||
|
||||
// at this point we can start working out what we want to be doing
|
||||
// Try to do as few steps as possible.
|
||||
// NOTE: support is deliberately sparse, can explicitly enumerate all combinations allowed.
|
||||
// Do this via a list of defined (name, acceptance, concrete_impl) tuples.
|
||||
bool found_impl = false;
|
||||
ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE;
|
||||
|
||||
for (const auto& fn_entry : scale_kernel_dispatch) {
|
||||
const auto [name, accept_fn, scaled_gemm_impl] = fn_entry;
|
||||
bool ok = accept_fn(mat_a.scalar_type(),
|
||||
scale_recipe_a_enum,
|
||||
scale_a,
|
||||
mat_b.scalar_type(),
|
||||
scale_recipe_b_enum,
|
||||
scale_b);
|
||||
if (ok) {
|
||||
gemm_impl = scaled_gemm_impl;
|
||||
found_impl = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
TORCH_CHECK_VALUE(
|
||||
found_impl,
|
||||
"Invalid scaling configuration.\n"
|
||||
"- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n"
|
||||
"- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (", mat_a.size(0), ", 1) and scale_b should be (1, ", mat_b.size(1), "), and both should be contiguous.\n"
|
||||
"- For BlockWise 1x128 scaling, a and b should be float8, scales should be float, scale_a should be (", mat_a.size(0), ", ", ceil_div<int64_t>(mat_a.size(1), 128), ") and scale_b should be (", ceil_div<int64_t>(mat_b.size(0), 128), ", ", mat_b.size(1), "), and both should be outer-dim-major.\n"
|
||||
"- For BlockWise 128x128 scaling, a and b should be float8, scales should be float, scale_a should be (", ceil_div<int64_t>(mat_a.size(0), 128), ", ", ceil_div<int64_t>(mat_a.size(1), 128), ") and scale_b should be (", ceil_div<int64_t>(mat_b.size(0), 128), ", ", ceil_div<int64_t>(mat_b.size(1), 128), "), and both should be near-inner-dim-major (with 16-byte aligned strides).\n"
|
||||
"- For Blockwise 1x32 scaling, a and b should be float8, scales should be float8_e8m0fnu, scale_a should have ", round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_a.size(1), 32), 4), " elements and scale_b should have ", round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_b.size(0), 32), 4), " elements, and both should be contiguous.\n"
|
||||
"- For Blockwise 1x16 scaling, a and b should be float4 (packed 2x), scales should be float8_e4m3fn, scale_a should have ", round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_a.size(1) * 2, 16), 4), " elements and scale_b should have ", round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(mat_b.size(0) * 2, 16), 4), " elements, and both should be contiguous.\n"
|
||||
"Got mat_a.dtype()=", mat_a.scalar_type(), ", scale_a[0].dtype()=", scale_a[0].scalar_type(), ", scale_a[0].size()=", scale_a[0].sizes(), ", scale_a[0].stride()=", scale_a[0].strides(), ", ",
|
||||
"mat_b.dtype()=", mat_b.scalar_type(), ", scale_b[0].dtype()=", scale_b[0].scalar_type(), ", scale_b[0].size()=", scale_b[0].sizes(), " and scale_b[0].stride()=", scale_b[0].strides()
|
||||
);
|
||||
|
||||
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
|
||||
|
||||
auto bias_ = bias.value_or(Tensor());
|
||||
|
||||
// dispatch to appropriate lower-level calls for error checking & execution
|
||||
if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) {
|
||||
return _scaled_tensorwise_tensorwise(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
|
||||
} else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) {
|
||||
return _scaled_rowwise_rowwise(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
|
||||
} else if (gemm_impl == ScaledGemmImplementation::BLOCK_128x128_1x128) {
|
||||
return _scaled_block128x128_block1x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
|
||||
} else if (gemm_impl == ScaledGemmImplementation::BLOCK_1x128_128x128) {
|
||||
return _scaled_block1x128_block128x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
|
||||
} else if (gemm_impl == ScaledGemmImplementation::BLOCK_1x128_1x128) {
|
||||
return _scaled_block1x128_block1x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out);
|
||||
} else if (gemm_impl == ScaledGemmImplementation::MXFP8_MXFP8) {
|
||||
return _scaled_mxfp8_mxfp8(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
|
||||
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Only single-scale NVFP4 currently supported");
|
||||
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE) {
|
||||
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, true /* single_scale */, out);
|
||||
} else {
|
||||
TORCH_CHECK_VALUE(false, "Invalid state - found an implementation, but not really");
|
||||
}
|
||||
}
|
||||
|
||||
Tensor
|
||||
_scaled_mm_cuda_v2(
|
||||
const Tensor& mat_a, const Tensor& mat_b,
|
||||
ArrayRef<Tensor> scale_a,
|
||||
IntArrayRef scale_recipe_a,
|
||||
IntArrayRef swizzle_a,
|
||||
ArrayRef<Tensor> scale_b,
|
||||
IntArrayRef scale_recipe_b,
|
||||
IntArrayRef swizzle_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const std::optional<c10::ScalarType> out_dtype,
|
||||
IntArrayRef contraction_dim,
|
||||
bool use_fast_accum) {
|
||||
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
|
||||
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
|
||||
|
||||
return _scaled_mm_cuda_v2_out(
|
||||
mat_a, mat_b,
|
||||
scale_a, scale_recipe_a, swizzle_a,
|
||||
scale_b, scale_recipe_b, swizzle_b,
|
||||
bias,
|
||||
out_dtype,
|
||||
contraction_dim,
|
||||
use_fast_accum,
|
||||
out);
|
||||
}
|
||||
|
||||
Tensor
|
||||
_scaled_grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,
|
||||
|
||||
@ -59,7 +59,7 @@ constexpr uint64_t getDefaultMaxThreadsPerBlock() {
|
||||
#ifdef USE_ROCM
|
||||
#define SKIP_SORTED_INDICES 32
|
||||
template <typename scalar_t, int SZ>
|
||||
__global__ void indexing_backward_kernel_many_indices(
|
||||
__global__ void indexing_backward_kernel(
|
||||
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
|
||||
int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim, bool accumulate) {
|
||||
using opmath_t = at::opmath_type<scalar_t>;
|
||||
@ -254,8 +254,7 @@ __global__ void indexing_backward_kernel_stride_1(
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
template <typename scalar_t, int SZ>
|
||||
__global__ void indexing_backward_kernel(
|
||||
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
|
||||
@ -334,7 +333,6 @@ __global__ void indexing_backward_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef USE_ROCM
|
||||
template <typename scalar_t>
|
||||
__global__ void indexing_backward_kernel_stride_1(
|
||||
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
|
||||
@ -782,43 +780,11 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
|
||||
kBool,
|
||||
kBFloat16);
|
||||
} else {
|
||||
#ifdef USE_ROCM
|
||||
if (num_indices >= 200000)
|
||||
AT_DISPATCH_V2(
|
||||
expandedValue.scalar_type(),
|
||||
"indexing_backward_many_indices",
|
||||
AT_WRAP([&] {
|
||||
indexing_backward_kernel_many_indices<scalar_t, UNROLL><<<new_grid, block, smem_dups_size, stream>>>(
|
||||
sorted_indices.const_data_ptr<int64_t>(),
|
||||
orig_indices.const_data_ptr<int64_t>(),
|
||||
expandedValue.const_data_ptr<scalar_t>(),
|
||||
src_.mutable_data_ptr<scalar_t>(),
|
||||
num_indices,
|
||||
sliceSize,
|
||||
strideBefore,
|
||||
nElemBefore,
|
||||
accumulate);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}),
|
||||
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
|
||||
// AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
|
||||
// should not be supported here, then reenable AT_FLOAT8_DTYPES
|
||||
kFloat8_e4m3fn,
|
||||
kFloat8_e5m2,
|
||||
kFloat8_e4m3fnuz,
|
||||
kFloat8_e5m2fnuz,
|
||||
kComplexHalf,
|
||||
kHalf,
|
||||
kBool,
|
||||
kBFloat16);
|
||||
else
|
||||
#endif
|
||||
AT_DISPATCH_V2(
|
||||
expandedValue.scalar_type(),
|
||||
"indexing_backward",
|
||||
AT_WRAP([&] {
|
||||
indexing_backward_kernel<scalar_t, UNROLL><<<grid, block, 0, stream>>>(
|
||||
indexing_backward_kernel<scalar_t, UNROLL><<<KERNEL_GRID, block, KERNEL_SMEM, stream>>>(
|
||||
sorted_indices.const_data_ptr<int64_t>(),
|
||||
orig_indices.const_data_ptr<int64_t>(),
|
||||
expandedValue.const_data_ptr<scalar_t>(),
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user