mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-23 14:59:34 +08:00
Compare commits
180 Commits
aoti_packa
...
DynamoFixG
Author | SHA1 | Date | |
---|---|---|---|
c4d369369f | |||
a13f24980e | |||
6869487ca4 | |||
5d9105f2ca | |||
191e6bb367 | |||
a15a08725b | |||
756ea14378 | |||
d7c5ea03df | |||
d11e253ee3 | |||
01d5211679 | |||
b496a04735 | |||
03be8d227b | |||
df1b8c3e41 | |||
94f39d5749 | |||
2eb8b70d1b | |||
29680dd928 | |||
69bcc97937 | |||
babac1d561 | |||
8594b98b0a | |||
b3fc84229e | |||
e409e84a7a | |||
9c3742e7a7 | |||
664a137dbb | |||
4f5a0deb83 | |||
4752d8fec9 | |||
715f0a26d7 | |||
e9e2553603 | |||
43fac7f55d | |||
a875f27482 | |||
f34e0a941a | |||
81dbeb06f4 | |||
7a1ead755f | |||
90b4e130d6 | |||
4308b8a28f | |||
94b1ec8c7c | |||
054268c9eb | |||
af40828bbb | |||
5a1fbf45ad | |||
aed5ed1076 | |||
af4c29fea8 | |||
486b4d2414 | |||
8f83b3e71c | |||
f0c9f3bddb | |||
1d182dd81c | |||
0b15f7ae05 | |||
f1229b6db9 | |||
b1ac252f55 | |||
5ba11df4f8 | |||
15800888b6 | |||
e7ed1a00eb | |||
2982406721 | |||
005c3d449e | |||
b2b3947565 | |||
81994b08a0 | |||
71aefd5595 | |||
001e1d2637 | |||
e0cb1848d0 | |||
a4110fedcf | |||
37c6087334 | |||
0b85236477 | |||
4c0fec3e4d | |||
fdc622b513 | |||
91b9484264 | |||
5c827a4133 | |||
83458197d1 | |||
0b01ff4de0 | |||
01f3a43462 | |||
f332017294 | |||
0a3e4e894c | |||
73adac05d1 | |||
0d39ecb2ce | |||
90c0825e2d | |||
fd4bde430a | |||
b5e93ffdcf | |||
f8d0d65ddc | |||
f46ddb1e65 | |||
20082d7136 | |||
7158aa22e8 | |||
2035f6b2e6 | |||
2b58adc3bd | |||
322091d8d8 | |||
2bb4e6876c | |||
56ef7743fc | |||
64108bdbed | |||
c855f8632e | |||
12d2ef557f | |||
65aa62d50d | |||
6a09f9306c | |||
19bf67be32 | |||
1927783aa3 | |||
184817c7a8 | |||
da903b6a8b | |||
f76fdcaaf8 | |||
608792153f | |||
086dec3235 | |||
ad7b2bebc6 | |||
d444384003 | |||
3040a5d294 | |||
97463d4cf3 | |||
c813617c53 | |||
e659661ffa | |||
41808b2ba9 | |||
c0510dc447 | |||
9ec10dc26a | |||
43fc859625 | |||
f713abab16 | |||
bd3b98a8a5 | |||
e98c4e835b | |||
7b15534434 | |||
c32118dc3e | |||
e3ae80fc03 | |||
483f4e0db9 | |||
d1a62c8036 | |||
6861a27062 | |||
955f21dc2c | |||
9f5e1beaf3 | |||
2e027e8742 | |||
1e42fde45e | |||
f505caa71b | |||
65f10becdf | |||
df640df68a | |||
4c3c0ef2f1 | |||
bc33b10202 | |||
2855a045b3 | |||
9ecd092bd9 | |||
078d475d3b | |||
f37a6523ef | |||
b13cd141b3 | |||
5e47b4dd60 | |||
ee5389d520 | |||
ab01a0d7d3 | |||
801e282f39 | |||
87c9fbda22 | |||
3cc8af2d67 | |||
1fb072ac2a | |||
cac5e13e13 | |||
68350660ee | |||
ef7e2ca77e | |||
cdaaf3e4a3 | |||
0ea59c3c55 | |||
8f705d019a | |||
4bcc05777e | |||
2a6cdba6e5 | |||
53f6cc7529 | |||
ac901bf79a | |||
c965d6dbb2 | |||
ac08556f67 | |||
5fe7f29b9e | |||
ded099ecbf | |||
63fcc3e6c4 | |||
fd3e15c14f | |||
ff5faa744a | |||
4725871a81 | |||
bcd96cc6ff | |||
50e077beaa | |||
56d66ac0d7 | |||
49f7d8d19d | |||
afee8062d5 | |||
e89d12bf5d | |||
d4752bc7f6 | |||
44a5d41993 | |||
361c5d362c | |||
1fc71d1b57 | |||
8f54e27e5d | |||
8c0bc879b9 | |||
746fe78ecd | |||
b63bbe1661 | |||
3912ba3e94 | |||
cfc5cc17dc | |||
fdc8ccc5bc | |||
48b54b45d6 | |||
6861fa43e5 | |||
c1f40d33c8 | |||
7e7ac2039d | |||
23ab6a45e5 | |||
b558c986e8 | |||
415e641572 | |||
11f5f65686 | |||
af32d16a71 | |||
ba480d6bf7 |
@ -37,9 +37,9 @@ case ${DOCKER_TAG_PREFIX} in
|
||||
rocm*)
|
||||
BASE_TARGET=rocm
|
||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||
# add gfx950 conditionally starting in ROCm 7.0
|
||||
# add gfx950, gfx115x conditionally starting in ROCm 7.0
|
||||
if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
|
||||
fi
|
||||
EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
|
||||
;;
|
||||
|
@ -344,7 +344,7 @@ docker build \
|
||||
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
|
||||
--build-arg "KATEX=${KATEX:-}" \
|
||||
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
|
||||
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
|
||||
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx1100}" \
|
||||
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
|
||||
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
|
||||
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
|
||||
|
@ -1 +1 @@
|
||||
27664085f804afc83df26f740bb46c365854f2c4
|
||||
7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
|
||||
|
@ -46,9 +46,9 @@ case ${DOCKER_TAG_PREFIX} in
|
||||
BASE_TARGET=rocm
|
||||
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
|
||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||
# add gfx950 conditionally starting in ROCm 7.0
|
||||
# add gfx950, gfx115x conditionally starting in ROCm 7.0
|
||||
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
|
||||
fi
|
||||
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
|
||||
;;
|
||||
|
@ -115,6 +115,9 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
|
||||
# cmake-3.28.0 from pip for onnxruntime
|
||||
RUN python3 -mpip install cmake==3.28.0
|
||||
|
||||
ADD ./common/patch_libstdc.sh patch_libstdc.sh
|
||||
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
|
||||
|
||||
# build onnxruntime 1.21.0 from sources.
|
||||
# it is not possible to build it from sources using pip,
|
||||
# so just build it from upstream repository.
|
||||
|
@ -84,9 +84,9 @@ case ${image} in
|
||||
DEVTOOLSET_VERSION="11"
|
||||
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
|
||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||
# add gfx950 conditionally starting in ROCm 7.0
|
||||
# add gfx950, gfx115x conditionally starting in ROCm 7.0
|
||||
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
|
||||
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
|
||||
fi
|
||||
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
|
||||
;;
|
||||
|
@ -120,9 +120,8 @@ ninja==1.11.1.4
|
||||
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
|
||||
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
|
||||
#Description: Just-In-Time Compiler for Numerical Functions
|
||||
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
|
||||
#Pinned versions: 0.55.2, 0.60.0
|
||||
#test that import: test_numba_integration.py
|
||||
#For numba issue see https://github.com/pytorch/pytorch/issues/51511
|
||||
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
|
||||
|
||||
#numpy
|
||||
@ -242,10 +241,9 @@ pygments==2.15.0
|
||||
#Pinned versions: 14.1.0
|
||||
#test that import:
|
||||
|
||||
scikit-image==0.19.3 ; python_version < "3.10"
|
||||
scikit-image==0.22.0 ; python_version >= "3.10"
|
||||
scikit-image==0.22.0
|
||||
#Description: image processing routines
|
||||
#Pinned versions:
|
||||
#Pinned versions: 0.22.0
|
||||
#test that import: test_nn.py
|
||||
|
||||
#scikit-learn
|
||||
|
@ -5,7 +5,7 @@ DESIRED_ROCM ?= 7.0
|
||||
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
|
||||
PACKAGE_NAME = magma-rocm
|
||||
# inherit this from underlying docker image, do not pass this env var to docker
|
||||
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
|
||||
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201
|
||||
|
||||
DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
|
||||
-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
|
||||
@ -18,7 +18,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
|
||||
.PHONY: all
|
||||
all: magma-rocm70
|
||||
all: magma-rocm64
|
||||
all: magma-rocm63
|
||||
|
||||
.PHONY:
|
||||
clean:
|
||||
@ -34,8 +33,3 @@ magma-rocm70:
|
||||
magma-rocm64: DESIRED_ROCM := 6.4
|
||||
magma-rocm64:
|
||||
$(DOCKER_RUN)
|
||||
|
||||
.PHONY: magma-rocm63
|
||||
magma-rocm63: DESIRED_ROCM := 6.3
|
||||
magma-rocm63:
|
||||
$(DOCKER_RUN)
|
||||
|
@ -67,7 +67,7 @@ fi
|
||||
# wheels with cxx11-abi
|
||||
|
||||
echo "Checking that the gcc ABI is what we expect"
|
||||
if [[ "$(uname)" != 'Darwin' && "$(uname -m)" != "s390x" ]]; then
|
||||
if [[ "$(uname)" != 'Darwin' ]]; then
|
||||
# We also check that there are cxx11 symbols in libtorch
|
||||
#
|
||||
echo "Checking that symbols in libtorch.so have the right gcc abi"
|
||||
|
@ -256,7 +256,7 @@ test_torchbench_smoketest() {
|
||||
local device=mps
|
||||
local dtypes=(undefined float16 bfloat16 notset)
|
||||
local dtype=${dtypes[$1]}
|
||||
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
|
||||
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
|
||||
|
||||
for backend in eager inductor; do
|
||||
|
||||
@ -319,7 +319,7 @@ test_aoti_torchbench_smoketest() {
|
||||
local device=mps
|
||||
local dtypes=(undefined float16 bfloat16 notset)
|
||||
local dtype=${dtypes[$1]}
|
||||
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
|
||||
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
|
||||
|
||||
echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
|
||||
local dtype_arg="--${dtype}"
|
||||
|
@ -838,7 +838,7 @@ test_dynamo_benchmark() {
|
||||
elif [[ "${suite}" == "timm_models" ]]; then
|
||||
export TORCHBENCH_ONLY_MODELS="inception_v3"
|
||||
elif [[ "${suite}" == "torchbench" ]]; then
|
||||
export TORCHBENCH_ONLY_MODELS="hf_Bert"
|
||||
export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
|
||||
fi
|
||||
fi
|
||||
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
|
||||
@ -869,13 +869,13 @@ test_inductor_torchbench_smoketest_perf() {
|
||||
mkdir -p "$TEST_REPORTS_DIR"
|
||||
|
||||
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
|
||||
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
|
||||
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \
|
||||
--output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
|
||||
# The threshold value needs to be actively maintained to make this check useful
|
||||
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
|
||||
|
||||
# Check memory compression ratio for a few models
|
||||
for test in hf_Albert timm_vision_transformer; do
|
||||
for test in BERT_pytorch yolov3; do
|
||||
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
|
||||
--disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
|
||||
--only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
|
||||
@ -886,7 +886,7 @@ test_inductor_torchbench_smoketest_perf() {
|
||||
done
|
||||
|
||||
# Perform some "warm-start" runs for a few huggingface models.
|
||||
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
|
||||
for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
|
||||
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
|
||||
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
|
||||
python benchmarks/dynamo/check_accuracy.py \
|
||||
|
@ -38,7 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
|
||||
fi
|
||||
|
||||
# TODO: Move this to .ci/docker/requirements-ci.txt
|
||||
python -m pip install "psutil==5.9.1" "pynvml==11.4.1" "pytest-shard==0.1.2"
|
||||
python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2"
|
||||
|
||||
run_tests() {
|
||||
# Run nvidia-smi if available
|
||||
|
@ -71,14 +71,7 @@ export PYTORCH_BUILD_NUMBER=1
|
||||
|
||||
# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
|
||||
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
|
||||
|
||||
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
|
||||
# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
|
||||
if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux'"
|
||||
fi
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux'"
|
||||
|
||||
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
|
||||
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
|
||||
|
35
.github/actions/setup-linux/action.yml
vendored
35
.github/actions/setup-linux/action.yml
vendored
@ -28,6 +28,10 @@ runs:
|
||||
echo "instance-type: $(get_ec2_metadata instance-type)"
|
||||
echo "system info $(uname -a)"
|
||||
|
||||
- name: Print GPU info (if present)
|
||||
shell: bash
|
||||
run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi
|
||||
|
||||
- name: Check if in a container runner
|
||||
shell: bash
|
||||
id: check_container_runner
|
||||
@ -82,37 +86,6 @@ runs:
|
||||
# Prune all of the docker images
|
||||
docker system prune -af
|
||||
|
||||
- name: Manually resolve download.pytorch.org
|
||||
shell: bash
|
||||
continue-on-error: true
|
||||
run: |
|
||||
set +e
|
||||
set -x
|
||||
|
||||
PT_DOMAIN=download.pytorch.org
|
||||
# TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400,
|
||||
# cleaning this up once the issue is fixed. There are more than one resolved IP here, the last
|
||||
# one is returned at random
|
||||
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)
|
||||
|
||||
if [ -z "${RESOLVED_IP}" ]; then
|
||||
echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
|
||||
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)
|
||||
|
||||
if [ -z "${RESOLVED_IP}" ]; then
|
||||
echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if grep -r "${PT_DOMAIN}" /etc/hosts; then
|
||||
# Clean up any old records first
|
||||
sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts
|
||||
fi
|
||||
|
||||
echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
|
||||
cat /etc/hosts
|
||||
|
||||
- name: Check that the docker daemon is running
|
||||
shell: bash
|
||||
continue-on-error: true
|
||||
|
BIN
.github/scripts/drci_mocks.json.gz
vendored
BIN
.github/scripts/drci_mocks.json.gz
vendored
Binary file not shown.
1
.github/scripts/github_utils.py
vendored
1
.github/scripts/github_utils.py
vendored
@ -18,6 +18,7 @@ class GitHubComment:
|
||||
body_text: str
|
||||
created_at: str
|
||||
author_login: str
|
||||
author_url: Optional[str]
|
||||
author_association: str
|
||||
editor_login: Optional[str]
|
||||
database_id: int
|
||||
|
BIN
.github/scripts/gql_mocks.json.gz
vendored
BIN
.github/scripts/gql_mocks.json.gz
vendored
Binary file not shown.
2
.github/scripts/test_check_labels.py
vendored
2
.github/scripts/test_check_labels.py
vendored
@ -38,6 +38,7 @@ def mock_get_comments() -> list[GitHubComment]:
|
||||
body_text="mock_body_text",
|
||||
created_at="",
|
||||
author_login="",
|
||||
author_url=None,
|
||||
author_association="",
|
||||
editor_login=None,
|
||||
database_id=1,
|
||||
@ -48,6 +49,7 @@ def mock_get_comments() -> list[GitHubComment]:
|
||||
body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
|
||||
created_at="",
|
||||
author_login=BOT_AUTHORS[1],
|
||||
author_url=None,
|
||||
author_association="",
|
||||
editor_login=None,
|
||||
database_id=2,
|
||||
|
18
.github/scripts/test_trymerge.py
vendored
18
.github/scripts/test_trymerge.py
vendored
@ -32,6 +32,7 @@ from trymerge import (
|
||||
main as trymerge_main,
|
||||
MandatoryChecksMissingError,
|
||||
MergeRule,
|
||||
PostCommentError,
|
||||
RE_GHSTACK_DESC,
|
||||
read_merge_rules,
|
||||
remove_job_name_suffix,
|
||||
@ -588,6 +589,23 @@ class TestTryMerge(TestCase):
|
||||
self.assertEqual(mock_merge_base, pr.get_merge_base())
|
||||
mocked_gh_fetch_merge_base.assert_called_once()
|
||||
|
||||
def test_app_can_revert(self, *args: Any) -> None:
|
||||
pr = GitHubPR("pytorch", "pytorch", 164660)
|
||||
repo = DummyGitRepo()
|
||||
app_comment_id, impostor_comment_id = 3375785595, 3377647892
|
||||
# Check that app can revert
|
||||
self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id))
|
||||
# But impostor can not
|
||||
self.assertRaises(
|
||||
PostCommentError,
|
||||
lambda: validate_revert(repo, pr, comment_id=impostor_comment_id),
|
||||
)
|
||||
# Despite it's name being the name of the bot
|
||||
self.assertEqual(
|
||||
pr.get_comment_by_id(impostor_comment_id).author_login,
|
||||
"pytorch-auto-revert",
|
||||
)
|
||||
|
||||
|
||||
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
|
||||
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
|
||||
|
7
.github/scripts/trymerge.py
vendored
7
.github/scripts/trymerge.py
vendored
@ -234,6 +234,7 @@ query ($owner: String!, $name: String!, $number: Int!) {
|
||||
createdAt
|
||||
author {
|
||||
login
|
||||
url
|
||||
}
|
||||
authorAssociation
|
||||
editor {
|
||||
@ -1093,6 +1094,7 @@ class GitHubPR:
|
||||
body_text=node["bodyText"],
|
||||
created_at=node["createdAt"] if "createdAt" in node else "",
|
||||
author_login=node["author"]["login"],
|
||||
author_url=node["author"].get("url", None),
|
||||
author_association=node["authorAssociation"],
|
||||
editor_login=editor["login"] if editor else None,
|
||||
database_id=node["databaseId"],
|
||||
@ -2029,6 +2031,11 @@ def validate_revert(
|
||||
# For some reason, one can not be a member of private repo, only CONTRIBUTOR
|
||||
if pr.is_base_repo_private():
|
||||
allowed_reverters.append("CONTRIBUTOR")
|
||||
# Special case the pytorch-auto-revert app, whose does not have association
|
||||
# But should be able to issue revert command
|
||||
if comment.author_url == "https://github.com/apps/pytorch-auto-revert":
|
||||
allowed_reverters.append("NONE")
|
||||
|
||||
if author_association not in allowed_reverters:
|
||||
raise PostCommentError(
|
||||
f"Will not revert as @{author_login} is not one of "
|
||||
|
@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: 15 0,12 * * 1-6
|
||||
- cron: 15 0 * * 1-6
|
||||
- cron: 0 7 * * 0
|
||||
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
|
||||
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
|
||||
|
@ -63,6 +63,7 @@ jobs:
|
||||
# Same as the build job
|
||||
python-version: 3.12.7
|
||||
test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
|
||||
timeout-minutes: 300
|
||||
disable-monitor: false
|
||||
monitor-log-interval: 15
|
||||
monitor-data-collect-interval: 4
|
||||
|
26
.github/workflows/rocm.yml
vendored
26
.github/workflows/rocm.yml
vendored
@ -59,3 +59,29 @@ jobs:
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-gfx1100-test:
|
||||
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3_10-gfx1100
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
]}
|
||||
tests-to-include: >
|
||||
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
|
||||
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
|
||||
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
|
||||
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
|
||||
inductor/test_flex_attention inductor/test_max_autotune
|
||||
secrets: inherit
|
||||
|
2
.gitignore
vendored
2
.gitignore
vendored
@ -88,7 +88,7 @@ torch_compile_debug/
|
||||
# Listed manually because some files in this directory are not generated
|
||||
torch/testing/_internal/generated/annotated_fn_args.py
|
||||
torch/testing/_internal/data/*.pt
|
||||
torch/csrc/api/include/torch/version.h
|
||||
torch/headeronly/version.h
|
||||
torch/csrc/cudnn/cuDNN.cpp
|
||||
torch/csrc/generated
|
||||
torch/csrc/generic/TensorMethods.cpp
|
||||
|
@ -28,7 +28,7 @@ exclude_patterns = [
|
||||
'torch/lib/**',
|
||||
'venv/**',
|
||||
'**/*.pyi',
|
||||
"tools/experimental/dynamic_shapes/torchfuzz/**",
|
||||
"tools/experimental/torchfuzz/**",
|
||||
'tools/test/test_selective_build.py',
|
||||
]
|
||||
command = [
|
||||
@ -198,7 +198,7 @@ exclude_patterns = [
|
||||
'tools/test/gen_operators_yaml_test.py',
|
||||
'tools/test/gen_oplist_test.py',
|
||||
'tools/test/test_selective_build.py',
|
||||
'tools/experimental/dynamic_shapes/torchfuzz/**',
|
||||
'tools/experimental/torchfuzz/**',
|
||||
]
|
||||
command = [
|
||||
'python3',
|
||||
|
@ -13,6 +13,9 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt
|
||||
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
|
||||
load("//:tools/bazel.bzl", "rules")
|
||||
|
||||
# Export files for use by torch/headeronly (where version.h generation now lives)
|
||||
exports_files(["version.txt"])
|
||||
|
||||
define_targets(rules = rules)
|
||||
|
||||
COMMON_COPTS = [
|
||||
@ -690,7 +693,9 @@ cc_library(
|
||||
"torch/csrc/*/generated/*.h",
|
||||
"torch/csrc/jit/serialization/mobile_bytecode_generated.h",
|
||||
] + torch_cuda_headers,
|
||||
) + GENERATED_AUTOGRAD_CPP + [":version_h"],
|
||||
) + GENERATED_AUTOGRAD_CPP + [
|
||||
"//torch/headeronly:version_h",
|
||||
],
|
||||
includes = [
|
||||
"third_party/kineto/libkineto/include",
|
||||
"torch/csrc",
|
||||
|
@ -53,7 +53,7 @@ ARG CUDA_PATH=cu121
|
||||
ARG INSTALL_CHANNEL=whl/nightly
|
||||
# Automatically set by buildx
|
||||
# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
|
||||
RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION} conda=25.7.0
|
||||
RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0
|
||||
|
||||
ARG TARGETPLATFORM
|
||||
|
||||
|
@ -144,8 +144,7 @@ inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
|
||||
inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
|
||||
checkDeviceType("CPU_tensor_apply", tensors, kCPU);
|
||||
checkLayout("CPU_tensor_apply", tensors, kStrided);
|
||||
if (!_all_equal_numel(tensors))
|
||||
TORCH_CHECK(false, _all_equal_numel_error(tensors));
|
||||
TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors));
|
||||
// An empty tensor has no elements
|
||||
for (auto& t : tensors)
|
||||
if (t.numel() == 0)
|
||||
|
@ -483,8 +483,8 @@ at::BlasBackend Context::blasPreferredBackend() {
|
||||
#if ROCM_VERSION >= 60300
|
||||
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
|
||||
#endif
|
||||
#if ROCM_VERSION >= 60500
|
||||
"gfx950"
|
||||
#if ROCM_VERSION >= 70000
|
||||
"gfx950", "gfx1150", "gfx1151"
|
||||
#endif
|
||||
};
|
||||
for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
|
||||
@ -587,20 +587,33 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
|
||||
rocm_fa_preferred_backend = b;
|
||||
}
|
||||
|
||||
bool Context::allowFP16ReductionCuBLAS() const {
|
||||
CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const {
|
||||
return allow_fp16_reduction_cublas;
|
||||
}
|
||||
|
||||
void Context::setAllowFP16ReductionCuBLAS(bool b) {
|
||||
allow_fp16_reduction_cublas = b;
|
||||
CuBLASReductionOption inline get_reduction_option(bool allow_reduced_precision, bool allow_splitk) {
|
||||
TORCH_CHECK(
|
||||
!(allow_reduced_precision && !allow_splitk),
|
||||
"allow_splitk=False is not supported when reduced precision reductions are enabled");
|
||||
if (allow_reduced_precision) {
|
||||
return CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
|
||||
} else if (allow_splitk) {
|
||||
return CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK;
|
||||
} else {
|
||||
return CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK;
|
||||
}
|
||||
}
|
||||
|
||||
bool Context::allowBF16ReductionCuBLAS() const {
|
||||
void Context::setAllowFP16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
|
||||
allow_fp16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
|
||||
}
|
||||
|
||||
CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const {
|
||||
return allow_bf16_reduction_cublas;
|
||||
}
|
||||
|
||||
void Context::setAllowBF16ReductionCuBLAS(bool b) {
|
||||
allow_bf16_reduction_cublas = b;
|
||||
void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) {
|
||||
allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk);
|
||||
}
|
||||
|
||||
bool Context::allowFP16AccumulationCuBLAS() const {
|
||||
|
@ -38,6 +38,12 @@ namespace at {
|
||||
class Tensor;
|
||||
|
||||
enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };
|
||||
|
||||
enum class CuBLASReductionOption : uint8_t {
|
||||
AllowReducedPrecisionWithSplitK = 0,
|
||||
DisallowReducedPrecisionAllowSplitK = 1,
|
||||
DisallowReducedPrecisionDisallowSplitK = 2,
|
||||
};
|
||||
enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN };
|
||||
enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL };
|
||||
enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 };
|
||||
@ -357,10 +363,14 @@ class TORCH_API Context {
|
||||
void setAllowTF32CuBLAS(bool);
|
||||
Float32MatmulPrecision float32MatmulPrecision() const;
|
||||
Float32Precision float32Precision(Float32Backend backend, Float32Op op) const;
|
||||
bool allowFP16ReductionCuBLAS() const;
|
||||
void setAllowFP16ReductionCuBLAS(bool);
|
||||
bool allowBF16ReductionCuBLAS() const;
|
||||
void setAllowBF16ReductionCuBLAS(bool);
|
||||
CuBLASReductionOption allowFP16ReductionCuBLAS() const;
|
||||
void setAllowFP16ReductionCuBLAS(
|
||||
bool allow_reduced_precision,
|
||||
bool allow_splitk = true);
|
||||
CuBLASReductionOption allowBF16ReductionCuBLAS() const;
|
||||
void setAllowBF16ReductionCuBLAS(
|
||||
bool allow_reduced_precision,
|
||||
bool allow_splitk = true);
|
||||
bool allowFP16AccumulationCuBLAS() const;
|
||||
void setAllowFP16AccumulationCuBLAS(bool);
|
||||
|
||||
@ -452,8 +462,10 @@ class TORCH_API Context {
|
||||
: at::Float32MatmulPrecision::HIGHEST;
|
||||
int benchmark_limit_cudnn = 10;
|
||||
bool allow_tf32_cudnn = true;
|
||||
bool allow_fp16_reduction_cublas = true;
|
||||
bool allow_bf16_reduction_cublas = true;
|
||||
CuBLASReductionOption allow_fp16_reduction_cublas =
|
||||
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
|
||||
CuBLASReductionOption allow_bf16_reduction_cublas =
|
||||
CuBLASReductionOption::AllowReducedPrecisionWithSplitK;
|
||||
bool allow_fp16_accumulation_cublas = false;
|
||||
std::optional<int32_t> sm_carveout = std::nullopt;
|
||||
bool enabled_mkldnn = true;
|
||||
|
@ -229,14 +229,14 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
|
||||
}
|
||||
|
||||
void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef<int64_t> size) {
|
||||
return _resize_(sparse_dim, dense_dim, size);
|
||||
_resize_(sparse_dim, dense_dim, size);
|
||||
}
|
||||
|
||||
void resize_(
|
||||
int64_t sparse_dim,
|
||||
int64_t dense_dim,
|
||||
ArrayRef<c10::SymInt> size) {
|
||||
return _resize_(sparse_dim, dense_dim, size);
|
||||
_resize_(sparse_dim, dense_dim, size);
|
||||
}
|
||||
|
||||
// NOTE: this function will resize the sparse tensor and also set `indices`
|
||||
|
@ -59,7 +59,7 @@ static inline void set_item(const Tensor& self, ArrayRef<TensorIndex> indices, c
|
||||
}
|
||||
}
|
||||
|
||||
return set_item(self, indices, value);
|
||||
set_item(self, indices, value);
|
||||
}
|
||||
|
||||
} // namespace indexing
|
||||
|
@ -765,7 +765,8 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) {
|
||||
if (numel == 0) {
|
||||
return;
|
||||
} else if (numel < grain_size || at::get_num_threads() == 1) {
|
||||
return serial_for_each(loop, {0, numel});
|
||||
serial_for_each(loop, {0, numel});
|
||||
return;
|
||||
} else {
|
||||
at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
|
||||
serial_for_each(loop, {begin, end});
|
||||
|
@ -49,7 +49,7 @@ static void check_unique_names(DimnameList names) {
|
||||
}
|
||||
|
||||
void check_names_valid_for(const TensorBase& tensor, DimnameList names) {
|
||||
return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
|
||||
impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names);
|
||||
}
|
||||
|
||||
void check_names_valid_for(size_t tensor_dim, DimnameList names) {
|
||||
|
@ -138,7 +138,7 @@ void Tensor::_backward(TensorList inputs,
|
||||
const std::optional<Tensor>& gradient,
|
||||
std::optional<bool> keep_graph,
|
||||
bool create_graph) const {
|
||||
return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
|
||||
impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph);
|
||||
}
|
||||
|
||||
const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const {
|
||||
@ -173,4 +173,12 @@ unsigned TensorBase::_register_hook(std::function<TensorBase(const TensorBase&)>
|
||||
return impl::GetVariableHooks()->_register_hook(*this, std::move(hook));
|
||||
}
|
||||
|
||||
std::optional<ScalarType> TensorBase::grad_dtype() const {
|
||||
return impl::GetVariableHooks()->grad_dtype(*this);
|
||||
}
|
||||
|
||||
void TensorBase::set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const {
|
||||
return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype);
|
||||
}
|
||||
|
||||
} // namespace at
|
||||
|
@ -930,6 +930,10 @@ public:
|
||||
|
||||
const TensorBase& requires_grad_(bool _requires_grad=true) const;
|
||||
|
||||
std::optional<ScalarType> grad_dtype() const;
|
||||
|
||||
void set_grad_dtype(const std::optional<ScalarType>& grad_dtype) const;
|
||||
|
||||
// View Variables
|
||||
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
@ -68,6 +68,8 @@ struct TORCH_API VariableHooksInterface {
|
||||
const c10::OperatorHandle& op,
|
||||
c10::DispatchKeySet dispatch_keys,
|
||||
torch::jit::Stack* stack) const = 0;
|
||||
virtual std::optional<c10::ScalarType> grad_dtype(const TensorBase&) const = 0;
|
||||
virtual void set_grad_dtype(const TensorBase&, const std::optional<c10::ScalarType>&) const = 0;
|
||||
};
|
||||
|
||||
TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);
|
||||
|
@ -496,7 +496,7 @@ class TORCH_API OperatorHandle {
|
||||
}
|
||||
|
||||
// Runs the invariant checks of the operator entry this handle refers to.
void checkInvariants() const {
  // Void function: call once, no `return` of a void expression.
  operatorDef_->op.checkInvariants();
}
|
||||
|
||||
c10::ArrayRef<at::Tag> getTags() const {
|
||||
@ -932,7 +932,7 @@ inline void Dispatcher::redispatchBoxed(
|
||||
}
|
||||
#endif
|
||||
const auto& kernel = entry.lookup(dispatchKeySet);
|
||||
return kernel.callBoxed(op, dispatchKeySet, stack);
|
||||
kernel.callBoxed(op, dispatchKeySet, stack);
|
||||
}
|
||||
|
||||
} // namespace c10
|
||||
|
@ -422,18 +422,34 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
|
||||
abType = CUDA_R_16F;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
|
||||
#ifndef USE_ROCM
|
||||
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
|
||||
if (fp16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
fp16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
}
|
||||
#endif
|
||||
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
|
||||
abType = CUDA_R_16BF;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
|
||||
#ifndef USE_ROCM
|
||||
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
|
||||
if (bf16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
bf16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
@ -1120,8 +1136,15 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
|
||||
}
|
||||
if (prop->major >= 5) {
|
||||
cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
|
||||
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
|
||||
cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
|
||||
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
|
||||
TORCH_CHECK(fp16_reduction !=
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
|
||||
"torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
|
||||
"..., allow_splitk=False) requires the cuBLASLt backend");
|
||||
if (fp16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
cublas_flags = static_cast<cublasMath_t>(
|
||||
cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
|
||||
}
|
||||
// Disallow fp16 reductions that could lead to unexpected overflow issues.
|
||||
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));
|
||||
@ -1180,8 +1203,15 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT
|
||||
GEMM_CHECK_ARGVALUES(at::BFloat16);
|
||||
#ifndef USE_ROCM
|
||||
cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
|
||||
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
|
||||
cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
|
||||
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
|
||||
TORCH_CHECK(bf16_reduction !=
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK,
|
||||
"torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction("
|
||||
"..., allow_splitk=False) requires the cuBLASLt backend");
|
||||
if (bf16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
cublas_flags = static_cast<cublasMath_t>(
|
||||
cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
|
||||
}
|
||||
#endif
|
||||
#if defined(USE_ROCM)
|
||||
@ -1270,7 +1300,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
|
||||
}
|
||||
#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM)
|
||||
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
|
||||
if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100
|
||||
if (at::detail::getCUDAHooks().isGPUArch({"gfx11", "gfx12"})) { //no CK GEMM version
|
||||
gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
|
||||
} else{
|
||||
at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));
|
||||
@ -1577,18 +1607,34 @@ bool gemm_and_bias(
|
||||
abType = CUDA_R_16F;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
|
||||
#ifndef USE_ROCM
|
||||
if (!at::globalContext().allowFP16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
|
||||
if (fp16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
fp16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
}
|
||||
#endif
|
||||
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
|
||||
abType = CUDA_R_16BF;
|
||||
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
|
||||
#ifndef USE_ROCM
|
||||
if (!at::globalContext().allowBF16ReductionCuBLAS()) {
|
||||
preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK,
|
||||
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE);
|
||||
auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
|
||||
if (bf16_reduction !=
|
||||
at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
|
||||
uint32_t mask =
|
||||
bf16_reduction ==
|
||||
at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
|
||||
? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
|
||||
CUBLASLT_REDUCTION_SCHEME_NONE)
|
||||
: CUBLASLT_REDUCTION_SCHEME_NONE;
|
||||
preference.setAttribute(
|
||||
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -326,6 +326,23 @@ bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const {
|
||||
#endif
|
||||
}
|
||||
|
||||
// Reports whether BFloat16 cuDNN RNNs may be used on the current device.
// Returns true only when all of the following hold:
//   - the build has cuDNN enabled with CUDNN_VERSION >= 91300,
//   - hasCUDA() is true, and
//   - the current device has compute capability major >= 8.
// Returns false otherwise.
bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const {
#if AT_CUDNN_ENABLED() && (CUDNN_VERSION >= 91300)
  if (!hasCUDA()) {
    return false;
  }
  const cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
  // Compute capability 8.0+ (Ampere or newer); Volta is 7.0 and does not pass.
  return prop->major >= 8;
#else
  return false;
#endif
}
|
||||
|
||||
long CUDAHooks::versionCuDNN() const {
|
||||
#if AT_CUDNN_ENABLED()
|
||||
return CUDNN_VERSION;
|
||||
|
@ -45,6 +45,7 @@ struct CUDAHooks : public at::CUDAHooksInterface {
|
||||
bool supportsDilatedConvolutionWithCuDNN() const override;
|
||||
bool supportsDepthwiseConvolutionWithCuDNN() const override;
|
||||
bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
|
||||
bool supportsBFloat16RNNWithCuDNN() const override;
|
||||
bool hasCUDART() const override;
|
||||
long versionCUDART() const override;
|
||||
long versionCuDNN() const override;
|
||||
|
@ -166,6 +166,10 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Base-class default: reports no BFloat16 cuDNN RNN support. Implementations
// of this interface may override it.
virtual bool supportsBFloat16RNNWithCuDNN() const { return false; }
|
||||
|
||||
virtual long versionCuDNN() const {
|
||||
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
|
||||
}
|
||||
|
@ -465,11 +465,11 @@ static void dynamicLayerBack(const c10::OperatorHandle& op, torch::jit::Stack* s
|
||||
|
||||
// used for functions that have aliasing operations but should be treated like they're out of place (i.e. lift_fresh)
|
||||
static void dynamicLayerBackGradSpecialCase(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
|
||||
return dynamicLayerBack(op, stack, true);
|
||||
dynamicLayerBack(op, stack, true);
|
||||
}
|
||||
|
||||
// Boxed entry point that forwards to dynamicLayerBack with the third argument
// set to false.
static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
  // Void function: a single plain call, no `return` of a void expression.
  dynamicLayerBack(op, stack, false);
}
|
||||
|
||||
TORCH_LIBRARY_IMPL(_, FuncTorchDynamicLayerFrontMode, m) {
|
||||
|
@ -12,7 +12,7 @@
|
||||
|
||||
#define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled"
|
||||
#define MPS_ERROR_RUNTIME_TOO_LOW \
|
||||
"The MPS backend is supported on MacOS 13.0+.", \
|
||||
"The MPS backend is supported on MacOS 14.0+. ", \
|
||||
"Current OS version can be queried using `sw_vers`"
|
||||
#define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \
|
||||
"as the MPS framework doesn't support float64. Please use float32 instead."
|
||||
|
@ -375,7 +375,7 @@ static void bf16_gemv_trans(
|
||||
const at::BFloat16 beta,
|
||||
at::BFloat16* y,
|
||||
const int incy) {
|
||||
return bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
|
||||
bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
|
||||
}
|
||||
|
||||
template <>
|
||||
|
@ -70,7 +70,7 @@ inline void searchsorted_maybe_trim_input_tensors(
|
||||
const Tensor& raw_boundaries) {
|
||||
Tensor trimmed_sorter;
|
||||
Tensor raw_sorter;
|
||||
return searchsorted_maybe_trim_input_tensors(
|
||||
searchsorted_maybe_trim_input_tensors(
|
||||
trimmed_input,
|
||||
trimmed_boundaries,
|
||||
trimmed_sorter,
|
||||
|
@ -93,6 +93,12 @@ inline bool cond_cudnn_grid_sampler(
|
||||
const TensorBase& input,
|
||||
const TensorBase& grid
|
||||
) {
|
||||
auto st = input.scalar_type();
|
||||
if (!(st == kDouble || st == kFloat || st == kHalf))
|
||||
return false;
|
||||
st = grid.scalar_type();
|
||||
if (!(st == kDouble || st == kFloat || st == kHalf))
|
||||
return false;
|
||||
return (
|
||||
at::native::cudnn_is_acceptable(input) &&
|
||||
at::native::cudnn_is_acceptable(grid) &&
|
||||
|
@ -108,6 +108,13 @@ bool use_mkldnn(const Tensor& input, TensorList params, TensorList hx) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool use_cudnn(const Tensor& t) {
|
||||
bool acceptable = at::cudnn_is_acceptable(t);
|
||||
auto st = t.scalar_type();
|
||||
bool bfloat16_cond = st == kBFloat16 && at::detail::getCUDAHooks().supportsBFloat16RNNWithCuDNN();
|
||||
return acceptable && (bfloat16_cond || st == kDouble || st == kFloat || st == kHalf);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
using pair_of = std::pair<T, T>;
|
||||
|
||||
@ -1200,7 +1207,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
|
||||
bool train, \
|
||||
bool bidirectional, \
|
||||
bool batch_first) { \
|
||||
if (at::cudnn_is_acceptable(_input)) { \
|
||||
if (use_cudnn(_input)) { \
|
||||
Tensor output, hy; \
|
||||
NAME##_cudnn_stub( \
|
||||
_input.device().type(), \
|
||||
@ -1262,7 +1269,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backwar
|
||||
double dropout_p, \
|
||||
bool train, \
|
||||
bool bidirectional) { \
|
||||
if (at::cudnn_is_acceptable(data)) { \
|
||||
if (use_cudnn(data)) { \
|
||||
Tensor output, hy; \
|
||||
NAME##_packed_cudnn_stub( \
|
||||
data.device().type(), \
|
||||
@ -1430,7 +1437,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
|
||||
TensorList _params, bool has_biases,
|
||||
int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
|
||||
TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
|
||||
if (at::cudnn_is_acceptable(_input)) {
|
||||
if (use_cudnn(_input)) {
|
||||
Tensor output, hy, cy;
|
||||
lstm_cudnn_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases,
|
||||
num_layers, dropout_p, train, bidirectional, batch_first);
|
||||
@ -1491,7 +1498,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
|
||||
TensorList _params, bool has_biases,
|
||||
int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
|
||||
TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states");
|
||||
if (at::cudnn_is_acceptable(data)) {
|
||||
if (use_cudnn(data)) {
|
||||
Tensor output, hy, cy;
|
||||
lstm_packed_cudnn_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx,
|
||||
_params, has_biases, num_layers, dropout_p, train, bidirectional);
|
||||
|
@ -23,14 +23,6 @@
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/_cast_Byte_native.h>
|
||||
#include <ATen/ops/_cast_Char_native.h>
|
||||
#include <ATen/ops/_cast_Double_native.h>
|
||||
#include <ATen/ops/_cast_Float_native.h>
|
||||
#include <ATen/ops/_cast_Half_native.h>
|
||||
#include <ATen/ops/_cast_Int_native.h>
|
||||
#include <ATen/ops/_cast_Long_native.h>
|
||||
#include <ATen/ops/_cast_Short_native.h>
|
||||
#include <ATen/ops/_dim_arange_native.h>
|
||||
#include <ATen/ops/_efficientzerotensor_native.h>
|
||||
#include <ATen/ops/_empty_affine_quantized.h>
|
||||
|
@ -91,9 +91,6 @@ bool cudnn_is_acceptable(const TensorBase& self) {
|
||||
return false;
|
||||
if (!self.is_cuda())
|
||||
return false;
|
||||
auto st = self.scalar_type();
|
||||
if (!(st == kDouble || st == kFloat || st == kHalf))
|
||||
return false;
|
||||
if (!detail::getCUDAHooks().compiledWithCuDNN())
|
||||
return false;
|
||||
// cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty
|
||||
|
@ -25,11 +25,11 @@
|
||||
namespace at::native {
|
||||
|
||||
// Native-function wrapper: forwards all arguments unchanged to
// Tensor::_backward on `self`. Returns nothing.
void _backward(const Tensor& self, TensorList inputs, const std::optional<Tensor>& gradient_opt, std::optional<bool> keep_graph, bool create_graph) {
  // Void function: a single plain call, no `return` of a void expression.
  self._backward(inputs, gradient_opt, keep_graph, create_graph);
}
|
||||
|
||||
// Native-function wrapper: forwards to Tensor::set_data on `self` with
// `new_data`. Returns nothing.
void set_data(Tensor& self, const Tensor& new_data) {
  // Void function: a single plain call, no `return` of a void expression.
  self.set_data(new_data);
}
|
||||
|
||||
Tensor data(const Tensor& self) {
|
||||
@ -54,7 +54,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) {
|
||||
}
|
||||
|
||||
// Native-function wrapper: forwards to Tensor::retain_grad on `self`.
// Returns nothing.
void retain_grad(Tensor& self) {
  // Void function: a single plain call, no `return` of a void expression.
  self.retain_grad();
}
|
||||
|
||||
bool retains_grad(const Tensor& self) {
|
||||
|
@ -300,7 +300,8 @@ void div_floor_kernel(TensorIteratorBase& iter) {
|
||||
// In the special case of unsigned integer division, floor division is
|
||||
// equivalent to truncation division (since the signs of the divisor and
|
||||
// dividend are always the same)
|
||||
return div_trunc_kernel(iter);
|
||||
div_trunc_kernel(iter);
|
||||
return;
|
||||
} else if (isIntegralType(dtype, /*includeBool*/ false)) {
|
||||
// There's no SIMD integer division, so don't try to vectorize it.
|
||||
AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cpu", [&]() {
|
||||
|
@ -749,21 +749,29 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
|
||||
// });
|
||||
|
||||
if (iter_dtype == kByte) {
|
||||
return cpu_hflip_vec<uint8_t>(iter);
|
||||
cpu_hflip_vec<uint8_t>(iter);
|
||||
return;
|
||||
} else if (iter_dtype == kChar) {
|
||||
return cpu_hflip_vec<int8_t>(iter);
|
||||
cpu_hflip_vec<int8_t>(iter);
|
||||
return;
|
||||
} else if (iter_dtype == kInt) {
|
||||
return cpu_hflip_vec<int32_t>(iter);
|
||||
cpu_hflip_vec<int32_t>(iter);
|
||||
return;
|
||||
} else if (iter_dtype == kLong) {
|
||||
return cpu_hflip_vec<int64_t>(iter);
|
||||
cpu_hflip_vec<int64_t>(iter);
|
||||
return;
|
||||
} else if (iter_dtype == kShort) {
|
||||
return cpu_hflip_vec<int16_t>(iter);
|
||||
cpu_hflip_vec<int16_t>(iter);
|
||||
return;
|
||||
} else if (iter_dtype == kBool) {
|
||||
return cpu_hflip_vec<bool>(iter);
|
||||
cpu_hflip_vec<bool>(iter);
|
||||
return;
|
||||
} else if (iter_dtype == kFloat) {
|
||||
return cpu_hflip_vec<float>(iter);
|
||||
cpu_hflip_vec<float>(iter);
|
||||
return;
|
||||
} else if (iter_dtype == kDouble) {
|
||||
return cpu_hflip_vec<double>(iter);
|
||||
cpu_hflip_vec<double>(iter);
|
||||
return;
|
||||
}
|
||||
}
|
||||
// other dtypes (float16, bfloat16, complex) are handled by cpu_kernel_vec (see below)
|
||||
@ -778,10 +786,12 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
|
||||
c == input_strides_2[1] &&
|
||||
c == iter.element_size(0) * iter.shape()[0] // checks if dim=1 is contiguous as well
|
||||
) {
|
||||
return cpu_hflip_channels_last_vec(iter);
|
||||
cpu_hflip_channels_last_vec(iter);
|
||||
return;
|
||||
}
|
||||
// Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec)
|
||||
return cpu_vflip_memcpy(iter);
|
||||
cpu_vflip_memcpy(iter);
|
||||
return;
|
||||
}
|
||||
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu",
|
||||
|
@ -96,11 +96,14 @@ static void pow_tensor_scalar_kernel(
|
||||
dtype == kBFloat16 || isComplexType(dtype)) {
|
||||
// Dispatch to fast specialization for sqrt, rsqrt and reciprocal
|
||||
if (exp_scalar.equal(.5)) {
|
||||
return sqrt_kernel(iter);
|
||||
sqrt_kernel(iter);
|
||||
return;
|
||||
} else if (exp_scalar.equal(-0.5)) {
|
||||
return rsqrt_kernel(iter);
|
||||
rsqrt_kernel(iter);
|
||||
return;
|
||||
} else if (exp_scalar.equal(-1.0)) {
|
||||
return reciprocal_kernel(iter);
|
||||
reciprocal_kernel(iter);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -256,10 +256,10 @@ static void norm_kernel_tensor_iterator_impl(
|
||||
} else {
|
||||
if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) {
|
||||
// type promotion that does cast and reduction in a single kernel
|
||||
return norm_kernel_cpu_impl<at::Half, float>(iter, val);
|
||||
norm_kernel_cpu_impl<at::Half, float>(iter, val); return;
|
||||
} else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) {
|
||||
// type promotion that does cast and reduction in a single kernel
|
||||
return norm_kernel_cpu_impl<at::BFloat16, float>(iter, val);
|
||||
norm_kernel_cpu_impl<at::BFloat16, float>(iter, val); return;
|
||||
}
|
||||
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kComplexHalf, iter.input_dtype(), "norm_cpu", [&] {
|
||||
|
@ -428,10 +428,11 @@ void fp16_gemv_trans(
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0);
|
||||
#if !defined(__aarch64__) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
|
||||
if (at::globalContext().allowFP16ReductionCPU()) {
|
||||
return fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
|
||||
fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
return fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
|
||||
fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy);
|
||||
}
|
||||
|
||||
float bf16_dot_with_fp32_arith(const at::BFloat16* vec1, const at::BFloat16* vec2, int64_t len) {
|
||||
@ -465,7 +466,7 @@ void bf16_gemv_trans(
|
||||
at::BFloat16* y,
|
||||
const int incy) {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0 && beta == 0.0);
|
||||
return bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
|
||||
bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy);
|
||||
}
|
||||
|
||||
float fp16_dot(
|
||||
|
@ -285,8 +285,8 @@ static bool isSupportedHipLtROCmArch(int index) {
|
||||
#if ROCM_VERSION >= 60300
|
||||
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
|
||||
#endif
|
||||
#if ROCM_VERSION >= 60500
|
||||
"gfx950"
|
||||
#if ROCM_VERSION >= 70000
|
||||
"gfx950", "gfx1150", "gfx1151"
|
||||
#endif
|
||||
};
|
||||
return at::detail::getCUDAHooks().isGPUArch(archs, index);
|
||||
|
@ -59,7 +59,7 @@ constexpr uint64_t getDefaultMaxThreadsPerBlock() {
|
||||
#ifdef USE_ROCM
|
||||
#define SKIP_SORTED_INDICES 32
|
||||
template <typename scalar_t, int SZ>
|
||||
__global__ void indexing_backward_kernel(
|
||||
__global__ void indexing_backward_kernel_many_indices(
|
||||
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
|
||||
int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim, bool accumulate) {
|
||||
using opmath_t = at::opmath_type<scalar_t>;
|
||||
@ -254,7 +254,8 @@ __global__ void indexing_backward_kernel_stride_1(
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
#endif
|
||||
|
||||
template <typename scalar_t, int SZ>
|
||||
__global__ void indexing_backward_kernel(
|
||||
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
|
||||
@ -333,6 +334,7 @@ __global__ void indexing_backward_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef USE_ROCM
|
||||
template <typename scalar_t>
|
||||
__global__ void indexing_backward_kernel_stride_1(
|
||||
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
|
||||
@ -780,11 +782,43 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
|
||||
kBool,
|
||||
kBFloat16);
|
||||
} else {
|
||||
#ifdef USE_ROCM
|
||||
if (num_indices >= 200000)
|
||||
AT_DISPATCH_V2(
|
||||
expandedValue.scalar_type(),
|
||||
"indexing_backward_many_indices",
|
||||
AT_WRAP([&] {
|
||||
indexing_backward_kernel_many_indices<scalar_t, UNROLL><<<new_grid, block, smem_dups_size, stream>>>(
|
||||
sorted_indices.const_data_ptr<int64_t>(),
|
||||
orig_indices.const_data_ptr<int64_t>(),
|
||||
expandedValue.const_data_ptr<scalar_t>(),
|
||||
src_.mutable_data_ptr<scalar_t>(),
|
||||
num_indices,
|
||||
sliceSize,
|
||||
strideBefore,
|
||||
nElemBefore,
|
||||
accumulate);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}),
|
||||
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
|
||||
// AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
// TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
|
||||
// should not be supported here, then reenable AT_FLOAT8_DTYPES
|
||||
kFloat8_e4m3fn,
|
||||
kFloat8_e5m2,
|
||||
kFloat8_e4m3fnuz,
|
||||
kFloat8_e5m2fnuz,
|
||||
kComplexHalf,
|
||||
kHalf,
|
||||
kBool,
|
||||
kBFloat16);
|
||||
else
|
||||
#endif
|
||||
AT_DISPATCH_V2(
|
||||
expandedValue.scalar_type(),
|
||||
"indexing_backward",
|
||||
AT_WRAP([&] {
|
||||
indexing_backward_kernel<scalar_t, UNROLL><<<KERNEL_GRID, block, KERNEL_SMEM, stream>>>(
|
||||
indexing_backward_kernel<scalar_t, UNROLL><<<grid, block, 0, stream>>>(
|
||||
sorted_indices.const_data_ptr<int64_t>(),
|
||||
orig_indices.const_data_ptr<int64_t>(),
|
||||
expandedValue.const_data_ptr<scalar_t>(),
|
||||
|
@ -121,7 +121,7 @@ void cufft_set_plan_cache_max_size_impl(DeviceIndex device_index, int64_t max_si
|
||||
"cufft_set_plan_cache_max_size: expected 0 <= device_index < ",
|
||||
at::detail::getCUDAHooks().deviceCount(), "], but got device_index=",
|
||||
device_index);
|
||||
return cufft_get_plan_cache(device_index).resize(max_size);
|
||||
cufft_get_plan_cache(device_index).resize(max_size);
|
||||
}
|
||||
|
||||
int64_t cufft_get_plan_cache_size_impl(DeviceIndex device_index) {
|
||||
@ -137,7 +137,7 @@ void cufft_clear_plan_cache_impl(DeviceIndex device_index) {
|
||||
"cufft_clear_plan_cache: expected 0 <= device_index < ",
|
||||
at::detail::getCUDAHooks().deviceCount(), "], but got device_index=",
|
||||
device_index);
|
||||
return cufft_get_plan_cache(device_index).clear();
|
||||
cufft_get_plan_cache(device_index).clear();
|
||||
}
|
||||
|
||||
} // namespace at::native::detail
|
||||
|
@ -230,7 +230,7 @@ constexpr int BLOCK_THREADS = 256;
|
||||
constexpr int RADIX_BITS = 8;
|
||||
constexpr int RADIX_DIGITS = 1 << RADIX_BITS; // 2 ^ RADIX_BITS
|
||||
constexpr int RADIX_MASK = (RADIX_DIGITS - 1);
|
||||
static_assert(RADIX_DIGITS <= BLOCK_THREADS, "radixFindKthValues kernel requires RADIX_DIGITS <= BLOCK_THREADS");
|
||||
static_assert(RADIX_DIGITS <= BLOCK_THREADS, "RADIX_DIGITS must be <= BLOCK_THREADS");
|
||||
constexpr int MIN_ITEMS_PER_THREAD = 4;
|
||||
constexpr int MAX_ITEMS_PER_THREAD = 64;
|
||||
|
||||
@ -242,11 +242,10 @@ __global__ void fill(T* x, T value, IndexType size) {
|
||||
}
|
||||
}
|
||||
|
||||
// find the kth smallest value,
|
||||
// for largest topk, k_to_find = slice_size - k + 1
|
||||
// compute local histogram for each block
|
||||
template <typename T, typename IndexType, typename Bitwise, int Dim>
|
||||
C10_LAUNCH_BOUNDS_1(BLOCK_THREADS)
|
||||
__global__ void radixFindKthValues(
|
||||
__global__ void computeBlockDigitCounts(
|
||||
at::cuda::detail::TensorInfo<const T, IndexType> input,
|
||||
uint32_t slice_size,
|
||||
uint32_t* ks_to_find, // size: num_slices, unused arg but for mysterious reasons perf is better when it's present
|
||||
@ -321,12 +320,51 @@ __global__ void radixFindKthValues(
|
||||
}
|
||||
}
|
||||
|
||||
// compute global histogram and cumsum for each row
|
||||
__global__ void computeDigitCumSum(
|
||||
short* counts,
|
||||
uint32_t* digit_cum_sum,
|
||||
uint32_t blocks_per_slice) {
|
||||
int tidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int digit_idx = threadIdx.x;
|
||||
uint32_t slice_idx = blockIdx.x;
|
||||
|
||||
typedef cub::BlockScan<uint32_t, RADIX_DIGITS> BlockScan;
|
||||
__shared__ typename BlockScan::TempStorage scan_storage;
|
||||
// accumulates counters from multiple blocks
|
||||
uint32_t digit_count = 0;
|
||||
if (threadIdx.x < RADIX_DIGITS) {
|
||||
constexpr int HISTO_ACCUM_TILE = 4;
|
||||
uint32_t rounds = blocks_per_slice / HISTO_ACCUM_TILE;
|
||||
for (int iter = 0; iter < rounds; iter++) {
|
||||
int base = HISTO_ACCUM_TILE * iter;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < HISTO_ACCUM_TILE; j++) {
|
||||
int blk = base + j;
|
||||
digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + digit_idx];
|
||||
}
|
||||
}
|
||||
for (int blk = HISTO_ACCUM_TILE * rounds; blk < blocks_per_slice; blk++) {
|
||||
digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + digit_idx];
|
||||
}
|
||||
|
||||
}
|
||||
// compute the block-wide inclusive prefix sum
|
||||
uint32_t digit_count_cumsum;
|
||||
BlockScan(scan_storage).InclusiveSum(digit_count, digit_count_cumsum);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < RADIX_DIGITS) {
|
||||
digit_cum_sum[tidx] = digit_count_cumsum;
|
||||
}
|
||||
}
|
||||
|
||||
// Assumption: k can not be larger than UINT32_MAX
|
||||
template <typename Bitwise, typename T>
|
||||
C10_LAUNCH_BOUNDS_1(RADIX_DIGITS) // one thread per digit
|
||||
__global__ void computeBlockwiseWithinKCounts(
|
||||
Bitwise* desires_in, // size: num_slices
|
||||
short* counts, // size: num_slices * blocks_per_slice * radix_digits
|
||||
uint32_t* digit_cum_sum,
|
||||
uint32_t* ks_to_find_in, // size: num_slices
|
||||
uint32_t blocks_per_slice,
|
||||
int current_bit,
|
||||
@ -338,7 +376,7 @@ __global__ void computeBlockwiseWithinKCounts(
|
||||
Bitwise* desires_out,
|
||||
uint32_t num_blocks
|
||||
) {
|
||||
// This kernel should be launched with the same number of blocks as the `radixFindKthValues` kernel.
|
||||
// This kernel should be launched with the same number of blocks as the `computeBlockDigitCounts` kernel.
|
||||
int tidx = threadIdx.x;
|
||||
uint32_t block_idx = getLinearBlockId<uint32_t>();
|
||||
uint32_t slice_idx = block_idx / blocks_per_slice;
|
||||
@ -351,36 +389,15 @@ __global__ void computeBlockwiseWithinKCounts(
|
||||
if (block_idx >= num_blocks) {
|
||||
return;
|
||||
}
|
||||
typedef cub::BlockScan<uint32_t, BLOCK_THREADS> BlockScan;
|
||||
union __align__(16) TempStorage {
|
||||
uint32_t digit_count_cumsum[RADIX_DIGITS]; // only used if this it the last block for this slice
|
||||
typename BlockScan::TempStorage scan_storage;
|
||||
};
|
||||
__shared__ TempStorage temp_storage;
|
||||
|
||||
// accumulates counters from multiple blocks
|
||||
uint32_t digit_count = 0;
|
||||
if (tidx < RADIX_DIGITS) {
|
||||
for (int blk = 0; blk < blocks_per_slice; ++blk) {
|
||||
digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + tidx];
|
||||
}
|
||||
}
|
||||
|
||||
// compute the block-wide inclusive prefix sum
|
||||
uint32_t digit_count_cumsum;
|
||||
BlockScan(temp_storage.scan_storage).InclusiveSum(digit_count, digit_count_cumsum);
|
||||
__syncthreads();
|
||||
// every thread also need the perfix_sum of it's left value for comparison, so save a copy in shared mem
|
||||
if (tidx < RADIX_DIGITS) {
|
||||
temp_storage.digit_count_cumsum[tidx] = digit_count_cumsum;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
__shared__ Bitwise desired;
|
||||
uint32_t k_to_find = ks_to_find_in[slice_idx];
|
||||
|
||||
if (tidx < RADIX_DIGITS) {
|
||||
uint32_t digit_count_cumsum_left = (tidx == 0) ? 0 : temp_storage.digit_count_cumsum[tidx - 1];
|
||||
uint32_t position = slice_idx * RADIX_DIGITS + tidx;
|
||||
uint32_t digit_count_cumsum = digit_cum_sum[position];
|
||||
uint32_t digit_count_cumsum_left = (tidx == 0) ? 0 : digit_cum_sum[position - 1];
|
||||
|
||||
// if not the last pass: update desired and ks_to_find
|
||||
// if last pass: write out the kth value
|
||||
@ -466,7 +483,7 @@ template <typename Bitwise>
|
||||
__global__ void computeBlockwiseKthCounts(
|
||||
Bitwise* desires, // size: num_slices
|
||||
short* counts, // size: num_slices * blocks_per_slice * radix_digits
|
||||
uint32_t num_blocks, // the number of blocks used by `radixFindKthValues` kernel
|
||||
uint32_t num_blocks, // the number of blocks used by `computeBlockDigitCounts` kernel
|
||||
uint32_t blocks_per_slice,
|
||||
// outputs:
|
||||
uint32_t* kthCounts // size: num_slices * blocks_per_slice == num_blocks
|
||||
@ -649,9 +666,7 @@ void launch(
|
||||
T* kthValues = reinterpret_cast<T*>(kthValues_buffer.get());
|
||||
|
||||
TORCH_CHECK(blocks_per_slice <= std::numeric_limits<uint32_t>::max(), "blocks_per_slice larger than uint32 maximum is not supported");
|
||||
auto semaphores_buffer = allocator.allocate(numInputSlices * sizeof(uint32_t));
|
||||
uint32_t* semaphores = reinterpret_cast<uint32_t*>(semaphores_buffer.get());
|
||||
AT_CUDA_CHECK(cudaMemsetAsync(semaphores, 0, numInputSlices * sizeof(uint32_t), stream));
|
||||
|
||||
|
||||
auto ks_to_find_buffer = allocator.allocate(2 * numInputSlices * sizeof(uint32_t));
|
||||
uint32_t* ks_to_find = reinterpret_cast<uint32_t*>(ks_to_find_buffer.get());
|
||||
@ -668,6 +683,10 @@ void launch(
|
||||
static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits<short>::max(),
|
||||
"blockwise counter too large");
|
||||
|
||||
auto digit_cum_sum_buffer = allocator.allocate(numInputSlices * RADIX_DIGITS * sizeof(uint32_t));
|
||||
uint32_t* digit_cum_sum = reinterpret_cast<uint32_t*>(digit_cum_sum_buffer.get());
|
||||
AT_CUDA_CHECK(cudaMemsetAsync(digit_cum_sum, 0, numInputSlices * RADIX_DIGITS * sizeof(uint32_t), stream));
|
||||
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
|
||||
uint32_t* withinKCounts = reinterpret_cast<uint32_t*>(withinKCounts_buffer.get());
|
||||
@ -691,7 +710,7 @@ void launch(
|
||||
|
||||
// iterate radix bits for multiple passes
|
||||
for (int current_bit = sizeof(T) * 8 - RADIX_BITS; current_bit >= 0; current_bit -= RADIX_BITS) {
|
||||
radixFindKthValues<T, IndexType, Bitwise, Dim><<<grid, block, 0, stream>>>(
|
||||
computeBlockDigitCounts<T, IndexType, Bitwise, Dim><<<grid, block, 0, stream>>>(
|
||||
input,
|
||||
inputSliceSize,
|
||||
ks_to_find_in, // unused arg
|
||||
@ -704,10 +723,14 @@ void launch(
|
||||
desired_in,
|
||||
counts);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
computeDigitCumSum<<<numInputSlices, RADIX_DIGITS, 0, stream>>>(counts, digit_cum_sum, blocks_per_slice);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
// we unconditionally call this kernel to update desired/ks_to_find/kthValues
|
||||
// if cub supports scan_by_key we additionally do k counts
|
||||
computeBlockwiseWithinKCounts<Bitwise, T><<<grid, RADIX_DIGITS, 0, stream>>>(
|
||||
desired_in, counts, ks_to_find_in, blocks_per_slice, current_bit, largest, withinKCounts, kthValues, ks_to_find_out, desired_out, num_blocks);
|
||||
desired_in, counts, digit_cum_sum, ks_to_find_in, blocks_per_slice, current_bit, largest, withinKCounts, kthValues, ks_to_find_out, desired_out, num_blocks);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
// swap desired/ks_to_find in and out for next iter
|
||||
auto tmp_desired = desired_in;
|
||||
|
@ -1107,10 +1107,14 @@ void ldl_factor_kernel(
|
||||
auto preferred_backend = at::globalContext().linalgPreferredBackend();
|
||||
switch (preferred_backend) {
|
||||
case at::LinalgBackend::Cusolver:
|
||||
return ldl_factor_cusolver(
|
||||
{ ldl_factor_cusolver(
|
||||
LD, pivots, info, upper, hermitian);
|
||||
return;
|
||||
}
|
||||
case at::LinalgBackend::Magma:
|
||||
return ldl_factor_magma(LD, pivots, info, upper, hermitian);
|
||||
{ ldl_factor_magma(LD, pivots, info, upper, hermitian);
|
||||
return;
|
||||
}
|
||||
default:
|
||||
// By default use cusolver if available and magma otherwise.
|
||||
// If cusolver and magma 2.5.4+ are both available and hermitian=true,
|
||||
@ -1122,8 +1126,10 @@ void ldl_factor_kernel(
|
||||
LD, pivots, info, upper, hermitian);
|
||||
}
|
||||
#endif
|
||||
return ldl_factor_cusolver(
|
||||
LD, pivots, info, upper, hermitian);
|
||||
{ ldl_factor_cusolver(
|
||||
LD, pivots, info, upper, hermitian);
|
||||
return;
|
||||
}
|
||||
#else
|
||||
return ldl_factor_magma(LD, pivots, info, upper, hermitian);
|
||||
#endif
|
||||
@ -1839,11 +1845,14 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
|
||||
// For the benchmarks see
|
||||
// https://github.com/pytorch/pytorch/pull/56253#discussion_r622851107
|
||||
if (input.size(-2) <= 256 && batchCount(input) >= std::max<int64_t>(2, input.size(-2) / 16)) {
|
||||
return geqrf_batched_cublas(input, tau);
|
||||
geqrf_batched_cublas(input, tau);
|
||||
return;
|
||||
} else {
|
||||
return geqrf_cusolver(input, tau);
|
||||
geqrf_cusolver(input, tau);
|
||||
return;
|
||||
}
|
||||
return geqrf_batched_cublas(input, tau);
|
||||
geqrf_batched_cublas(input, tau);
|
||||
return;
|
||||
};
|
||||
|
||||
auto preferred_backend = at::globalContext().linalgPreferredBackend();
|
||||
@ -1856,10 +1865,14 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
|
||||
// - ?geqrf_gpu allows fast computation of Q via ?orgqr_gpu, but doesn't give R properly.
|
||||
// - ?geqrf2_gpu gives correct R, but doesn't allow computation of Q via ?orgqr_gpu
|
||||
case at::LinalgBackend::Magma:
|
||||
return geqrf_magma(input, tau);
|
||||
{ geqrf_magma(input, tau);
|
||||
return;
|
||||
}
|
||||
case at::LinalgBackend::Cusolver:
|
||||
default:
|
||||
return geqrf_cusolver_backend(input, tau);
|
||||
{ geqrf_cusolver_backend(input, tau);
|
||||
return;
|
||||
}
|
||||
}
|
||||
#else
|
||||
return geqrf_magma(input, tau);
|
||||
@ -2703,13 +2716,17 @@ void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) {
|
||||
auto preferred_backend = at::globalContext().linalgPreferredBackend();
|
||||
switch (preferred_backend) {
|
||||
case at::LinalgBackend::Magma:
|
||||
return gels_magma(a, b, infos);
|
||||
{ gels_magma(a, b, infos);
|
||||
return;
|
||||
}
|
||||
case at::LinalgBackend::Cusolver:
|
||||
default:
|
||||
// linalg_lstsq_gels is a generic function that is implemented using
|
||||
// geqrf_stub, ormqr_stub, and triangular_solve_stub
|
||||
// It dispatches to cuSOLVER for CUDA inputs if USE_LINALG_SOLVER is defined
|
||||
return linalg_lstsq_gels(a, b, infos);
|
||||
{ linalg_lstsq_gels(a, b, infos);
|
||||
return;
|
||||
}
|
||||
}
|
||||
#else
|
||||
return gels_magma(a, b, infos);
|
||||
|
@ -337,8 +337,7 @@ struct BenchmarkCache {
|
||||
engine_cache_order.begin(), engine_cache_order, it->second.second);
|
||||
}
|
||||
} else {
|
||||
engine_cache.erase(key);
|
||||
engine_cache.emplace(
|
||||
engine_cache.insert_or_assign(
|
||||
key,
|
||||
std::make_pair(results, engine_cache_order.end())); // dummy iterator
|
||||
}
|
||||
|
@ -371,8 +371,7 @@ struct MHAGraphCache {
|
||||
}
|
||||
|
||||
void update(const KeyType& key, T& results) {
|
||||
engine_cache.erase(key);
|
||||
engine_cache.emplace(key, std::move(results));
|
||||
engine_cache.insert_or_assign(key, std::move(results));
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1222,7 +1222,7 @@ cudnnRNNAlgo_t get_algo(
|
||||
}
|
||||
|
||||
cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) {
|
||||
if (dtype == CUDNN_DATA_HALF) {
|
||||
if (dtype == CUDNN_DATA_HALF || dtype == CUDNN_DATA_BFLOAT16) {
|
||||
return CUDNN_DATA_FLOAT;
|
||||
}
|
||||
return dtype;
|
||||
|
@ -772,13 +772,21 @@ void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
|
||||
|
||||
template <>
|
||||
void gemm_internal_ck<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
std::string_view arch(dprops->gcnArchName);
|
||||
if (arch == "gfx1100") {
|
||||
static const std::vector<std::string> wmma_archs = {
|
||||
"gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201",
|
||||
#if ROCM_VERSION >= 70000
|
||||
"gfx1150", "gfx1151"
|
||||
#endif
|
||||
};
|
||||
if (at::detail::getCUDAHooks().isGPUArch(wmma_archs)) {
|
||||
dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGS(at::BFloat16));
|
||||
} else{
|
||||
}
|
||||
else if (at::detail::getCUDAHooks().isGPUArch({"gfx9"})) {
|
||||
dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGS(at::BFloat16));
|
||||
}
|
||||
else {
|
||||
TORCH_CHECK(false, "gemm_internal_ck<at::BFloat16> unsupported gfx arch");
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
@ -599,11 +599,21 @@ void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
|
||||
|
||||
template <>
|
||||
void gemm_internal_ck<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
|
||||
if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) {
|
||||
static const std::vector<std::string> wmma_archs = {
|
||||
"gfx1100", "gfx1101", "gfx1102", "gfx1200", "gfx1201",
|
||||
#if ROCM_VERSION >= 70000
|
||||
"gfx1150", "gfx1151"
|
||||
#endif
|
||||
};
|
||||
if (at::detail::getCUDAHooks().isGPUArch(wmma_archs)) {
|
||||
dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGS(at::Half));
|
||||
} else{
|
||||
}
|
||||
else if (at::detail::getCUDAHooks().isGPUArch({"gfx9"})) {
|
||||
dispatch_half_gemm(CUDABLAS_GEMM_ARGS(at::Half));
|
||||
}
|
||||
else {
|
||||
TORCH_CHECK(false, "gemm_internal_ck<at::Half> unsupported gfx arch");
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
@ -373,59 +373,67 @@ void addmm_out_sparse_csr(
|
||||
if (mat2.layout() == kSparseCsr) {
|
||||
if (result.layout() == kStrided) {
|
||||
// TODO: Add native CSC support via cuSPARSE if supported.
|
||||
return addmm_dense_result(
|
||||
addmm_dense_result(
|
||||
mat2.transpose(0, 1).to_sparse_csr(),
|
||||
mat1.transpose(0, 1),
|
||||
beta,
|
||||
alpha,
|
||||
result.transpose(0, 1));
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (mat2.layout() == kSparseCsc) {
|
||||
if (result.layout() == kStrided) {
|
||||
return addmm_dense_result(
|
||||
addmm_dense_result(
|
||||
mat2.transpose(-2, -1),
|
||||
mat1.transpose(-2, -1),
|
||||
beta,
|
||||
alpha,
|
||||
result.transpose(-2, -1));
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (mat2.layout() == kSparseBsc) {
|
||||
if (result.layout() == kStrided) {
|
||||
return addmm_dense_result(
|
||||
addmm_dense_result(
|
||||
mat2.transpose(-2, -1),
|
||||
mat1.transpose(-2, -1),
|
||||
beta,
|
||||
alpha,
|
||||
result.transpose(-2, -1));
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (mat1.layout() == kSparseCsr) {
|
||||
if (mat2.layout() == kStrided) {
|
||||
if (result.layout() == kStrided) {
|
||||
return addmm_dense_result(mat1, mat2, beta, alpha, result);
|
||||
addmm_dense_result(mat1, mat2, beta, alpha, result);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (mat2.layout() == kSparseCsr) {
|
||||
if (result.layout() == kStrided) {
|
||||
return addmm_sparse_input_dense_result(mat1, mat2, beta, alpha, result);
|
||||
addmm_sparse_input_dense_result(mat1, mat2, beta, alpha, result);
|
||||
return;
|
||||
}
|
||||
if (result.layout() == kSparseCsr) {
|
||||
return addmm_sparse_result(mat1, mat2, beta, alpha, result);
|
||||
addmm_sparse_result(mat1, mat2, beta, alpha, result);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (mat2.layout() == kSparseCsc) {
|
||||
if (result.layout() == kStrided) {
|
||||
// TODO: CSR @ CSC kernel would be very fast due to format alignment
|
||||
return addmm_sparse_input_dense_result(
|
||||
mat1, mat2.to_sparse_csr(), beta, alpha, result);
|
||||
addmm_sparse_input_dense_result(
|
||||
mat1, mat2.to_sparse_csr(), beta, alpha, result);
|
||||
return;
|
||||
}
|
||||
if (result.layout() == kSparseCsr) {
|
||||
// TODO: CSR @ CSC kernel would be very fast due to format alignment
|
||||
return addmm_sparse_result(
|
||||
mat1, mat2.to_sparse_csr(), beta, alpha, result);
|
||||
addmm_sparse_result(
|
||||
mat1, mat2.to_sparse_csr(), beta, alpha, result);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -433,56 +441,62 @@ void addmm_out_sparse_csr(
|
||||
if (mat2.layout() == kStrided) {
|
||||
if (result.layout() == kStrided) {
|
||||
// TODO: avoid csc->csr conversion with native csc support
|
||||
return addmm_dense_result(
|
||||
mat1.to_sparse_csr(), mat2, beta, alpha, result);
|
||||
addmm_dense_result(
|
||||
mat1.to_sparse_csr(), mat2, beta, alpha, result);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (mat2.layout() == kSparseCsr) {
|
||||
if (result.layout() == kSparseCsr) {
|
||||
// TODO: avoid csc->csr conversion with native csc support
|
||||
return addmm_sparse_result(
|
||||
mat1.to_sparse_csr(), mat2, beta, alpha, result);
|
||||
addmm_sparse_result(
|
||||
mat1.to_sparse_csr(), mat2, beta, alpha, result);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (mat2.layout() == kSparseCsc) {
|
||||
if (result.layout() == kStrided) {
|
||||
return addmm_sparse_input_dense_result(
|
||||
mat2.transpose(-2, -1),
|
||||
mat1.transpose(-2, -1),
|
||||
beta,
|
||||
alpha,
|
||||
result.transpose(-2, -1));
|
||||
addmm_sparse_input_dense_result(
|
||||
mat2.transpose(-2, -1),
|
||||
mat1.transpose(-2, -1),
|
||||
beta,
|
||||
alpha,
|
||||
result.transpose(-2, -1));
|
||||
return;
|
||||
}
|
||||
if (result.layout() == kSparseCsr) {
|
||||
// TODO avoid csc->csr
|
||||
return addmm_sparse_result(
|
||||
mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
|
||||
addmm_sparse_result(
|
||||
mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
|
||||
return;
|
||||
}
|
||||
if (result.layout() == kSparseCsc) {
|
||||
return addmm_sparse_result(
|
||||
mat2.transpose(-2, -1),
|
||||
mat1.transpose(-2, -1),
|
||||
beta,
|
||||
alpha,
|
||||
result.transpose(-2, -1));
|
||||
addmm_sparse_result(
|
||||
mat2.transpose(-2, -1),
|
||||
mat1.transpose(-2, -1),
|
||||
beta,
|
||||
alpha,
|
||||
result.transpose(-2, -1));
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (mat1.layout() == kSparseBsr) {
|
||||
if (mat2.layout() == kStrided) {
|
||||
if (result.layout() == kStrided) {
|
||||
return addmm_dense_result(mat1, mat2, beta, alpha, result);
|
||||
addmm_dense_result(mat1, mat2, beta, alpha, result);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"addmm: computation on CPU is not implemented for ",
|
||||
result.layout(),
|
||||
" + ",
|
||||
mat1.layout(),
|
||||
" @ ",
|
||||
mat2.layout());
|
||||
false,
|
||||
"addmm: computation on CPU is not implemented for ",
|
||||
result.layout(),
|
||||
" + ",
|
||||
mat1.layout(),
|
||||
" @ ",
|
||||
mat2.layout());
|
||||
}
|
||||
|
||||
/*
|
||||
@ -496,16 +510,16 @@ void addmm_out_sparse_csr(
|
||||
[out] result of the operation.
|
||||
*/
|
||||
void addmv_out_sparse_csr(
|
||||
const Tensor& mat,
|
||||
const Tensor& vec,
|
||||
const Scalar& beta,
|
||||
const Scalar& alpha,
|
||||
const Tensor& result) {
|
||||
const Tensor& mat,
|
||||
const Tensor& vec,
|
||||
const Scalar& beta,
|
||||
const Scalar& alpha,
|
||||
const Tensor& result) {
|
||||
#if !AT_USE_MKL_SPARSE()
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Calling addmv on a sparse CPU tensor requires Linux platform. ",
|
||||
"Please use PyTorch built with MKL on Linux.");
|
||||
false,
|
||||
"Calling addmv on a sparse CPU tensor requires Linux platform. ",
|
||||
"Please use PyTorch built with MKL on Linux.");
|
||||
#else
|
||||
c10::MaybeOwned<Tensor> result_ = prepare_dense_vector_for_mkl(result);
|
||||
c10::MaybeOwned<Tensor> vec_ = prepare_dense_vector_for_mkl(vec);
|
||||
|
@ -5,38 +5,6 @@
|
||||
# representing ScalarType's. They are now superseded by usage of
|
||||
# `aten::to()`. The ops remain here for backward compatibility purposes.
|
||||
|
||||
# DEPRECATED. DO NOT USE
|
||||
- func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor
|
||||
variants: function
|
||||
|
||||
# DEPRECATED. DO NOT USE
|
||||
- func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor
|
||||
variants: function
|
||||
|
||||
# DEPRECATED. DO NOT USE
|
||||
- func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor
|
||||
variants: function
|
||||
|
||||
# DEPRECATED. DO NOT USE
|
||||
- func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor
|
||||
variants: function
|
||||
|
||||
# DEPRECATED. DO NOT USE
|
||||
- func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor
|
||||
variants: function
|
||||
|
||||
# DEPRECATED. DO NOT USE
|
||||
- func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor
|
||||
variants: function
|
||||
|
||||
# DEPRECATED. DO NOT USE
|
||||
- func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor
|
||||
variants: function
|
||||
|
||||
# DEPRECATED. DO NOT USE
|
||||
- func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor
|
||||
variants: function
|
||||
|
||||
# Computes the gradient of current tensor w.r.t. graph leaves.
|
||||
- func: _backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()
|
||||
manual_cpp_binding: True
|
||||
@ -6725,12 +6693,12 @@
|
||||
|
||||
- func: native_norm(Tensor self, Scalar p=2) -> Tensor
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: norm_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: norm_sparse
|
||||
autogen: native_norm.out
|
||||
|
||||
- func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: norm_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: norm_sparse
|
||||
autogen: native_norm.ScalarOpt_dim_dtype_out
|
||||
|
||||
- func: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
|
||||
@ -6856,14 +6824,14 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: sparse_dtype_norm
|
||||
SparseCPU, SparseCUDA, SparseMPS: sparse_dtype_norm
|
||||
|
||||
- func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
|
||||
structured_delegate: norm.out
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: sparse_norm
|
||||
SparseCPU, SparseCUDA, SparseMPS: sparse_norm
|
||||
|
||||
- func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
|
@ -810,7 +810,8 @@ void addmm_out_sparse_csr(
|
||||
if (mat1.layout() == kSparseBsr) {
|
||||
if (mat2.layout() == kStrided) {
|
||||
if (result.layout() == kStrided)
|
||||
return block_sparse_mm(input, mat1, mat2, beta, alpha, result);
|
||||
{ block_sparse_mm(input, mat1, mat2, beta, alpha, result); return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -819,13 +820,13 @@ void addmm_out_sparse_csr(
|
||||
if (result.layout() == kStrided) {
|
||||
auto result_t = result.transpose(-2, -1);
|
||||
auto input_t = (result.is_same(input) ? result_t : input.transpose(-2, -1));
|
||||
return block_sparse_mm(
|
||||
block_sparse_mm(
|
||||
input_t,
|
||||
mat2.transpose(-2, -1),
|
||||
mat1.transpose(-2, -1),
|
||||
beta,
|
||||
alpha,
|
||||
result_t);
|
||||
result_t); return;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -840,41 +841,41 @@ void addmm_out_sparse_csr(
|
||||
if (mat2.layout() == kSparseCsr) {
|
||||
if (result.layout() == kStrided) {
|
||||
// TODO: Add native CSC support via cuSPARSE if supported.
|
||||
return spmm(
|
||||
spmm(
|
||||
mat2.transpose(0, 1).to_sparse_csr(),
|
||||
mat1.transpose(0, 1),
|
||||
beta,
|
||||
alpha,
|
||||
result.transpose(0, 1));
|
||||
result.transpose(0, 1)); return;
|
||||
}
|
||||
}
|
||||
if (mat2.layout() == kSparseCsc) {
|
||||
if (result.layout() == kStrided) {
|
||||
return spmm(
|
||||
spmm(
|
||||
mat2.transpose(-2, -1),
|
||||
mat1.transpose(-2, -1),
|
||||
beta,
|
||||
alpha,
|
||||
result.transpose(-2, -1));
|
||||
result.transpose(-2, -1)); return;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (mat1.layout() == kSparseCsr) {
|
||||
if (mat2.layout() == kStrided) {
|
||||
if (result.layout() == kStrided) {
|
||||
return spmm(mat1, mat2, beta, alpha, result);
|
||||
spmm(mat1, mat2, beta, alpha, result); return;
|
||||
}
|
||||
}
|
||||
if (mat2.layout() == kSparseCsr) {
|
||||
if (result.layout() == kSparseCsr) {
|
||||
return spgemm(mat1, mat2, beta, alpha, result);
|
||||
spgemm(mat1, mat2, beta, alpha, result); return;
|
||||
}
|
||||
}
|
||||
if (mat2.layout() == kSparseCsc) {
|
||||
if (result.layout() == kSparseCsr) {
|
||||
// TODO: Add native CSC support via cuSPARSE if supported.
|
||||
// CSR @ CSC kernel would be very fast due to format alignment
|
||||
return spgemm(mat1, mat2.to_sparse_csr(), beta, alpha, result);
|
||||
spgemm(mat1, mat2.to_sparse_csr(), beta, alpha, result); return;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -882,27 +883,28 @@ void addmm_out_sparse_csr(
|
||||
if (mat2.layout() == kStrided) {
|
||||
if (result.layout() == kStrided) {
|
||||
// TODO: Add native CSC support via cuSPARSE if supported.
|
||||
return spmm(mat1.to_sparse_csr(), mat2, beta, alpha, result);
|
||||
spmm(mat1.to_sparse_csr(), mat2, beta, alpha, result); return;
|
||||
}
|
||||
}
|
||||
if (mat2.layout() == kSparseCsr) {
|
||||
if (result.layout() == kSparseCsr)
|
||||
// TODO: Add native CSC support via cuSPARSE if supported.
|
||||
return spgemm(mat1.to_sparse_csr(), mat2, beta, alpha, result);
|
||||
{ spgemm(mat1.to_sparse_csr(), mat2, beta, alpha, result); return;
|
||||
}
|
||||
}
|
||||
if (mat2.layout() == kSparseCsc) {
|
||||
if (result.layout() == kSparseCsr) {
|
||||
// TODO: Add native CSC support via cuSPARSE if supported.
|
||||
return spgemm(
|
||||
mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result);
|
||||
spgemm(
|
||||
mat1.to_sparse_csr(), mat2.to_sparse_csr(), beta, alpha, result); return;
|
||||
}
|
||||
if (result.layout() == kSparseCsc) {
|
||||
return spgemm(
|
||||
spgemm(
|
||||
mat2.transpose(-2, -1),
|
||||
mat1.transpose(-2, -1),
|
||||
beta,
|
||||
alpha,
|
||||
result.transpose(-2, -1));
|
||||
result.transpose(-2, -1)); return;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -933,7 +935,7 @@ void addmv_out_sparse_csr(
|
||||
const Scalar& alpha,
|
||||
const Tensor& result) {
|
||||
if (mat.layout() == kSparseBsr) {
|
||||
return block_sparse_mv(mat, vec, beta, alpha, result);
|
||||
block_sparse_mv(mat, vec, beta, alpha, result); return;
|
||||
}
|
||||
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
|
||||
|
||||
@ -1213,9 +1215,9 @@ void triangular_solve_out_sparse_csr(
|
||||
}
|
||||
if (A.layout() == kSparseBsr) {
|
||||
if (B.size(-1) == 1) {
|
||||
return block_sparse_triangular_solve_vec(A, B, X, upper, transpose, unitriangular);
|
||||
block_sparse_triangular_solve_vec(A, B, X, upper, transpose, unitriangular); return;
|
||||
} else {
|
||||
return block_sparse_triangular_solve_mat(A, B, X, upper, transpose, unitriangular);
|
||||
block_sparse_triangular_solve_mat(A, B, X, upper, transpose, unitriangular); return;
|
||||
}
|
||||
}
|
||||
#ifdef USE_ROCM
|
||||
|
@ -117,7 +117,7 @@ class FwdKernel:
|
||||
def get_all(cls) -> list["FwdKernel"]:
|
||||
kernels: list[FwdKernel] = []
|
||||
for aligned, dtype, (sm, sm_max) in itertools.product(
|
||||
[True, False], DTYPES.keys(), zip(SM, SM[1:])
|
||||
[True, False], DTYPES.keys(), itertools.pairwise(SM)
|
||||
):
|
||||
# Remove some kernels we don't use
|
||||
if dtype == "bf16" and sm < 80:
|
||||
@ -228,7 +228,7 @@ class BwdKernel:
|
||||
for aligned, dtype, (sm, sm_max), apply_dropout, max_k in itertools.product(
|
||||
[True, False],
|
||||
DTYPES.keys(),
|
||||
zip(SM, SM[1:]),
|
||||
itertools.pairwise(SM),
|
||||
[True, False],
|
||||
[32, 64, 128, 2**16],
|
||||
):
|
||||
|
191
benchmarks/distributed/bench_nvshmem_tile_reduce.py
Normal file
191
benchmarks/distributed/bench_nvshmem_tile_reduce.py
Normal file
@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Benchmark for NVSHMEM tile reduce operations.
|
||||
|
||||
Usage:
|
||||
python benchmarks/distributed/bench_nvshmem_tile_reduce.py
|
||||
|
||||
This benchmark measures the performance of tile reduce operations across different
|
||||
matrix sizes and tile configurations.
|
||||
"""
|
||||
|
||||
import time
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.distributed._symmetric_memory as symm_mem
|
||||
from torch.testing._internal.common_distributed import MultiProcContinuousTest
|
||||
from torch.testing._internal.common_utils import (
|
||||
requires_cuda_p2p_access,
|
||||
skip_but_pass_in_sandcastle_if,
|
||||
skipIfRocm,
|
||||
)
|
||||
|
||||
|
||||
# Decorator
|
||||
def requires_nvshmem():
|
||||
return skip_but_pass_in_sandcastle_if(
|
||||
not symm_mem.is_nvshmem_available(),
|
||||
"bench_nvshmem_tile_reduce requires NVSHMEM, skipping benchmark",
|
||||
)
|
||||
|
||||
|
||||
# So that benchmarks are written in device-agnostic way
|
||||
device_type = "cuda"
|
||||
device_module = torch.get_device_module(device_type)
|
||||
|
||||
|
||||
@requires_nvshmem()
@requires_cuda_p2p_access()
class NVSHMEMTileReduceBenchmark(MultiProcContinuousTest):
    """
    Benchmark of ``torch.ops.symm_mem.tile_reduce`` on symmetric memory.

    Every rank runs the same reduction on a square tile taken from the
    top-left corner of a square matrix; rank 0 prints the measurements.
    """

    def _init_device(self) -> None:
        # TODO: drop this explicit set_device call once it is no longer
        # required (the benchmark seems to hang without it)
        device_module.set_device(self.device)
        # Set NVSHMEM as SymmMem backend
        symm_mem.set_backend("NVSHMEM")

    @property
    def device(self) -> torch.device:
        """Device of type ``device_type`` indexed by this process's rank."""
        return torch.device(device_type, self.rank)

    def _benchmark_tile_reduce_single(
        self,
        full_size: int,
        tile_size: int,
        warmup_iters: int = 5,
        bench_iters: int = 10,
    ) -> dict:
        """
        Benchmark a single configuration of tile reduce.

        Args:
            full_size: Size of the full matrix (full_size x full_size)
            tile_size: Size of the reduced tile (tile_size x tile_size), taken
                from the top-left corner of the full matrix
            warmup_iters: Number of warmup iterations
            bench_iters: Number of benchmark iterations

        Returns:
            Dictionary with benchmark results

        Raises:
            ValueError: if ``bench_iters`` is smaller than 1 or ``tile_size``
                is larger than ``full_size``
        """
        if bench_iters < 1:
            raise ValueError(f"bench_iters must be >= 1, got {bench_iters}")
        if tile_size > full_size:
            # Slicing would silently clamp the tile to the matrix, and the
            # reported bytes/throughput would then describe a tile that was
            # never reduced.
            raise ValueError(
                f"tile_size ({tile_size}) must not exceed full_size ({full_size})"
            )

        self._init_device()
        group_name = dist.group.WORLD.group_name
        symm_mem.enable_symm_mem_for_group(group_name)

        dtype = torch.float

        # Allocate full matrices
        full_inp = symm_mem.empty(
            full_size, full_size, dtype=dtype, device=self.device
        ).fill_(self.rank)
        full_out = symm_mem.empty(
            full_size, full_size, dtype=dtype, device=self.device
        ).fill_(0)

        slice_ut = slice(0, tile_size)
        inp_tile = full_inp[slice_ut, slice_ut]
        out_tile = full_out[slice_ut, slice_ut]

        root = 0

        # Warmup iterations
        for _ in range(warmup_iters):
            torch.ops.symm_mem.tile_reduce(inp_tile, out_tile, root, group_name)
            torch.cuda.synchronize(self.device)

        # Benchmark iterations: all iterations are timed together, so exactly
        # one sample (the average time per iteration) is recorded.
        times = []

        dist.barrier()
        torch.cuda.synchronize(self.device)
        start_time = time.perf_counter()

        for _ in range(bench_iters):
            torch.ops.symm_mem.tile_reduce(inp_tile, out_tile, root, group_name)

        torch.cuda.synchronize(self.device)
        end_time = time.perf_counter()
        times.append((end_time - start_time) / bench_iters)

        # Calculate statistics
        samples = torch.tensor(times, dtype=torch.float64)
        mean_time = samples.mean().item()
        # The standard deviation of a single sample is NaN; report 0.0 instead
        # so the printed "mean ± std" stays readable.
        std_time = samples.std().item() if samples.numel() > 1 else 0.0

        tile_elements = tile_size * tile_size
        tile_bytes = (
            tile_elements * dtype.itemsize
            if hasattr(dtype, "itemsize")
            else tile_elements * 4
        )

        results = {
            "full_size": full_size,
            "tile_size": tile_size,
            "tile_elements": tile_elements,
            "tile_bytes": tile_bytes,
            "world_size": self.world_size,
            "mean_time_ms": mean_time * 1000,
            "std_time_ms": std_time * 1000,
            "min_time_ms": samples.min().item() * 1000,
            "max_time_ms": samples.max().item() * 1000,
            "throughput_gb_s": tile_bytes / (mean_time * 1e9),
            "elements_per_sec": tile_elements / mean_time,
        }

        return results

    @skipIfRocm
    def test_benchmark_tile_reduce_various_sizes(self) -> None:
        """
        Benchmark tile reduce across various tile sizes of one fixed-size matrix.
        """
        # The full matrix is as large as the largest tile under test
        tile_sizes = [512, 1024, 2048, 4096, 8192, 16384]
        full_size = tile_sizes[-1]
        warmup_iters = 5
        bench_iters = 20

        results = []

        for tile_size in tile_sizes:
            try:
                result = self._benchmark_tile_reduce_single(
                    full_size, tile_size, warmup_iters, bench_iters
                )
                results.append(result)

                if self.rank == 0:
                    print(
                        f"Matrix Size: {full_size}x{full_size}, Tile Size: {tile_size}x{tile_size}"
                    )
                    print(
                        f" Mean Time: {result['mean_time_ms']:.3f} ± {result['std_time_ms']:.3f} ms"
                    )
                    print(f" Throughput: {result['throughput_gb_s']:.2f} GB/s")
                    print(f" Bytes: {result['tile_bytes']:.0f}")
                    print()

            except Exception as e:
                # Best effort: report the failing configuration and move on
                if self.rank == 0:
                    print(
                        f"Failed to benchmark tile size {tile_size} "
                        f"(matrix size {full_size}): {e}"
                    )

        # Print summary
        if self.rank == 0 and results:
            print("=== BENCHMARK SUMMARY ===")
            print(
                f"{'Matrix Size':<12} {'Tile Size':<10} {'Time (ms)':<12} {'Throughput (GB/s)':<18} {'Bytes':<15}"
            )
            print("-" * 70)

            for result in results:
                # Pad the complete "NxN" label so the columns line up with
                # the header widths above
                matrix_label = f"{result['full_size']}x{result['full_size']}"
                tile_label = f"{result['tile_size']}x{result['tile_size']}"
                print(
                    f"{matrix_label:<12} "
                    f"{tile_label:<10} "
                    f"{result['mean_time_ms']:<12.3f} "
                    f"{result['throughput_gb_s']:<18.2f} "
                    f"{result['tile_bytes']:<15.0f}"
                )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# For standalone usage, you'd need to set up distributed environment
|
||||
# For now, this is meant to be run via the PyTorch test framework
|
||||
from torch.testing._internal.common_utils import run_tests
|
||||
|
||||
run_tests()
|
@ -25,15 +25,6 @@ drq
|
||||
fambench_dlrm
|
||||
fambench_xlmr
|
||||
fastNLP_Bert
|
||||
hf_Albert
|
||||
hf_Bart
|
||||
hf_Bert
|
||||
hf_BigBird
|
||||
hf_DistilBert
|
||||
hf_GPT2
|
||||
hf_Longformer
|
||||
hf_Reformer
|
||||
hf_T5
|
||||
maml
|
||||
maml_omniglot
|
||||
mnasnet1_0
|
||||
@ -60,13 +51,6 @@ soft_actor_critic
|
||||
speech_transformer
|
||||
squeezenet1_1
|
||||
tacotron2
|
||||
timm_efficientdet
|
||||
timm_efficientnet
|
||||
timm_nfnet
|
||||
timm_regnet
|
||||
timm_resnest
|
||||
timm_vision_transformer
|
||||
timm_vovnet
|
||||
tts_angular
|
||||
vgg16
|
||||
vision_maskrcnn
|
||||
|
@ -23,7 +23,6 @@ TORCHBENCH_MODELS: list[str] = [
|
||||
"resnet50",
|
||||
"moco",
|
||||
"llama",
|
||||
"hf_T5",
|
||||
]
|
||||
HUGGINGFACE_MODELS: list[str] = [
|
||||
"AllenaiLongformerBase",
|
||||
|
@ -10,9 +10,7 @@ import pandas as pd
|
||||
|
||||
flaky_models = {
|
||||
"yolov3",
|
||||
"gluon_inception_v3",
|
||||
"detectron2_maskrcnn_r_101_c4",
|
||||
"timm_efficientnet", # see https://github.com/pytorch/pytorch/issues/148699
|
||||
"XGLMForCausalLM", # discovered in https://github.com/pytorch/pytorch/pull/128148
|
||||
"moondream", # discovered in https://github.com/pytorch/pytorch/pull/159291
|
||||
# discovered in https://github.com/pytorch/pytorch/issues/161419. It's not flaky but really hard to repro, so skipping it
|
||||
@ -36,8 +34,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
|
||||
{
|
||||
"Background_Matting",
|
||||
"alexnet",
|
||||
"cait_m36_384",
|
||||
"dla102",
|
||||
"demucs",
|
||||
"densenet121",
|
||||
"detectron2_fcos_r_50_fpn",
|
||||
@ -45,13 +41,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
|
||||
"doctr_reco_predictor",
|
||||
"dpn107",
|
||||
"fbnetv3_b",
|
||||
"hf_BigBird",
|
||||
"hf_Longformer",
|
||||
"hf_Reformer",
|
||||
"hf_Roberta_base",
|
||||
"hf_T5",
|
||||
"hf_T5_base",
|
||||
"hf_T5_generate",
|
||||
"levit_128",
|
||||
"llava",
|
||||
"microbench_unbacked_tolist_sum",
|
||||
@ -70,12 +59,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
|
||||
"stable_diffusion_text_encoder",
|
||||
"stable_diffusion_unet",
|
||||
"swsl_resnext101_32x16d",
|
||||
"timm_efficientdet",
|
||||
"timm_efficientnet",
|
||||
"timm_nfnet",
|
||||
"timm_regnet",
|
||||
"timm_resnest",
|
||||
"timm_vovnet",
|
||||
"torchrec_dlrm",
|
||||
"vgg16",
|
||||
# LLM
|
||||
|
@ -10,7 +10,6 @@ import pandas as pd
|
||||
|
||||
flaky_models = {
|
||||
"yolov3",
|
||||
"gluon_inception_v3",
|
||||
"detectron2_maskrcnn_r_101_c4",
|
||||
"XGLMForCausalLM", # discovered in https://github.com/pytorch/pytorch/pull/128148
|
||||
"detectron2_fcos_r_50_fpn",
|
||||
@ -32,18 +31,11 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
|
||||
flaky_models.update(
|
||||
{
|
||||
"alexnet",
|
||||
"cait_m36_384",
|
||||
"demucs",
|
||||
"densenet121",
|
||||
"detectron2_fcos_r_50_fpn",
|
||||
"doctr_det_predictor",
|
||||
"doctr_reco_predictor",
|
||||
"hf_BigBird",
|
||||
"hf_Longformer",
|
||||
"hf_Reformer",
|
||||
"hf_Roberta_base",
|
||||
"hf_T5",
|
||||
"hf_T5_base",
|
||||
"levit_128",
|
||||
"llava",
|
||||
"microbench_unbacked_tolist_sum",
|
||||
@ -54,7 +46,6 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
|
||||
"stable_diffusion_text_encoder",
|
||||
"stable_diffusion_unet",
|
||||
"timm_efficientdet",
|
||||
"timm_nfnet",
|
||||
"torchrec_dlrm",
|
||||
"vgg16",
|
||||
# LLM
|
||||
|
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
AlbertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
AllenaiLongformerBase,pass,4
|
||||
|
||||
|
||||
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
BertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
BertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotForCausalLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
CamemBert,pass,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForMaskedLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistillGPT2,pass,2
|
||||
|
||||
|
||||
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
ElectraForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
GPT2ForSequenceClassification,pass,0
|
||||
|
||||
|
||||
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
LayoutLMForSequenceClassification,pass,0
|
||||
|
||||
|
||||
|
||||
M2M100ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
MT5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MegatronBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,0
|
||||
|
||||
|
||||
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PLBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
T5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
|
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
|
||||
|
||||
|
||||
|
||||
AlbertForQuestionAnswering,pass,5
|
||||
|
||||
|
||||
|
||||
AllenaiLongformerBase,pass,9
|
||||
|
||||
|
||||
@ -18,50 +14,22 @@ BartForCausalLM,pass,6
|
||||
|
||||
|
||||
|
||||
BartForConditionalGeneration,pass,8
|
||||
|
||||
|
||||
|
||||
BertForMaskedLM,pass,5
|
||||
|
||||
|
||||
|
||||
BertForQuestionAnswering,pass,5
|
||||
|
||||
|
||||
|
||||
BlenderbotForCausalLM,eager_fail_to_run,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForCausalLM,pass,6
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForConditionalGeneration,pass,8
|
||||
|
||||
|
||||
|
||||
CamemBert,pass,5
|
||||
|
||||
|
||||
|
||||
DebertaV2ForMaskedLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
|
||||
|
||||
|
||||
|
||||
DistilBertForMaskedLM,pass,5
|
||||
|
||||
|
||||
|
||||
DistilBertForQuestionAnswering,pass,5
|
||||
|
||||
|
||||
|
||||
DistillGPT2,pass,7
|
||||
|
||||
|
||||
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
|
||||
|
||||
|
||||
|
||||
ElectraForQuestionAnswering,pass,5
|
||||
|
||||
|
||||
|
||||
GPT2ForSequenceClassification,pass,6
|
||||
|
||||
|
||||
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
|
||||
|
||||
|
||||
|
||||
LayoutLMForSequenceClassification,pass,6
|
||||
|
||||
|
||||
|
||||
M2M100ForConditionalGeneration,pass,4
|
||||
|
||||
|
||||
@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
|
||||
|
||||
|
||||
|
||||
MBartForConditionalGeneration,pass,8
|
||||
|
||||
|
||||
|
||||
MT5ForConditionalGeneration,pass,5
|
||||
|
||||
|
||||
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
|
||||
|
||||
|
||||
|
||||
MegatronBertForQuestionAnswering,pass,5
|
||||
|
||||
|
||||
|
||||
MobileBertForMaskedLM,pass,3
|
||||
|
||||
|
||||
|
||||
MobileBertForQuestionAnswering,pass,3
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,8
|
||||
|
||||
|
||||
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
|
||||
|
||||
|
||||
|
||||
PLBartForConditionalGeneration,pass,8
|
||||
|
||||
|
||||
|
||||
PegasusForCausalLM,pass,6
|
||||
|
||||
|
||||
|
||||
PegasusForConditionalGeneration,pass,7
|
||||
|
||||
|
||||
|
||||
RobertaForCausalLM,pass,5
|
||||
|
||||
|
||||
|
||||
RobertaForQuestionAnswering,pass,5
|
||||
|
||||
|
||||
|
||||
T5ForConditionalGeneration,pass,5
|
||||
|
||||
|
||||
|
|
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
botnet26t_256,pass,0
|
||||
|
||||
|
||||
|
||||
cait_m36_384,pass,0
|
||||
|
||||
|
||||
|
||||
coat_lite_mini,pass,0
|
||||
|
||||
|
||||
|
||||
convit_base,pass,0
|
||||
|
||||
|
||||
|
||||
convmixer_768_32,pass,0
|
||||
|
||||
|
||||
|
||||
convnext_base,pass,0
|
||||
|
||||
|
||||
|
||||
crossvit_9_240,pass,0
|
||||
|
||||
|
||||
|
||||
cspdarknet53,pass,0
|
||||
|
||||
|
||||
|
||||
deit_base_distilled_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
dla102,pass,0
|
||||
|
||||
|
||||
|
||||
dm_nfnet_f0,pass,0
|
||||
|
||||
|
||||
|
||||
dpn107,pass,0
|
||||
|
||||
|
||||
|
||||
eca_botnext26ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
eca_halonext26ts,pass,0
|
||||
|
||||
|
||||
|
||||
ese_vovnet19b_dw,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetc_100,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetv3_b,pass,0
|
||||
|
||||
|
||||
|
||||
gernet_l,pass,0
|
||||
|
||||
|
||||
|
||||
ghostnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
gluon_inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
gmixer_24_224,pass,0
|
||||
|
||||
|
||||
|
||||
gmlp_s16_224,pass,0
|
||||
|
||||
|
||||
|
||||
hrnet_w18,pass,0
|
||||
|
||||
|
||||
|
||||
inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
jx_nest_base,pass,0
|
||||
|
||||
|
||||
|
||||
lcnet_050,pass,0
|
||||
|
||||
|
||||
|
||||
levit_128,pass,0
|
||||
|
||||
|
||||
|
||||
mixer_b16_224,pass,0
|
||||
|
||||
|
||||
|
||||
mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
mnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
mobilenetv2_100,pass,0
|
||||
|
||||
|
||||
@ -146,100 +42,16 @@ nfnet_l0,pass,0
|
||||
|
||||
|
||||
|
||||
pit_b_224,pass,0
|
||||
|
||||
|
||||
|
||||
pnasnet5large,pass,0
|
||||
|
||||
|
||||
|
||||
poolformer_m36,pass,0
|
||||
|
||||
|
||||
|
||||
regnety_002,pass,0
|
||||
|
||||
|
||||
|
||||
repvgg_a2,pass,0
|
||||
|
||||
|
||||
|
||||
res2net101_26w_4s,pass,0
|
||||
|
||||
|
||||
|
||||
res2net50_14w_8s,pass,0
|
||||
|
||||
|
||||
|
||||
res2next50,pass,0
|
||||
|
||||
|
||||
|
||||
resmlp_12_224,pass,0
|
||||
|
||||
|
||||
|
||||
resnest101e,pass,0
|
||||
|
||||
|
||||
|
||||
rexnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
sebotnet33ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
selecsls42b,pass,0
|
||||
|
||||
|
||||
|
||||
spnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
swin_base_patch4_window7_224,pass,0
|
||||
|
||||
|
||||
|
||||
swsl_resnext101_32x16d,pass,0
|
||||
|
||||
|
||||
|
||||
tf_efficientnet_b0,pass,0
|
||||
|
||||
|
||||
|
||||
tf_mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
tinynet_a,pass,0
|
||||
|
||||
|
||||
|
||||
tnt_s_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
twins_pcpvt_base,pass,0
|
||||
|
||||
|
||||
|
||||
visformer_small,pass,0
|
||||
|
||||
|
||||
|
||||
vit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
volo_d1_224,pass,0
|
||||
|
||||
|
||||
|
||||
xcit_large_24_p8_224,pass,0
|
||||
|
|
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,7
|
||||
|
||||
|
||||
|
||||
botnet26t_256,pass,6
|
||||
|
||||
|
||||
|
||||
cait_m36_384,eager_fail_to_run,0
|
||||
|
||||
|
||||
|
||||
coat_lite_mini,pass,6
|
||||
|
||||
|
||||
|
||||
convit_base,pass,7
|
||||
|
||||
|
||||
|
||||
convmixer_768_32,pass,5
|
||||
|
||||
|
||||
|
||||
convnext_base,pass,7
|
||||
|
||||
|
||||
|
||||
crossvit_9_240,pass,7
|
||||
|
||||
|
||||
|
||||
cspdarknet53,pass,7
|
||||
|
||||
|
||||
|
||||
deit_base_distilled_patch16_224,pass,7
|
||||
|
||||
|
||||
|
||||
dla102,pass,7
|
||||
|
||||
|
||||
|
||||
dm_nfnet_f0,pass,6
|
||||
|
||||
|
||||
|
||||
dpn107,pass,6
|
||||
|
||||
|
||||
|
||||
eca_botnext26ts_256,pass,7
|
||||
|
||||
|
||||
|
||||
eca_halonext26ts,pass,7
|
||||
|
||||
|
||||
|
||||
ese_vovnet19b_dw,pass,7
|
||||
|
||||
|
||||
|
||||
fbnetc_100,pass,7
|
||||
|
||||
|
||||
|
||||
fbnetv3_b,pass,6
|
||||
|
||||
|
||||
|
||||
gernet_l,pass,6
|
||||
|
||||
|
||||
|
||||
ghostnet_100,pass,6
|
||||
|
||||
|
||||
|
||||
gluon_inception_v3,pass,7
|
||||
|
||||
|
||||
|
||||
gmixer_24_224,pass,6
|
||||
|
||||
|
||||
|
||||
gmlp_s16_224,pass,7
|
||||
|
||||
|
||||
|
||||
hrnet_w18,pass,5
|
||||
|
||||
|
||||
|
||||
inception_v3,pass,6
|
||||
|
||||
|
||||
|
||||
jx_nest_base,pass,7
|
||||
|
||||
|
||||
|
||||
lcnet_050,fail_accuracy,6
|
||||
|
||||
|
||||
|
||||
levit_128,pass,7
|
||||
|
||||
|
||||
|
||||
mixer_b16_224,pass,7
|
||||
|
||||
|
||||
|
||||
mixnet_l,pass,6
|
||||
|
||||
|
||||
|
||||
mnasnet_100,pass,7
|
||||
|
||||
|
||||
|
||||
mobilenetv2_100,pass,7
|
||||
|
||||
|
||||
@ -146,100 +42,16 @@ nfnet_l0,pass,7
|
||||
|
||||
|
||||
|
||||
pit_b_224,pass,6
|
||||
|
||||
|
||||
|
||||
pnasnet5large,pass,5
|
||||
|
||||
|
||||
|
||||
poolformer_m36,pass,6
|
||||
|
||||
|
||||
|
||||
regnety_002,pass,6
|
||||
|
||||
|
||||
|
||||
repvgg_a2,pass,7
|
||||
|
||||
|
||||
|
||||
res2net101_26w_4s,pass,6
|
||||
|
||||
|
||||
|
||||
res2net50_14w_8s,pass,6
|
||||
|
||||
|
||||
|
||||
res2next50,pass,6
|
||||
|
||||
|
||||
|
||||
resmlp_12_224,pass,6
|
||||
|
||||
|
||||
|
||||
resnest101e,pass,6
|
||||
|
||||
|
||||
|
||||
rexnet_100,pass,7
|
||||
|
||||
|
||||
|
||||
sebotnet33ts_256,pass,6
|
||||
|
||||
|
||||
|
||||
selecsls42b,pass,6
|
||||
|
||||
|
||||
|
||||
spnasnet_100,pass,7
|
||||
|
||||
|
||||
|
||||
swin_base_patch4_window7_224,pass,7
|
||||
|
||||
|
||||
|
||||
swsl_resnext101_32x16d,pass,6
|
||||
|
||||
|
||||
|
||||
tf_efficientnet_b0,pass,6
|
||||
|
||||
|
||||
|
||||
tf_mixnet_l,pass,6
|
||||
|
||||
|
||||
|
||||
tinynet_a,pass,6
|
||||
|
||||
|
||||
|
||||
tnt_s_patch16_224,pass,7
|
||||
|
||||
|
||||
|
||||
twins_pcpvt_base,pass,7
|
||||
|
||||
|
||||
|
||||
visformer_small,pass,7
|
||||
|
||||
|
||||
|
||||
vit_base_patch16_224,pass,7
|
||||
|
||||
|
||||
|
||||
volo_d1_224,pass,7
|
||||
|
||||
|
||||
|
||||
xcit_large_24_p8_224,pass_due_to_skip,7
|
||||
|
|
@ -130,70 +130,6 @@ functorch_maml_omniglot,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Albert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bart,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,0
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,5
|
||||
|
||||
|
||||
|
||||
hf_Roberta_base,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,eager_fail_to_run,0
|
||||
|
||||
|
||||
|
||||
hf_T5_generate,pass,7
|
||||
|
||||
|
||||
|
||||
hf_T5_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Whisper,pass,0
|
||||
|
||||
|
||||
|
||||
hf_distil_whisper,pass,0
|
||||
|
||||
|
||||
|
||||
lennard_jones,pass,0
|
||||
|
||||
|
||||
@ -342,30 +278,6 @@ stable_diffusion_unet,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_efficientnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_regnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_resnest,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_vovnet,pass,0
|
||||
|
||||
|
||||
|
||||
torch_multimodal_clip,pass,0
|
||||
|
||||
|
||||
|
|
@ -78,62 +78,6 @@ functorch_maml_omniglot,pass,7
|
||||
|
||||
|
||||
|
||||
hf_Albert,pass,6
|
||||
|
||||
|
||||
|
||||
hf_Bart,pass,6
|
||||
|
||||
|
||||
|
||||
hf_Bert,pass,6
|
||||
|
||||
|
||||
|
||||
hf_Bert_large,pass,6
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,6
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,6
|
||||
|
||||
|
||||
|
||||
hf_GPT2,pass,8
|
||||
|
||||
|
||||
|
||||
hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,20
|
||||
|
||||
|
||||
|
||||
hf_Roberta_base,pass,6
|
||||
|
||||
|
||||
|
||||
hf_T5_base,eager_2nd_run_OOM,0
|
||||
|
||||
|
||||
|
||||
hf_T5_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Whisper,pass,6
|
||||
|
||||
|
||||
|
||||
hf_distil_whisper,model_fail_to_load,0
|
||||
|
||||
|
||||
|
||||
lennard_jones,pass,7
|
||||
|
||||
|
||||
@ -250,30 +194,6 @@ stable_diffusion_unet,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_efficientnet,fail_accuracy,7
|
||||
|
||||
|
||||
|
||||
timm_regnet,pass,7
|
||||
|
||||
|
||||
|
||||
timm_resnest,pass,6
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer,pass,6
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_vovnet,pass,6
|
||||
|
||||
|
||||
|
||||
torch_multimodal_clip,pass,7
|
||||
|
||||
|
||||
|
|
@ -6,58 +6,26 @@ AlbertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
AlbertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
BartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
BertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
BertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotForCausalLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
CamemBert,pass,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForMaskedLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistillGPT2,pass,0
|
||||
|
||||
|
||||
@ -66,10 +34,6 @@ ElectraForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
ElectraForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
GPT2ForSequenceClassification,pass,0
|
||||
|
||||
|
||||
@ -82,10 +46,6 @@ LayoutLMForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
LayoutLMForSequenceClassification,pass,0
|
||||
|
||||
|
||||
|
||||
M2M100ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -94,10 +54,6 @@ MBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
MT5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -106,18 +62,10 @@ MegatronBertForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MegatronBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,0
|
||||
|
||||
|
||||
@ -126,26 +74,14 @@ PLBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PLBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
T5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
|
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
botnet26t_256,pass,0
|
||||
|
||||
|
||||
|
||||
cait_m36_384,pass,0
|
||||
|
||||
|
||||
|
||||
coat_lite_mini,pass,0
|
||||
|
||||
|
||||
|
||||
convit_base,pass,0
|
||||
|
||||
|
||||
|
||||
convmixer_768_32,pass,0
|
||||
|
||||
|
||||
|
||||
convnext_base,pass,0
|
||||
|
||||
|
||||
|
||||
crossvit_9_240,pass,0
|
||||
|
||||
|
||||
|
||||
cspdarknet53,pass,0
|
||||
|
||||
|
||||
|
||||
deit_base_distilled_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
dla102,pass,0
|
||||
|
||||
|
||||
|
||||
dm_nfnet_f0,pass,0
|
||||
|
||||
|
||||
|
||||
dpn107,pass,0
|
||||
|
||||
|
||||
|
||||
eca_botnext26ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
eca_halonext26ts,pass,0
|
||||
|
||||
|
||||
|
||||
ese_vovnet19b_dw,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetc_100,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetv3_b,pass,0
|
||||
|
||||
|
||||
|
||||
gernet_l,pass,0
|
||||
|
||||
|
||||
|
||||
ghostnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
gluon_inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
gmixer_24_224,pass,0
|
||||
|
||||
|
||||
|
||||
gmlp_s16_224,pass,0
|
||||
|
||||
|
||||
|
||||
hrnet_w18,pass,0
|
||||
|
||||
|
||||
|
||||
inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
jx_nest_base,pass,0
|
||||
|
||||
|
||||
|
||||
lcnet_050,pass,0
|
||||
|
||||
|
||||
|
||||
levit_128,pass,0
|
||||
|
||||
|
||||
|
||||
mixer_b16_224,pass,0
|
||||
|
||||
|
||||
|
||||
mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
mnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
mobilenetv2_100,pass,0
|
||||
|
||||
|
||||
@ -146,100 +42,16 @@ nfnet_l0,pass,0
|
||||
|
||||
|
||||
|
||||
pit_b_224,pass,0
|
||||
|
||||
|
||||
|
||||
pnasnet5large,pass,0
|
||||
|
||||
|
||||
|
||||
poolformer_m36,pass,0
|
||||
|
||||
|
||||
|
||||
regnety_002,pass,0
|
||||
|
||||
|
||||
|
||||
repvgg_a2,pass,0
|
||||
|
||||
|
||||
|
||||
res2net101_26w_4s,pass,0
|
||||
|
||||
|
||||
|
||||
res2net50_14w_8s,pass,0
|
||||
|
||||
|
||||
|
||||
res2next50,pass,0
|
||||
|
||||
|
||||
|
||||
resmlp_12_224,pass,0
|
||||
|
||||
|
||||
|
||||
resnest101e,pass,0
|
||||
|
||||
|
||||
|
||||
rexnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
sebotnet33ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
selecsls42b,pass,0
|
||||
|
||||
|
||||
|
||||
spnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
swin_base_patch4_window7_224,pass,0
|
||||
|
||||
|
||||
|
||||
swsl_resnext101_32x16d,pass,0
|
||||
|
||||
|
||||
|
||||
tf_efficientnet_b0,pass,0
|
||||
|
||||
|
||||
|
||||
tf_mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
tinynet_a,pass,0
|
||||
|
||||
|
||||
|
||||
tnt_s_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
twins_pcpvt_base,pass,0
|
||||
|
||||
|
||||
|
||||
visformer_small,pass,0
|
||||
|
||||
|
||||
|
||||
vit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
volo_d1_224,pass,0
|
||||
|
||||
|
||||
|
||||
xcit_large_24_p8_224,pass,0
|
||||
|
|
@ -118,62 +118,6 @@ functorch_maml_omniglot,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Albert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bart,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,fail_accuracy,0
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Roberta_base,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,eager_fail_to_run,0
|
||||
|
||||
|
||||
|
||||
hf_T5_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Whisper,pass,0
|
||||
|
||||
|
||||
|
||||
hf_distil_whisper,pass,0
|
||||
|
||||
|
||||
|
||||
lennard_jones,pass,0
|
||||
|
||||
|
||||
@ -314,30 +258,6 @@ stable_diffusion_unet,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_efficientnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_regnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_resnest,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_vovnet,pass,0
|
||||
|
||||
|
||||
|
||||
torch_multimodal_clip,pass,0
|
||||
|
||||
|
||||
|
|
@ -114,58 +114,6 @@ functorch_maml_omniglot,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Albert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bart,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,0
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Roberta_base,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_distil_whisper,pass,0
|
||||
|
||||
|
||||
|
||||
lennard_jones,pass,0
|
||||
|
||||
|
||||
@ -278,38 +226,6 @@ stable_diffusion_unet,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_efficientdet,model_fail_to_load,0
|
||||
|
||||
|
||||
|
||||
timm_efficientnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_nfnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_regnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_resnest,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_vovnet,pass,0
|
||||
|
||||
|
||||
|
||||
torch_multimodal_clip,pass,0
|
||||
|
||||
|
||||
|
|
@ -6,58 +6,26 @@ AlbertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
AlbertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
BartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
BertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
BertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotForCausalLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
CamemBert,pass,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForMaskedLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistillGPT2,pass,0
|
||||
|
||||
|
||||
@ -66,10 +34,6 @@ ElectraForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
ElectraForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
GPT2ForSequenceClassification,pass,0
|
||||
|
||||
|
||||
@ -82,10 +46,6 @@ LayoutLMForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
LayoutLMForSequenceClassification,pass,0
|
||||
|
||||
|
||||
|
||||
M2M100ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -94,10 +54,6 @@ MBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
MT5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -106,18 +62,10 @@ MegatronBertForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MegatronBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,0
|
||||
|
||||
|
||||
@ -126,26 +74,14 @@ PLBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PLBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
T5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
|
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
botnet26t_256,pass,0
|
||||
|
||||
|
||||
|
||||
cait_m36_384,pass,0
|
||||
|
||||
|
||||
|
||||
coat_lite_mini,pass,0
|
||||
|
||||
|
||||
|
||||
convit_base,pass,0
|
||||
|
||||
|
||||
|
||||
convmixer_768_32,pass,0
|
||||
|
||||
|
||||
|
||||
convnext_base,pass,0
|
||||
|
||||
|
||||
|
||||
crossvit_9_240,pass,0
|
||||
|
||||
|
||||
|
||||
cspdarknet53,pass,0
|
||||
|
||||
|
||||
|
||||
deit_base_distilled_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
dla102,pass,0
|
||||
|
||||
|
||||
|
||||
dm_nfnet_f0,pass,0
|
||||
|
||||
|
||||
|
||||
dpn107,pass,0
|
||||
|
||||
|
||||
|
||||
eca_botnext26ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
eca_halonext26ts,pass,0
|
||||
|
||||
|
||||
|
||||
ese_vovnet19b_dw,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetc_100,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetv3_b,pass,0
|
||||
|
||||
|
||||
|
||||
gernet_l,pass,0
|
||||
|
||||
|
||||
|
||||
ghostnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
gluon_inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
gmixer_24_224,pass,0
|
||||
|
||||
|
||||
|
||||
gmlp_s16_224,pass,0
|
||||
|
||||
|
||||
|
||||
hrnet_w18,pass,0
|
||||
|
||||
|
||||
|
||||
inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
jx_nest_base,pass,0
|
||||
|
||||
|
||||
|
||||
lcnet_050,pass,0
|
||||
|
||||
|
||||
|
||||
levit_128,pass,0
|
||||
|
||||
|
||||
|
||||
mixer_b16_224,pass,0
|
||||
|
||||
|
||||
|
||||
mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
mnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
mobilenetv2_100,pass,0
|
||||
|
||||
|
||||
@ -146,100 +42,16 @@ nfnet_l0,pass,0
|
||||
|
||||
|
||||
|
||||
pit_b_224,pass,0
|
||||
|
||||
|
||||
|
||||
pnasnet5large,pass,0
|
||||
|
||||
|
||||
|
||||
poolformer_m36,pass,0
|
||||
|
||||
|
||||
|
||||
regnety_002,pass,0
|
||||
|
||||
|
||||
|
||||
repvgg_a2,pass,0
|
||||
|
||||
|
||||
|
||||
res2net101_26w_4s,pass,0
|
||||
|
||||
|
||||
|
||||
res2net50_14w_8s,pass,0
|
||||
|
||||
|
||||
|
||||
res2next50,pass,0
|
||||
|
||||
|
||||
|
||||
resmlp_12_224,pass,0
|
||||
|
||||
|
||||
|
||||
resnest101e,pass,0
|
||||
|
||||
|
||||
|
||||
rexnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
sebotnet33ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
selecsls42b,pass,0
|
||||
|
||||
|
||||
|
||||
spnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
swin_base_patch4_window7_224,pass,0
|
||||
|
||||
|
||||
|
||||
swsl_resnext101_32x16d,pass,0
|
||||
|
||||
|
||||
|
||||
tf_efficientnet_b0,pass,0
|
||||
|
||||
|
||||
|
||||
tf_mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
tinynet_a,pass,0
|
||||
|
||||
|
||||
|
||||
tnt_s_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
twins_pcpvt_base,pass,0
|
||||
|
||||
|
||||
|
||||
visformer_small,pass,0
|
||||
|
||||
|
||||
|
||||
vit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
volo_d1_224,pass,0
|
||||
|
||||
|
||||
|
||||
xcit_large_24_p8_224,pass,0
|
||||
|
|
@ -114,58 +114,6 @@ functorch_maml_omniglot,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Albert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bart,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,0
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Roberta_base,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_distil_whisper,pass,0
|
||||
|
||||
|
||||
|
||||
lennard_jones,pass,0
|
||||
|
||||
|
||||
@ -278,38 +226,6 @@ stable_diffusion_unet,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_efficientdet,model_fail_to_load,0
|
||||
|
||||
|
||||
|
||||
timm_efficientnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_nfnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_regnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_resnest,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_vovnet,pass,0
|
||||
|
||||
|
||||
|
||||
torch_multimodal_clip,pass,0
|
||||
|
||||
|
||||
|
|
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
AlbertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
AllenaiLongformerBase,pass,4
|
||||
|
||||
|
||||
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
BertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
BertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotForCausalLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
CamemBert,pass,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForMaskedLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistillGPT2,pass,2
|
||||
|
||||
|
||||
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
ElectraForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
GPT2ForSequenceClassification,pass,0
|
||||
|
||||
|
||||
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
LayoutLMForSequenceClassification,pass,0
|
||||
|
||||
|
||||
|
||||
M2M100ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
MT5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MegatronBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,0
|
||||
|
||||
|
||||
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PLBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
T5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
|
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
botnet26t_256,pass,0
|
||||
|
||||
|
||||
|
||||
cait_m36_384,pass,0
|
||||
|
||||
|
||||
|
||||
coat_lite_mini,pass,0
|
||||
|
||||
|
||||
|
||||
convit_base,pass,0
|
||||
|
||||
|
||||
|
||||
convmixer_768_32,pass,0
|
||||
|
||||
|
||||
|
||||
convnext_base,pass,0
|
||||
|
||||
|
||||
|
||||
crossvit_9_240,pass,0
|
||||
|
||||
|
||||
|
||||
cspdarknet53,pass,0
|
||||
|
||||
|
||||
|
||||
deit_base_distilled_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
dla102,timeout,0
|
||||
|
||||
|
||||
|
||||
dm_nfnet_f0,pass,0
|
||||
|
||||
|
||||
|
||||
dpn107,pass,0
|
||||
|
||||
|
||||
|
||||
eca_botnext26ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
eca_halonext26ts,pass,0
|
||||
|
||||
|
||||
|
||||
ese_vovnet19b_dw,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetc_100,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetv3_b,pass,0
|
||||
|
||||
|
||||
|
||||
gernet_l,pass,0
|
||||
|
||||
|
||||
|
||||
ghostnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
gluon_inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
gmixer_24_224,pass,0
|
||||
|
||||
|
||||
|
||||
gmlp_s16_224,pass,0
|
||||
|
||||
|
||||
|
||||
hrnet_w18,pass,0
|
||||
|
||||
|
||||
|
||||
inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
jx_nest_base,pass,0
|
||||
|
||||
|
||||
|
||||
lcnet_050,pass,0
|
||||
|
||||
|
||||
|
||||
levit_128,pass,0
|
||||
|
||||
|
||||
|
||||
mixer_b16_224,pass,0
|
||||
|
||||
|
||||
|
||||
mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
mnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
mobilenetv2_100,pass,0
|
||||
|
||||
|
||||
@ -146,100 +42,16 @@ nfnet_l0,pass,0
|
||||
|
||||
|
||||
|
||||
pit_b_224,pass,0
|
||||
|
||||
|
||||
|
||||
pnasnet5large,pass,0
|
||||
|
||||
|
||||
|
||||
poolformer_m36,pass,0
|
||||
|
||||
|
||||
|
||||
regnety_002,pass,0
|
||||
|
||||
|
||||
|
||||
repvgg_a2,pass,0
|
||||
|
||||
|
||||
|
||||
res2net101_26w_4s,pass,0
|
||||
|
||||
|
||||
|
||||
res2net50_14w_8s,pass,0
|
||||
|
||||
|
||||
|
||||
res2next50,pass,0
|
||||
|
||||
|
||||
|
||||
resmlp_12_224,pass,0
|
||||
|
||||
|
||||
|
||||
resnest101e,pass,0
|
||||
|
||||
|
||||
|
||||
rexnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
sebotnet33ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
selecsls42b,pass,0
|
||||
|
||||
|
||||
|
||||
spnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
swin_base_patch4_window7_224,pass,0
|
||||
|
||||
|
||||
|
||||
swsl_resnext101_32x16d,pass,0
|
||||
|
||||
|
||||
|
||||
tf_efficientnet_b0,pass,0
|
||||
|
||||
|
||||
|
||||
tf_mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
tinynet_a,pass,0
|
||||
|
||||
|
||||
|
||||
tnt_s_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
twins_pcpvt_base,pass,0
|
||||
|
||||
|
||||
|
||||
visformer_small,pass,0
|
||||
|
||||
|
||||
|
||||
vit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
volo_d1_224,pass,0
|
||||
|
||||
|
||||
|
||||
xcit_large_24_p8_224,pass,0
|
||||
|
|
@ -122,66 +122,6 @@ functorch_maml_omniglot,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Albert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bart,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,27
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Longformer,pass,4
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,5
|
||||
|
||||
|
||||
|
||||
hf_Roberta_base,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_distil_whisper,pass,0
|
||||
|
||||
|
||||
|
||||
lennard_jones,pass,0
|
||||
|
||||
|
||||
@ -302,38 +242,6 @@ stable_diffusion_unet,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_efficientdet,model_fail_to_load,0
|
||||
|
||||
|
||||
|
||||
timm_efficientnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_nfnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_regnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_resnest,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_vovnet,pass,0
|
||||
|
||||
|
||||
|
||||
torch_multimodal_clip,pass,0
|
||||
|
||||
|
||||
|
|
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
AlbertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
AllenaiLongformerBase,pass,4
|
||||
|
||||
|
||||
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
BertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
BertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotForCausalLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
CamemBert,pass,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForMaskedLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistillGPT2,pass,2
|
||||
|
||||
|
||||
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
ElectraForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
GPT2ForSequenceClassification,pass,0
|
||||
|
||||
|
||||
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
LayoutLMForSequenceClassification,pass,0
|
||||
|
||||
|
||||
|
||||
M2M100ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
MT5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MegatronBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,0
|
||||
|
||||
|
||||
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PLBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
T5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
|
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
botnet26t_256,pass,0
|
||||
|
||||
|
||||
|
||||
cait_m36_384,pass,0
|
||||
|
||||
|
||||
|
||||
coat_lite_mini,pass,0
|
||||
|
||||
|
||||
|
||||
convit_base,pass,0
|
||||
|
||||
|
||||
|
||||
convmixer_768_32,pass,0
|
||||
|
||||
|
||||
|
||||
convnext_base,pass,0
|
||||
|
||||
|
||||
|
||||
crossvit_9_240,pass,0
|
||||
|
||||
|
||||
|
||||
cspdarknet53,pass,0
|
||||
|
||||
|
||||
|
||||
deit_base_distilled_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
dla102,timeout,0
|
||||
|
||||
|
||||
|
||||
dm_nfnet_f0,pass,0
|
||||
|
||||
|
||||
|
||||
dpn107,pass,0
|
||||
|
||||
|
||||
|
||||
eca_botnext26ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
eca_halonext26ts,pass,0
|
||||
|
||||
|
||||
|
||||
ese_vovnet19b_dw,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetc_100,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetv3_b,pass,0
|
||||
|
||||
|
||||
|
||||
gernet_l,pass,0
|
||||
|
||||
|
||||
|
||||
ghostnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
gluon_inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
gmixer_24_224,pass,0
|
||||
|
||||
|
||||
|
||||
gmlp_s16_224,pass,0
|
||||
|
||||
|
||||
|
||||
hrnet_w18,pass,0
|
||||
|
||||
|
||||
|
||||
inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
jx_nest_base,pass,0
|
||||
|
||||
|
||||
|
||||
lcnet_050,pass,0
|
||||
|
||||
|
||||
|
||||
levit_128,pass,0
|
||||
|
||||
|
||||
|
||||
mixer_b16_224,pass,0
|
||||
|
||||
|
||||
|
||||
mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
mnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
mobilenetv2_100,pass,0
|
||||
|
||||
|
||||
@ -146,100 +42,16 @@ nfnet_l0,pass,0
|
||||
|
||||
|
||||
|
||||
pit_b_224,pass,0
|
||||
|
||||
|
||||
|
||||
pnasnet5large,pass,0
|
||||
|
||||
|
||||
|
||||
poolformer_m36,pass,0
|
||||
|
||||
|
||||
|
||||
regnety_002,pass,0
|
||||
|
||||
|
||||
|
||||
repvgg_a2,pass,0
|
||||
|
||||
|
||||
|
||||
res2net101_26w_4s,pass,0
|
||||
|
||||
|
||||
|
||||
res2net50_14w_8s,pass,0
|
||||
|
||||
|
||||
|
||||
res2next50,pass,0
|
||||
|
||||
|
||||
|
||||
resmlp_12_224,pass,0
|
||||
|
||||
|
||||
|
||||
resnest101e,pass,0
|
||||
|
||||
|
||||
|
||||
rexnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
sebotnet33ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
selecsls42b,pass,0
|
||||
|
||||
|
||||
|
||||
spnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
swin_base_patch4_window7_224,pass,0
|
||||
|
||||
|
||||
|
||||
swsl_resnext101_32x16d,pass,0
|
||||
|
||||
|
||||
|
||||
tf_efficientnet_b0,pass,0
|
||||
|
||||
|
||||
|
||||
tf_mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
tinynet_a,pass,0
|
||||
|
||||
|
||||
|
||||
tnt_s_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
twins_pcpvt_base,pass,0
|
||||
|
||||
|
||||
|
||||
visformer_small,pass,0
|
||||
|
||||
|
||||
|
||||
vit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
volo_d1_224,pass,0
|
||||
|
||||
|
||||
|
||||
xcit_large_24_p8_224,pass,0
|
||||
|
|
@ -122,66 +122,6 @@ functorch_maml_omniglot,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Albert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bart,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,27
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Longformer,pass,4
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,5
|
||||
|
||||
|
||||
|
||||
hf_Roberta_base,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_distil_whisper,pass,0
|
||||
|
||||
|
||||
|
||||
lennard_jones,pass,0
|
||||
|
||||
|
||||
@ -302,38 +242,6 @@ stable_diffusion_unet,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_efficientdet,model_fail_to_load,0
|
||||
|
||||
|
||||
|
||||
timm_efficientnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_nfnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_regnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_resnest,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_vovnet,pass,0
|
||||
|
||||
|
||||
|
||||
torch_multimodal_clip,pass,0
|
||||
|
||||
|
||||
|
|
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
AlbertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
AllenaiLongformerBase,pass,4
|
||||
|
||||
|
||||
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
BertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
BertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotForCausalLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
CamemBert,pass,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForMaskedLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistillGPT2,pass,2
|
||||
|
||||
|
||||
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
ElectraForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
GPT2ForSequenceClassification,pass,0
|
||||
|
||||
|
||||
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
LayoutLMForSequenceClassification,pass,0
|
||||
|
||||
|
||||
|
||||
M2M100ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
MT5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MegatronBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,0
|
||||
|
||||
|
||||
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PLBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
T5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
|
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
botnet26t_256,pass,0
|
||||
|
||||
|
||||
|
||||
cait_m36_384,pass,0
|
||||
|
||||
|
||||
|
||||
coat_lite_mini,pass,0
|
||||
|
||||
|
||||
|
||||
convit_base,pass,0
|
||||
|
||||
|
||||
|
||||
convmixer_768_32,pass,0
|
||||
|
||||
|
||||
|
||||
convnext_base,pass,0
|
||||
|
||||
|
||||
|
||||
crossvit_9_240,pass,0
|
||||
|
||||
|
||||
|
||||
cspdarknet53,pass,0
|
||||
|
||||
|
||||
|
||||
deit_base_distilled_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
dla102,pass,0
|
||||
|
||||
|
||||
|
||||
dm_nfnet_f0,pass,0
|
||||
|
||||
|
||||
|
||||
dpn107,pass,0
|
||||
|
||||
|
||||
|
||||
eca_botnext26ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
eca_halonext26ts,pass,0
|
||||
|
||||
|
||||
|
||||
ese_vovnet19b_dw,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetc_100,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetv3_b,pass,0
|
||||
|
||||
|
||||
|
||||
gernet_l,pass,0
|
||||
|
||||
|
||||
|
||||
ghostnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
gluon_inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
gmixer_24_224,pass,0
|
||||
|
||||
|
||||
|
||||
gmlp_s16_224,pass,0
|
||||
|
||||
|
||||
|
||||
hrnet_w18,pass,0
|
||||
|
||||
|
||||
|
||||
inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
jx_nest_base,pass,0
|
||||
|
||||
|
||||
|
||||
lcnet_050,pass,0
|
||||
|
||||
|
||||
|
||||
levit_128,pass,0
|
||||
|
||||
|
||||
|
||||
mixer_b16_224,pass,0
|
||||
|
||||
|
||||
|
||||
mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
mnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
mobilenetv2_100,pass,0
|
||||
|
||||
|
||||
@ -146,100 +42,16 @@ nfnet_l0,pass,0
|
||||
|
||||
|
||||
|
||||
pit_b_224,pass,0
|
||||
|
||||
|
||||
|
||||
pnasnet5large,pass,0
|
||||
|
||||
|
||||
|
||||
poolformer_m36,pass,0
|
||||
|
||||
|
||||
|
||||
regnety_002,pass,0
|
||||
|
||||
|
||||
|
||||
repvgg_a2,pass,0
|
||||
|
||||
|
||||
|
||||
res2net101_26w_4s,pass,0
|
||||
|
||||
|
||||
|
||||
res2net50_14w_8s,pass,0
|
||||
|
||||
|
||||
|
||||
res2next50,pass,0
|
||||
|
||||
|
||||
|
||||
resmlp_12_224,pass,0
|
||||
|
||||
|
||||
|
||||
resnest101e,pass,0
|
||||
|
||||
|
||||
|
||||
rexnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
sebotnet33ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
selecsls42b,pass,0
|
||||
|
||||
|
||||
|
||||
spnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
swin_base_patch4_window7_224,pass,0
|
||||
|
||||
|
||||
|
||||
swsl_resnext101_32x16d,pass,0
|
||||
|
||||
|
||||
|
||||
tf_efficientnet_b0,pass,0
|
||||
|
||||
|
||||
|
||||
tf_mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
tinynet_a,pass,0
|
||||
|
||||
|
||||
|
||||
tnt_s_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
twins_pcpvt_base,pass,0
|
||||
|
||||
|
||||
|
||||
visformer_small,pass,0
|
||||
|
||||
|
||||
|
||||
vit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
volo_d1_224,pass,0
|
||||
|
||||
|
||||
|
||||
xcit_large_24_p8_224,pass,0
|
||||
|
|
@ -122,66 +122,6 @@ functorch_maml_omniglot,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Albert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bart,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,27
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2,pass,0
|
||||
|
||||
|
||||
|
||||
hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Longformer,pass,4
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,5
|
||||
|
||||
|
||||
|
||||
hf_Roberta_base,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_distil_whisper,pass,0
|
||||
|
||||
|
||||
|
||||
lennard_jones,pass,0
|
||||
|
||||
|
||||
@ -302,38 +242,6 @@ stable_diffusion_unet,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_efficientdet,model_fail_to_load,0
|
||||
|
||||
|
||||
|
||||
timm_efficientnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_nfnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_regnet,pass,0
|
||||
|
||||
|
||||
|
||||
timm_resnest,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer,pass,0
|
||||
|
||||
|
||||
|
||||
timm_vision_transformer_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
timm_vovnet,pass,0
|
||||
|
||||
|
||||
|
||||
torch_multimodal_clip,pass,0
|
||||
|
||||
|
||||
|
|
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
AlbertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
AllenaiLongformerBase,pass,4
|
||||
|
||||
|
||||
@ -18,50 +14,22 @@ BartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
BertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
BertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotForCausalLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
CamemBert,pass,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForMaskedLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
DistilBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistillGPT2,pass,2
|
||||
|
||||
|
||||
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
ElectraForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
GPT2ForSequenceClassification,pass,0
|
||||
|
||||
|
||||
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
LayoutLMForSequenceClassification,pass,0
|
||||
|
||||
|
||||
|
||||
M2M100ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -98,10 +58,6 @@ MBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
MT5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
MegatronBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForMaskedLM,pass,0
|
||||
|
||||
|
||||
|
||||
MobileBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,0
|
||||
|
||||
|
||||
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PLBartForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PegasusForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
RobertaForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
T5ForConditionalGeneration,pass,0
|
||||
|
||||
|
||||
|
|
@ -6,10 +6,6 @@ AlbertForMaskedLM,pass,4
|
||||
|
||||
|
||||
|
||||
AlbertForQuestionAnswering,pass,5
|
||||
|
||||
|
||||
|
||||
AllenaiLongformerBase,pass,9
|
||||
|
||||
|
||||
@ -18,50 +14,22 @@ BartForCausalLM,pass,6
|
||||
|
||||
|
||||
|
||||
BartForConditionalGeneration,pass,8
|
||||
|
||||
|
||||
|
||||
BertForMaskedLM,pass,5
|
||||
|
||||
|
||||
|
||||
BertForQuestionAnswering,pass,5
|
||||
|
||||
|
||||
|
||||
BlenderbotForCausalLM,eager_fail_to_run,0
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForCausalLM,pass,6
|
||||
|
||||
|
||||
|
||||
BlenderbotSmallForConditionalGeneration,pass,8
|
||||
|
||||
|
||||
|
||||
CamemBert,pass,5
|
||||
|
||||
|
||||
|
||||
DebertaV2ForMaskedLM,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
DebertaV2ForQuestionAnswering,eager_1st_run_OOM,0
|
||||
|
||||
|
||||
|
||||
DistilBertForMaskedLM,pass,5
|
||||
|
||||
|
||||
|
||||
DistilBertForQuestionAnswering,pass,5
|
||||
|
||||
|
||||
|
||||
DistillGPT2,pass,7
|
||||
|
||||
|
||||
@ -70,10 +38,6 @@ ElectraForCausalLM,pass,4
|
||||
|
||||
|
||||
|
||||
ElectraForQuestionAnswering,pass,5
|
||||
|
||||
|
||||
|
||||
GPT2ForSequenceClassification,pass,6
|
||||
|
||||
|
||||
@ -86,10 +50,6 @@ LayoutLMForMaskedLM,pass,5
|
||||
|
||||
|
||||
|
||||
LayoutLMForSequenceClassification,pass,6
|
||||
|
||||
|
||||
|
||||
M2M100ForConditionalGeneration,pass,4
|
||||
|
||||
|
||||
@ -98,10 +58,6 @@ MBartForCausalLM,pass,6
|
||||
|
||||
|
||||
|
||||
MBartForConditionalGeneration,pass,8
|
||||
|
||||
|
||||
|
||||
MT5ForConditionalGeneration,pass,5
|
||||
|
||||
|
||||
@ -110,18 +66,10 @@ MegatronBertForCausalLM,pass,5
|
||||
|
||||
|
||||
|
||||
MegatronBertForQuestionAnswering,pass,5
|
||||
|
||||
|
||||
|
||||
MobileBertForMaskedLM,pass,3
|
||||
|
||||
|
||||
|
||||
MobileBertForQuestionAnswering,pass,3
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,8
|
||||
|
||||
|
||||
@ -130,26 +78,14 @@ PLBartForCausalLM,pass,6
|
||||
|
||||
|
||||
|
||||
PLBartForConditionalGeneration,pass,8
|
||||
|
||||
|
||||
|
||||
PegasusForCausalLM,pass,6
|
||||
|
||||
|
||||
|
||||
PegasusForConditionalGeneration,pass,7
|
||||
|
||||
|
||||
|
||||
RobertaForCausalLM,pass,5
|
||||
|
||||
|
||||
|
||||
RobertaForQuestionAnswering,pass,5
|
||||
|
||||
|
||||
|
||||
T5ForConditionalGeneration,pass,5
|
||||
|
||||
|
||||
|
|
@ -10,126 +10,22 @@ beit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
botnet26t_256,pass,0
|
||||
|
||||
|
||||
|
||||
cait_m36_384,pass,0
|
||||
|
||||
|
||||
|
||||
coat_lite_mini,pass,0
|
||||
|
||||
|
||||
|
||||
convit_base,pass,0
|
||||
|
||||
|
||||
|
||||
convmixer_768_32,pass,0
|
||||
|
||||
|
||||
|
||||
convnext_base,pass,0
|
||||
|
||||
|
||||
|
||||
crossvit_9_240,pass,0
|
||||
|
||||
|
||||
|
||||
cspdarknet53,pass,0
|
||||
|
||||
|
||||
|
||||
deit_base_distilled_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
dla102,pass,0
|
||||
|
||||
|
||||
|
||||
dm_nfnet_f0,pass,0
|
||||
|
||||
|
||||
|
||||
dpn107,pass,0
|
||||
|
||||
|
||||
|
||||
eca_botnext26ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
eca_halonext26ts,pass,0
|
||||
|
||||
|
||||
|
||||
ese_vovnet19b_dw,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetc_100,pass,0
|
||||
|
||||
|
||||
|
||||
fbnetv3_b,pass,0
|
||||
|
||||
|
||||
|
||||
gernet_l,pass,0
|
||||
|
||||
|
||||
|
||||
ghostnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
gluon_inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
gmixer_24_224,pass,0
|
||||
|
||||
|
||||
|
||||
gmlp_s16_224,pass,0
|
||||
|
||||
|
||||
|
||||
hrnet_w18,pass,0
|
||||
|
||||
|
||||
|
||||
inception_v3,pass,0
|
||||
|
||||
|
||||
|
||||
jx_nest_base,pass,0
|
||||
|
||||
|
||||
|
||||
lcnet_050,pass,0
|
||||
|
||||
|
||||
|
||||
levit_128,pass,0
|
||||
|
||||
|
||||
|
||||
mixer_b16_224,pass,0
|
||||
|
||||
|
||||
|
||||
mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
mnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
mobilenetv2_100,pass,0
|
||||
|
||||
|
||||
@ -146,100 +42,16 @@ nfnet_l0,pass,0
|
||||
|
||||
|
||||
|
||||
pit_b_224,pass,0
|
||||
|
||||
|
||||
|
||||
pnasnet5large,pass,0
|
||||
|
||||
|
||||
|
||||
poolformer_m36,pass,0
|
||||
|
||||
|
||||
|
||||
regnety_002,pass,0
|
||||
|
||||
|
||||
|
||||
repvgg_a2,pass,0
|
||||
|
||||
|
||||
|
||||
res2net101_26w_4s,pass,0
|
||||
|
||||
|
||||
|
||||
res2net50_14w_8s,pass,0
|
||||
|
||||
|
||||
|
||||
res2next50,pass,0
|
||||
|
||||
|
||||
|
||||
resmlp_12_224,pass,0
|
||||
|
||||
|
||||
|
||||
resnest101e,pass,0
|
||||
|
||||
|
||||
|
||||
rexnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
sebotnet33ts_256,pass,0
|
||||
|
||||
|
||||
|
||||
selecsls42b,pass,0
|
||||
|
||||
|
||||
|
||||
spnasnet_100,pass,0
|
||||
|
||||
|
||||
|
||||
swin_base_patch4_window7_224,pass,0
|
||||
|
||||
|
||||
|
||||
swsl_resnext101_32x16d,pass,0
|
||||
|
||||
|
||||
|
||||
tf_efficientnet_b0,pass,0
|
||||
|
||||
|
||||
|
||||
tf_mixnet_l,pass,0
|
||||
|
||||
|
||||
|
||||
tinynet_a,pass,0
|
||||
|
||||
|
||||
|
||||
tnt_s_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
twins_pcpvt_base,pass,0
|
||||
|
||||
|
||||
|
||||
visformer_small,pass,0
|
||||
|
||||
|
||||
|
||||
vit_base_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
volo_d1_224,pass,0
|
||||
|
||||
|
||||
|
||||
xcit_large_24_p8_224,pass,0
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user