Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-21 21:49:24 +08:00)

Compare commits: copilot/co ... ciflow/b20 (161 Commits)
Commit SHA1s:

24a1c536dd
138eba943f
57ba575242
ceb11a584d
33adb276fe
e939651972
3255e7872b
c4f6619330
f18041cca8
35e51893bd
1f43d17ce6
032bed95cd
d14cbb4476
f510d0dbc0
beb6b62e8c
4740ce7787
ad67170c8b
fdab48a7c1
a0948d4d23
0bbdd6b8db
24520b8386
c79dfdc655
e595136187
aaac8cb0f5
0f0b4bf029
b8194268a6
f02e3947f6
9095a9dfae
d9f94e0d7d
23417ae50f
e4d6c56ffb
017d2985f3
c6a8db0b9a
de09bab4b6
c137e222d4
cf3a787bbc
de3da77cf7
543ddbf44c
e9f4999985
29b029648e
a25a649e70
69c33898fa
1b397420f2
fe80f03726
e50dc40d28
2e22b1a61e
616c6bdf8f
c18ddfc572
86ebce1766
8cb2fb44f2
ab65498d71
06d324365c
6c9c6e0936
2bcd892c86
75e2a9fae3
a16fd6b488
382b0150de
a664b299ac
9c12651417
08c97b4a1f
fae74cd52f
7a65770013
e4454947e2
3806e9767b
b08d8c2e50
ca5b7f8ded
9a71d96256
0d4c2b71e8
d659bbde62
58879bfafa
a032510db3
39e0a832c9
dd3b48e85d
cff1b20771
da8517fa63
45afaf08a1
080365b7d8
2928c5c572
630520b346
1dc9a05d03
bfcdbd0a97
faff826a46
85c5433d38
935ccdbe75
3af2f0c12a
fdb774409d
7ca5147563
9994d19747
3db29f6b89
940eac3252
da05532e8d
7812f1137e
f9206ce73d
9ff76178e5
47a9ff89fa
97bea14483
34953e12c8
5b957bad53
04e5df6d51
d8feaf66d6
d05f7557ba
7dd9d8886d
8ffed9c978
0c5b74e79a
e730416e9e
da02a8b5a2
0afa79a745
dddd78b7ca
40ae06780c
2f8d95f7dd
8cf68c4b7b
a9c114d391
b3973bb071
d70062c20e
290d8d9867
9b4f439214
5186a062c2
da7e8123ec
09d19cd2c0
7e3e502adc
6ceb56b2a9
8c8cc40549
1ac9edfcbb
2833d3649a
409034c4a1
ac2dfc6467
0f5b860813
290e84e8eb
ae3220345b
412b826445
f3d99ebd3f
ebba3522a2
489125cb6c
2400887d78
5eb66528c6
a26d605ff1
896f4b4927
86f6748499
6ec6b6e652
5990e4bd99
f37387637e
386f4ff999
ccded250b4
a22a028f98
a905f925ad
e6ec9a9eeb
1b2d2a5a95
4cbb6388a7
3a9e7bdc22
f6631ee0c0
7c75b9da11
699d88dc18
79edec4de5
86bb7b7ec1
888941a9b0
e186f7f22a
ca3c6dfc3d
478ae1c72c
cd4ede03b4
e9af696dfe
06f6bc6c7a
@@ -39,9 +39,13 @@ case ${DOCKER_TAG_PREFIX} in
DOCKER_GPU_BUILD_ARG=""
;;
rocm*)
# we want the patch version of 7.0 instead
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
# we want the patch version of 6.4 instead
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4"
fi
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete

@@ -75,9 +75,13 @@ case ${image} in
DOCKERFILE_SUFFIX="_cuda_aarch64"
;;
manylinux2_28-builder:rocm*)
# we want the patch version of 7.0 instead
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
fi
# we want the patch version of 6.4 instead
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4"
fi
TARGET=rocm_final
MANY_LINUX_VERSION="2_28"

@@ -57,8 +57,8 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
logger.info("Successfully cloned %s", target)
return r, commit

except GitCommandError as e:
logger.error("Git operation failed: %s", e)
except GitCommandError:
logger.exception("Git operation failed")
raise

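The change above swaps `logger.error("Git operation failed: %s", e)` for `logger.exception("Git operation failed")`, which logs the same message at ERROR level but also attaches the active traceback automatically, so the exception no longer needs to be bound to a name. A minimal, self-contained sketch of the difference, using a plain `RuntimeError` and an illustrative `risky_clone()` helper rather than GitPython's `GitCommandError`:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("clone_demo")


def risky_clone() -> None:
    # Illustrative stand-in for the git clone step; always fails here.
    raise RuntimeError("clone failed")


try:
    risky_clone()
except RuntimeError:
    # Old pattern: logger.error("Git operation failed: %s", e)  -> message only.
    # New pattern: logger.exception(...) logs at ERROR level *and* appends the
    # current traceback, which is why "as e" is no longer needed.
    logger.exception("Git operation failed")
```

Note that `logger.exception()` only produces a useful traceback when called from inside an active `except` block, which is why the call stays in the handler before the re-raise.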
.flake8 (4 changed lines)

@@ -13,10 +13,6 @@ ignore =
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910
# these ignores are from flake8-comprehensions; please fix!
C407,
# these ignores are from flake8-logging-format; please fix!
G100,G101,G200
# these ignores are from flake8-simplify. please fix or ignore with commented reason
SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
# SIM104 is already covered by pyupgrade ruff

.github/ci_commit_pins/audio.txt (2 changed lines, vendored)

@@ -1 +1 @@
1b013f5b5a87a1882eb143c26d79d091150d6a37
69bbe7363897764f9e758d851cd0340147d27f94

.github/labeler.yml (29 changed lines, vendored)

@@ -133,3 +133,32 @@

"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt

"ciflow/b200":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm

"ciflow/h100":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm

"ciflow/rocm":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm

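The new labeler entries map changed file paths to `ciflow/*` trigger labels through glob patterns. As a rough illustration only, the sketch below uses Python's `fnmatch` as an approximation of the minimatch-style globs the labeler action actually evaluates; the `LABEL_GLOBS` dict and `labels_for()` helper are hypothetical names for this example.

```python
import fnmatch

# Label -> path globs, taken from the "ciflow/b200" entry above.
LABEL_GLOBS = {
    "ciflow/b200": [
        "test/test_matmul_cuda.py",
        "test/test_scaled_matmul_cuda.py",
        "test/inductor/test_fp8.py",
        "aten/src/ATen/native/cuda/Blas.cpp",
        "torch/**/*cublas*",
        "torch/_inductor/kernel/mm.py",
        "test/inductor/test_max_autotune.py",
        "third_party/fbgemm",
    ],
}


def labels_for(changed_path: str) -> list[str]:
    """Return the labels whose globs match a changed file path."""
    # fnmatch lets "*" match "/" as well, so this only approximates the
    # labeler action's glob semantics; it is close enough for a demo.
    return [
        label
        for label, globs in LABEL_GLOBS.items()
        if any(fnmatch.fnmatch(changed_path, pattern) for pattern in globs)
    ]


print(labels_for("aten/src/ATen/native/cuda/Blas.cpp"))  # ['ciflow/b200']
print(labels_for("torch/utils/data/dataset.py"))         # []
```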
.github/scripts/generate_binary_build_matrix.py (30 changed lines, vendored)

@@ -79,21 +79,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
),
"12.9": (
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | "
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | "
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | "
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
),
"13.0": (
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "

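The substantive change in this hunk, and in the two generated workflow files that follow, is dropping the `platform_machine == 'x86_64'` clause from each CUDA 12.9 requirement's environment marker, so these dependencies are no longer skipped on aarch64 Linux. A small sketch, assuming the third-party `packaging` library, of how the old and new markers evaluate for an aarch64 host:

```python
from packaging.markers import Marker  # pip install packaging

old = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
new = Marker("platform_system == 'Linux'")

# Override the detected environment to simulate an aarch64 Linux host.
aarch64_env = {"platform_system": "Linux", "platform_machine": "aarch64"}

print(old.evaluate(aarch64_env))  # False: dependency would be skipped
print(new.evaluate(aarch64_env))  # True: dependency is now pulled in
```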
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (14 changed lines, generated, vendored)

@@ -224,7 +224,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

@@ -473,7 +473,7 @@
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

@@ -722,7 +722,7 @@
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

@@ -971,7 +971,7 @@
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

@@ -1220,7 +1220,7 @@
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

@@ -1469,7 +1469,7 @@
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

@@ -1718,7 +1718,7 @@
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/generated-linux-binary-manywheel-nightly.yml (14 changed lines, generated, vendored)

@@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing

@@ -925,7 +925,7 @@
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing

@@ -1591,7 +1591,7 @@
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing

@@ -2257,7 +2257,7 @@
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing

@@ -2923,7 +2923,7 @@
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing

@@ -3589,7 +3589,7 @@
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-test: # Testing

@@ -4255,7 +4255,7 @@
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-test: # Testing

.gitignore (1 changed line, vendored)

@@ -374,6 +374,7 @@ third_party/ruy/
third_party/glog/

# Virtualenv
.venv/
venv/

# Log files

@@ -1205,7 +1205,6 @@ exclude_patterns = [
# These files are all grandfathered in, feel free to remove from this list
# as necessary
# NOTE: remove the patterns in the order they are listed
'aten/src/ATen/native/[a-pA-P]*/**',
'aten/src/ATen/[a-mA-M]*/**',
'test/**',
]

CODEOWNERS (14 changed lines)

@@ -201,3 +201,17 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99

# FlexAttention
/torch/nn/attention/flex_attention.py @drisspg
/torch/_higher_order_ops/flex_attention.py @drisspg
/torch/_inductor/kernel/flex/ @drisspg
/torch/_inductor/codegen/cpp_flex_attention_template.py @drisspg
/test/inductor/test_flex_attention.py @drisspg
/test/inductor/test_flex_decoding.py @drisspg

# Low Precision GEMMs
/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
/test/test_scaled_matmul_cuda.py @drisspg @slayton58

@@ -289,14 +289,15 @@ IF(USE_FBGEMM_GENAI)

set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)

set(fbgemm_genai_mx8mx8bf16_grouped
set(fbgemm_genai_cuh
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
"${FBGEMM_GENAI_SRCS}/"
)

target_include_directories(fbgemm_genai PRIVATE
${FBGEMM_THIRD_PARTY}/cutlass/include
${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
${fbgemm_genai_mx8mx8bf16_grouped}
${fbgemm_genai_cuh}
${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h
)

@@ -177,7 +177,6 @@ inline void segmented_sort_pairs(
}
}

#if CUB_SUPPORTS_UNIQUE_BY_KEY()
template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename NumSelectedIteratorT>
inline void unique_by_key(
KeysInputIteratorT keys_in, ValuesInputIteratorT values_in,
@@ -193,7 +192,6 @@ inline void unique_by_key(
CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey,
keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream());
}
#endif

namespace impl {

@@ -579,7 +577,6 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
#endif
}

#if CUB_SUPPORTS_SCAN_BY_KEY()

template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT>
inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) {
@@ -607,7 +604,6 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT
#endif
}

#endif

template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT>
void unique(InputIteratorT input, OutputIteratorT output,

@@ -28,22 +28,6 @@
#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
#endif

// cub support for UniqueByKey is added to cub 1.16 in:
// https://github.com/NVIDIA/cub/pull/405
#if CUB_VERSION >= 101600
#define CUB_SUPPORTS_UNIQUE_BY_KEY() true
#else
#define CUB_SUPPORTS_UNIQUE_BY_KEY() false
#endif

// cub support for scan by key is added to cub 1.15
// in https://github.com/NVIDIA/cub/pull/376
#if CUB_VERSION >= 101500
#define CUB_SUPPORTS_SCAN_BY_KEY() 1
#else
#define CUB_SUPPORTS_SCAN_BY_KEY() 0
#endif

// cub support for cub::FutureValue is added to cub 1.15 in:
// https://github.com/NVIDIA/cub/pull/305
#if CUB_VERSION >= 101500

@@ -160,6 +160,10 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({
DispatchKey::CUDA,
DispatchKey::CPU,
DispatchKey::PrivateUse1,
DispatchKey::SparseCPU,
DispatchKey::SparseCUDA,
DispatchKey::SparseCsrCPU,
DispatchKey::SparseCsrCUDA,
});

inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) {

@@ -658,6 +658,7 @@ static void check_shape_forward(const at::Tensor& input,
TORCH_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported");
TORCH_CHECK(!params.is_stride_nonpos(), "non-positive stride is not supported");
TORCH_CHECK(!params.is_dilation_neg(), "dilation should be greater than zero");
TORCH_CHECK(groups > 0, "expected groups to be greater than 0, but got groups=", groups);

TORCH_CHECK(weight_dim == k,
"Expected ", weight_dim, "-dimensional input for ", weight_dim,

@@ -128,7 +128,7 @@ at::Tensor PackedLinearWeight::apply_impl(
auto* input_tr_ptr =
reinterpret_cast<uint8_t*>(input_tr.data_ptr<c10::quint8>());
// TODO: Activation transpose before and after the kernel can be removed if we
// keep activation tensor always tranposed.
// keep activation tensor always transposed.
fbgemm::transpose_simd<uint8_t>(
batch_size, K, input_ptr, K, input_tr_ptr, batch_size);

@@ -34,7 +34,7 @@ struct Dist {
// finish : This tells what to do with the aggregated value to compute
// the norm. Generally this is the result of val ^ (1 / p).
// backward : This is the gradient for that norm. Arguments are pretty
// self explanitory.
// self explanatory.
//
// There are a few cases where these aren't used. The 0 norm has no backward,
// because it's always 0, so that's shortcircuited earlier. There's a special

@@ -74,7 +74,7 @@ it to sum up the entire array into a single value.

`ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which
compiler flags it is currently compiled. This allows the programmer to write
generic code, which will be compiled under multipled compilation settings.
generic code, which will be compiled under multiplied compilation settings.

`../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains
a generic definition of `sumImplAll`. This function allows the user to reduce

@@ -1017,7 +1017,7 @@ struct HelperInterpBase {
while (aligned_interp_size % sizeof(int32_t) != 0) {
aligned_interp_size += 1;
}
// assert that we wont go out of bounds
// assert that we won't go out of bounds
TORCH_INTERNAL_ASSERT(aligned_interp_size * sizeof(int16_t) < interp_size * sizeof(double));
}

@@ -655,7 +655,7 @@ void ImagingResampleHorizontalConvolution8u4x(
// last element
auto mmk = _mm256_set1_epi32(k[i]);
// For num_channels == 3 (3 bytes = one pixel) we tolerate to read 4 bytes
// lines 0, 1 and 2 wont go out of allocated memory bounds
// lines 0, 1 and 2 won't go out of allocated memory bounds
auto pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)),
mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1);
@@ -889,7 +889,7 @@ void ImagingResampleHorizontalConvolution8u(
_mm_loadu_si128((__m128i *) (lineIn_min + stride * i))),
_mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1);

// Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA
// Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA
// RGBA: pix1 = [
// r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0
// r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0
@@ -1312,7 +1312,7 @@ void ImagingResampleVerticalConvolution8u(

// Here we write 4 bytes to the output even if num_channels < 4, e.g o = {r,g,b,X} for num_channels=3
// It is OK to write 4th byte (e.g. X) as on the next step we will overwrite it with new data.
// We also wont go out of bounds of lineOut memory allocation
// We also won't go out of bounds of lineOut memory allocation
std::memcpy(lineOut + j, (uint8_t *) &o, 4);
}

@@ -240,7 +240,7 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 8 sines at onces using AVX intrinsics
/* evaluation of 8 sines at once using AVX intrinsics

The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to

@@ -311,7 +311,7 @@ void GroupNormKernelImplChannelsLastInternal(
const bool gamma_null = (gamma_data == nullptr);
const bool beta_null = beta_data == nullptr;

// NB: About algorithm choosen:
// NB: About algorithm chosen:
//
// On channels last, GroupNorm has a input shape of {N, H, W, GD},
// Mean and rstd are collected per each n and g, which involves reduction

@@ -930,7 +930,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
}
};

// Dynamically Quantize the float32 input to 8 bit assymetric
// Dynamically Quantize the float32 input to 8 bit asymmetric
input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx);

const size_t lhs_stride =
@@ -1163,7 +1163,7 @@ void dyn_quant_matmul_4bit_kernel(
const int64_t weight_packed_size =
kleidiai::kai_pack_rhs_int4_size(N, K, block_size);
if (weight_packed_size == packed_weights.numel()) {
// KleidiAI interface intenally handles the Channelwise and groupwise
// KleidiAI interface internally handles the Channelwise and groupwise
// distinction
kleidiai::kai_quant_pack_lhs_int4_mm(
output, inp, packed_weights, M, N, K, block_size);

@@ -705,7 +705,7 @@ namespace {
);
} while (!done && max_threads);
if (!done) {
TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accomodate sharedMemPerBlock limit");
TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accommodate sharedMemPerBlock limit");
}
break;
}
@@ -154,19 +154,19 @@ struct cublasCommonArgs {
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
bool transpose_result = false, transpose_a = false, transpose_b = false;
result = prepare_matrix_for_cublas(c, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); // codespell:ignore
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result); // codespell:ignore
// Handle scale tensors if provided
if (scale_a && scale_b) {
// By default since we return in row-major we run the gemm
// as B.T @ A.T, check transpose_result to determine if we flip the scales
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr(); // codespell:ignore
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); // codespell:ignore
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a; // codespell:ignore
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); // codespell:ignore
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); // codespell:ignore
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b; // codespell:ignore
}
if (scale_result) {
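The "B.T @ A.T" comment above is the usual row-major-via-column-major identity, (A*B)^T = B^T * A^T: asking a column-major GEMM for B^T * A^T writes C^T in column-major order, which is exactly C in row-major order, and that is why the operand and scale roles swap when transpose_result is set. A minimal standalone sketch of the idea, using a hypothetical plain-C++ gemm_colmajor helper rather than cuBLAS:

```cpp
// Sketch only (hypothetical helper, not PyTorch/cuBLAS code): a column-major
// GEMM asked to compute B^T * A^T writes C^T in column-major order, which is
// exactly C in row-major order.
#include <cstdio>
#include <vector>

// Plain column-major GEMM: Z(MxN) = X(MxK) * Y(KxN), leading dims = rows.
static void gemm_colmajor(int M, int N, int K,
                          const double* X, const double* Y, double* Z) {
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < M; ++i) {
      double acc = 0.0;
      for (int p = 0; p < K; ++p) acc += X[i + p * M] * Y[p + j * K];
      Z[i + j * M] = acc;
    }
}

int main() {
  const int m = 2, k = 3, n = 2;
  std::vector<double> A = {1, 2, 3, 4, 5, 6};     // m x k, row-major
  std::vector<double> B = {7, 8, 9, 10, 11, 12};  // k x n, row-major
  std::vector<double> C(m * n);                   // m x n, row-major
  // A row-major buffer viewed column-major is the transpose for free,
  // so ask the column-major GEMM for C^T = B^T * A^T.
  gemm_colmajor(n, m, k, B.data(), A.data(), C.data());
  std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  // 58 64 / 139 154
}
```

The printed values match the ordinary row-major product even though the helper only ever saw column-major views of the buffers.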
@@ -180,17 +180,17 @@ struct cublasCommonArgs {
transpose_b = !transpose_b;
}
auto sizes_a = mata->sizes();
auto sizes_b = matb->sizes();
auto sizes_a = mata->sizes(); // codespell:ignore
auto sizes_b = matb->sizes(); // codespell:ignore
m = sizes_a[transpose_result ? 1 : 0];
k = sizes_a[transpose_result ? 0 : 1];
n = sizes_b[transpose_result ? 0 : 1];
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); // codespell:ignore
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); // codespell:ignore
result_ld = result->stride(transpose_result ? 0 : 1);
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n'; // codespell:ignore
// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
// if the gemm operands are in packed float4
@@ -205,16 +205,16 @@ struct cublasCommonArgs {
char transa, transb;
int64_t m, n, k;
int64_t lda, ldb, result_ld;
c10::MaybeOwned<Tensor> mata, matb, result;
c10::MaybeOwned<Tensor> mata, matb, result; // codespell:ignore
// Scale members
void* scale_mata_ptr = nullptr;
void* scale_matb_ptr = nullptr;
void* scale_mata_ptr = nullptr; // codespell:ignore
void* scale_matb_ptr = nullptr; // codespell:ignore
void* scale_result_ptr = nullptr;
std::optional<c10::ScalarType> scale_mata_dtype;
std::optional<ScalingType> scaling_mata_type;
std::optional<c10::ScalarType> scale_matb_dtype;
std::optional<ScalingType> scaling_matb_type;
std::optional<c10::ScalarType> scale_mata_dtype; // codespell:ignore
std::optional<ScalingType> scaling_mata_type; // codespell:ignore
std::optional<c10::ScalarType> scale_matb_dtype; // codespell:ignore
std::optional<ScalingType> scaling_matb_type; // codespell:ignore
std::optional<c10::ScalarType> scale_result_dtype;
};
} // namespace
@@ -362,7 +362,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
static bool disable_addmm_cuda_lt = getDisableAddmmCudaLt();
#endif
// if lt path fails, we recurse back into this function here and force the lt path to off
// we cannot update varible disable_addmm_cuda_lt from above since it is static and would be permanent
// we cannot update variable disable_addmm_cuda_lt from above since it is static and would be permanent
bool disable_addmm_cuda_lt_final = disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
#if defined(USE_ROCM) && ROCM_VERSION == 60400
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use
@@ -2322,12 +2322,23 @@ _scaled_nvfp4_nvfp4(
const Tensor& scale_b, const SwizzleType swizzle_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
const bool single_scale,
Tensor& out) {
Tensor& out,
const std::optional<Tensor>& global_scale_a = std::nullopt,
const std::optional<Tensor>& global_scale_b = std::nullopt) {
#ifdef USE_ROCM
TORCH_CHECK_NOT_IMPLEMENTED(false, "NVFP4 scaling not supported on ROCM");
#endif
TORCH_CHECK_VALUE(single_scale, "Only single-scaled NVFP4 currently supported");
std::optional<Tensor> alpha = std::nullopt;
// Note: "Or" here means that if only one scale is passed, we check for the other. Otherwise,
// if this is "And" we would silently do nothing in the case where one global scale is
// passed and not the other.
if (global_scale_a.has_value() || global_scale_b.has_value()) {
TORCH_CHECK_VALUE(global_scale_a.has_value(),
"For two-level-scaled NVFP4, global_scale_a must have a value");
TORCH_CHECK_VALUE(global_scale_b.has_value(),
"For two-level-scaled NVFP4, global_scale_b must have a value");
alpha = global_scale_a.value().mul(global_scale_b.value());
}
// Restrictions:
// A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
// Scales must be swizzled
@@ -2349,7 +2360,7 @@ _scaled_nvfp4_nvfp4(
auto scaling_choice_a = ScalingType::BlockWise1x16;
auto scaling_choice_b = ScalingType::BlockWise1x16;
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out, alpha);
}
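The "Or" note above is the whole point of the check: either both global scales are supplied (two-level scaling, combined into alpha) or neither is, and checking with || surfaces a missing partner instead of silently ignoring the one that was passed. A hedged sketch of that validation pattern, with hypothetical names rather than the ATen signature:

```cpp
// Illustrative sketch only: two optional "global scale" inputs must be either
// both present (two-level scaling) or both absent (single-level scaling).
#include <optional>
#include <stdexcept>

std::optional<double> combined_global_scale(const std::optional<double>& a,
                                            const std::optional<double>& b) {
  if (a.has_value() || b.has_value()) {
    if (!a.has_value()) throw std::invalid_argument("global_scale_a missing");
    if (!b.has_value()) throw std::invalid_argument("global_scale_b missing");
    return a.value() * b.value();  // plays the role of alpha above
  }
  return std::nullopt;  // single-level scaling: no alpha
}
```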
@@ -2555,9 +2566,10 @@ _scaled_mm_cuda_v2_out(
} else if (gemm_impl == ScaledGemmImplementation::MXFP8_MXFP8) {
return _scaled_mxfp8_mxfp8(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4) {
TORCH_CHECK_NOT_IMPLEMENTED(false, "Only single-scale NVFP4 currently supported");
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out,
scale_a[1], scale_b[1]);
} else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE) {
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, true /* single_scale */, out);
return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
} else if (gemm_impl == ScaledGemmImplementation::MXFP4_MXFP4) {
return _scaled_mxfp4_mxfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out);
} else {
@@ -2874,7 +2886,7 @@ _scaled_grouped_mm_cuda_v2(
"Contraction dimensions (", dim_a, ",", dim_b, ") of mat_a and mat_b must match, got: ", mat_a.size(dim_a), " and ",
mat_b.size(dim_b));
// Note: only (-1, -2) is currently supported
TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Curently contraction dims must be (-1, -2) only");
TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Currently contraction dims must be (-1, -2) only");
} else {
TORCH_CHECK_VALUE(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match");
}
@ -298,7 +298,7 @@ static void jitted_gpu_kernel_impl(
|
||||
at::opmath_type<f_inputs_type> scalar_val,
|
||||
const std::tuple<ExtraArgs...>& extra_args) {
|
||||
|
||||
// TODO: Memory use can probably be optimized by re-using kernels across GPUs with
|
||||
// TODO: Memory use can probably be optimized by reusing kernels across GPUs with
|
||||
// the same compute capability
|
||||
static std::mutex jiterator_mutex;
|
||||
static std::vector<JittedKernelVariantCache> device_caches(c10::cuda::device_count());
|
||||
|
@ -494,7 +494,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen)
|
||||
auto value = static_cast<scalar_t>(rand * range + from);
|
||||
// reverse the bounds of curand4 from (0, 1] to [0, 1)
|
||||
// Note that this method is from legacy THCTensorRandom and is likely to give
|
||||
// you more 0-s, since, the probability of gettings 1-s is higher than 0-s and
|
||||
// you more 0-s, since, the probability of getting 1-s is higher than 0-s and
|
||||
// by reversing the bounds, we are flipping the probabilities of 1-s and 0-s.
|
||||
// BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
|
||||
auto reverse_bound_value = value == to ? from : value;
|
||||
|
@ -75,7 +75,7 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo<const scalar_t, IndexType>
|
||||
// We'll use this to actually cause vectorized loads later
|
||||
LoadT *value = reinterpret_cast<LoadT*>(&src);
|
||||
|
||||
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
|
||||
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
|
||||
// Note: need a new set of random values per 4 elements -- we'll handle VEC elements in this thread, so need ceil(VEC / 4)
|
||||
// sets of rand.
|
||||
if ((VEC >= 4) || (gridxvec_loop_state == 0)) {
|
||||
@ -159,7 +159,7 @@ fused_dropout_kernel(cuda::detail::TensorInfo<const scalar_t, IndexType> a,
|
||||
for (IndexType linearIndex = idx;
|
||||
linearIndex < rounded_size;
|
||||
linearIndex += gridDim.x * blockDim.x*UNROLL) {
|
||||
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
|
||||
//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for Halfs, so generate float for everything
|
||||
float4 rand = curand_uniform4(&state);
|
||||
scalar_t src[UNROLL];
|
||||
rand.x = rand.x < p;
|
||||
|
@ -15,9 +15,7 @@
|
||||
#include <ATen/native/cuda/block_reduce.cuh>
|
||||
#include <ATen/native/cuda/thread_constants.h>
|
||||
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
#include <thrust/iterator/reverse_iterator.h>
|
||||
#endif
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
@ -240,10 +238,6 @@ __global__ void renorm_kernel(
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
#if !CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
template<typename index_t>
|
||||
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
|
||||
#endif
|
||||
|
||||
Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_,
|
||||
int64_t num_weights, int64_t padding_idx,
|
||||
@ -306,7 +300,6 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
|
||||
|
||||
if (scale_grad_by_freq) {
|
||||
count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
@ -333,11 +326,6 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
|
||||
num_indices
|
||||
);
|
||||
});
|
||||
#else
|
||||
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
|
||||
embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
|
||||
});
|
||||
#endif
|
||||
}
|
||||
|
||||
return embedding_backward_cuda_kernel(grad, orig_indices,
|
||||
|
@ -10,9 +10,7 @@
|
||||
|
||||
#include <c10/macros/Macros.h>
|
||||
|
||||
#if CUB_SUPPORTS_UNIQUE_BY_KEY()
|
||||
#include <thrust/iterator/counting_iterator.h>
|
||||
#endif
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
@ -26,7 +24,7 @@ namespace at::native {
|
||||
namespace {
|
||||
|
||||
/* This code computes the sum of the weights in two-steps:
|
||||
1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indeces`
|
||||
1) Each GPU warp sums `NROWS_PER_THREAD` number of row given by `indices`
|
||||
2) Each partial-sum from 1) are summed and scatter into `grad_weight`
|
||||
|
||||
Notice, `NROWS_PER_THREAD` impacts the Achieved Occupancy of the
|
||||
@ -196,18 +194,9 @@ __global__ void compute_num_of_partial_segments(const index_t *partials_per_segm
|
||||
partials_per_segment_offset[num_of_segments-1];
|
||||
}
|
||||
|
||||
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
|
||||
__global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) {
|
||||
*num_of_segments_ptr = num_of_segments;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // anon namespace
|
||||
|
||||
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
|
||||
template<typename index_t>
|
||||
int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets);
|
||||
#endif
|
||||
|
||||
Tensor embedding_backward_cuda_kernel(
|
||||
const Tensor &grad,
|
||||
@ -234,20 +223,12 @@ Tensor embedding_backward_cuda_kernel(
|
||||
auto segment_offsets = at::empty({numel}, orig_indices.options());
|
||||
auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong));
|
||||
int64_t *num_of_segments_ptr = num_of_segments_tensor.mutable_data_ptr<int64_t>();
|
||||
#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
|
||||
AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
|
||||
int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key<index_t>(sorted_indices, segment_offsets);
|
||||
write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
});
|
||||
#else
|
||||
AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
|
||||
cuda::cub::unique_by_key(
|
||||
sorted_indices.const_data_ptr<index_t>(), thrust::make_counting_iterator(0),
|
||||
segment_offsets.mutable_data_ptr<index_t>(),
|
||||
num_of_segments_ptr, sorted_indices.numel());
|
||||
});
|
||||
#endif
|
||||
|
||||
int64_t max_segments = std::min<int64_t>(numel, num_weights);
|
||||
|
||||
|
@ -31,16 +31,10 @@
|
||||
|
||||
#include <c10/macros/Macros.h>
|
||||
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
#include <thrust/iterator/reverse_iterator.h>
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
|
||||
#if !CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
template<typename index_t>
|
||||
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
@ -199,7 +193,6 @@ Tensor embedding_bag_backward_cuda_sum_avg(
|
||||
|
||||
if (scale_grad_by_freq) {
|
||||
count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
@ -226,11 +219,6 @@ Tensor embedding_bag_backward_cuda_sum_avg(
|
||||
num_indices
|
||||
);
|
||||
});
|
||||
#else
|
||||
AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
|
||||
embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
|
||||
});
|
||||
#endif
|
||||
}
|
||||
return embedding_backward_cuda_kernel(grad, orig_indices, sorted_indices,
|
||||
count, num_weights, padding_idx, mode == EmbeddingBagMode::MEAN, offset2bag,
|
||||
|
@ -204,7 +204,7 @@ Scalar scalar_reciprocal(const Scalar& scalar) {
|
||||
return Scalar(1. / scalar.toComplexDouble());
|
||||
}
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
false, "divison with ", scalar.type(), " not supported");
|
||||
false, "division with ", scalar.type(), " not supported");
|
||||
}
|
||||
|
||||
void foreach_tensor_div_scalar_kernel_cuda_(
|
||||
|
@ -57,7 +57,7 @@ namespace {
|
||||
const index_t n = index / (out_H * out_W);
|
||||
const index_t grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
|
||||
|
||||
// get the corresponding input x, y co-ordinates from grid
|
||||
// get the corresponding input x, y coordinates from grid
|
||||
opmath_t x = grid.data[grid_offset];
|
||||
opmath_t y = grid.data[grid_offset + grid_sCoor];
|
||||
|
||||
@ -193,7 +193,7 @@ namespace {
|
||||
const index_t n = index / (out_D * out_H * out_W);
|
||||
const index_t grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
|
||||
|
||||
// get the corresponding input x, y, z co-ordinates from grid
|
||||
// get the corresponding input x, y, z coordinates from grid
|
||||
opmath_t x = grid.data[grid_offset];
|
||||
opmath_t y = grid.data[grid_offset + grid_sCoor];
|
||||
opmath_t z = grid.data[grid_offset + 2 * grid_sCoor];
|
||||
@ -358,7 +358,7 @@ namespace {
|
||||
const index_t n = index / (out_H * out_W);
|
||||
const auto grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
|
||||
|
||||
// get the corresponding input x, y co-ordinates from grid
|
||||
// get the corresponding input x, y coordinates from grid
|
||||
scalar_t x = grid.data[grid_offset];
|
||||
scalar_t y = grid.data[grid_offset + grid_sCoor];
|
||||
|
||||
@ -572,7 +572,7 @@ namespace {
|
||||
const index_t n = index / (out_D * out_H * out_W);
|
||||
const auto grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
|
||||
|
||||
// get the corresponding input x, y, z co-ordinates from grid
|
||||
// get the corresponding input x, y, z coordinates from grid
|
||||
scalar_t ix = grid.data[grid_offset];
|
||||
scalar_t iy = grid.data[grid_offset + grid_sCoor];
|
||||
scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor];
|
||||
|
@ -8,7 +8,7 @@
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
|
||||
// Three warninngs in Cutlass included header files
|
||||
// Three warnings in Cutlass included header files
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
|
||||
|
@ -377,7 +377,7 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) {
|
||||
* result at the boundary
|
||||
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
|
||||
* Large Parameter (see DLMF 8.12.4 [igam1])
|
||||
* - if x > 1.1 and x < a, using the substraction from the regularized lower
|
||||
* - if x > 1.1 and x < a, using the subtraction from the regularized lower
|
||||
* incomplete gamma
|
||||
* - otherwise, calculate the series from [igam2] eq (5)
|
||||
*/
|
||||
@ -460,7 +460,7 @@ __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) {
|
||||
* result at the boundary
|
||||
* - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
|
||||
* Large Parameter (see DLMF 8.12.3 [igam1])
|
||||
* - if x > 1 and x > a, using the substraction from the regularized upper
|
||||
* - if x > 1 and x > a, using the subtraction from the regularized upper
|
||||
* incomplete gamma
|
||||
* - otherwise, calculate the series from [igam2] eq (4)
|
||||
*/
|
||||
|
@ -332,7 +332,7 @@ void cuda_take_put_kernel(
|
||||
const auto offset_calc = make_offset_calculator<2>(iter);
|
||||
using uindex_t = std::make_unsigned_t<index_t>;
|
||||
|
||||
// OffsetCalculator needs the sizes and strides reveresed
|
||||
// OffsetCalculator needs the sizes and strides reversed
|
||||
const auto indexed_sizes = std::vector<int64_t>(indexed.sizes().rbegin(), indexed.sizes().rend());
|
||||
const auto indexed_strides = std::vector<int64_t>(indexed.strides().rbegin(), indexed.strides().rend());
|
||||
const auto* indexed_strides_data = indexed_strides.data();
|
||||
|
@ -1611,7 +1611,7 @@ void index_select_out_cuda_impl(
|
||||
|
||||
// SmallIndexKernel is more performant when the number of indices is small, and pre-loading
|
||||
// the index reduces memory accesses. When the number of indices is large, we avoid that
|
||||
// and increase parallellism by calling gather_out which is a generalization of index_select
|
||||
// and increase parallelism by calling gather_out which is a generalization of index_select
|
||||
if (cuda::detail::canUse32BitIndexMath(out) &&
|
||||
cuda::detail::canUse32BitIndexMath(self) &&
|
||||
cuda::detail::canUse32BitIndexMath(index) &&
|
||||
|
@ -273,7 +273,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
|
||||
|
||||
scalar_t* dst = self_ptr + index;
|
||||
|
||||
//pack coalseced bf16 and fp16
|
||||
//pack coalesced bf16 and fp16
|
||||
if constexpr (std::is_same<scalar_t, c10::BFloat16>::value || std::is_same<scalar_t, c10::Half>::value)
|
||||
{
|
||||
typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
|
||||
@ -316,7 +316,7 @@ __device__ __forceinline__ void opportunistic_fastAtomicAdd(
|
||||
}
|
||||
}
|
||||
|
||||
// not coalsced, so now let try to capture lane-matches...
|
||||
// not coalesced, so now let try to capture lane-matches...
|
||||
|
||||
if (numel > 16 /*<-hueristic threshold*/ * 64 ) {
|
||||
// well shucks, unlikely to capture same-dest atomics in a wave.
|
||||
|
@@ -1,90 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/native/cuda/SortingCommon.cuh>
#include <ATen/cuda/cub_definitions.cuh>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty_like.h>
#endif
#include <ATen/cuda/ThrustAllocator.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/constant_iterator.h>
namespace at::native {
#if !CUB_SUPPORTS_SCAN_BY_KEY()
template<typename index_t>
void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
at::cuda::ThrustAllocator allocator;
auto policy = thrust::cuda::par(allocator).on(stream);
auto num_indices = count.numel();
// Compute an increasing sequence per unique item in sortedIndices:
// sorted: 2 5 5 5 7 7 8 9 9
// count: 1 1 2 3 1 2 1 1 2
auto sorted_data = thrust::device_ptr<const index_t>(sorted_indices.const_data_ptr<index_t>());
auto count_data = thrust::device_ptr<index_t>(count.mutable_data_ptr<index_t>());
thrust::inclusive_scan_by_key(
policy,
sorted_data,
sorted_data + num_indices,
thrust::make_constant_iterator(1),
count_data
);
// Take the maximum of each count per unique key in reverse:
// sorted: 2 5 5 5 7 7 8 9 9
// count: 1 3 3 3 2 2 1 2 2
thrust::inclusive_scan_by_key(
policy,
thrust::make_reverse_iterator(sorted_data + num_indices),
thrust::make_reverse_iterator(sorted_data),
thrust::make_reverse_iterator(count_data + num_indices),
thrust::make_reverse_iterator(count_data + num_indices),
thrust::equal_to<index_t>(),
thrust::maximum<index_t>()
);
}
template
void embedding_dense_backward_cuda_scan<int>(Tensor &sorted_indices, Tensor &count);
template
void embedding_dense_backward_cuda_scan<int64_t>(Tensor &sorted_indices, Tensor &count);
#endif
template<typename index_t>
int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets) {
auto stream = at::cuda::getCurrentCUDAStream();
at::cuda::ThrustAllocator allocator;
auto policy = thrust::cuda::par(allocator).on(stream);
const ptrdiff_t numel = sorted_indices.numel();
auto sorted_indices_dev = thrust::device_ptr<const index_t>(sorted_indices.const_data_ptr<index_t>());
auto dummy = at::empty_like(sorted_indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto dummy_dev = thrust::device_ptr<index_t>(dummy.mutable_data_ptr<index_t>());
auto ends = thrust::unique_by_key_copy(
policy,
sorted_indices_dev,
sorted_indices_dev + numel,
thrust::make_counting_iterator(0),
dummy_dev,
thrust::device_ptr<index_t>(segment_offsets.mutable_data_ptr<index_t>()));
return thrust::get<0>(ends) - dummy_dev;
}
template
int64_t embedding_backward_cuda_kernel_unique_by_key<int>(const Tensor &sorted_indices, Tensor &segment_offsets);
template
int64_t embedding_backward_cuda_kernel_unique_by_key<int64_t>(const Tensor &sorted_indices, Tensor &segment_offsets);
} // namespace at::native
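The deleted Thrust fallback above documents its counting scheme with the sorted/count example in its comments. A small host-side sketch of the same two passes, illustrative plain C++ only rather than the CUDA path: a forward scan-by-key that numbers each duplicate, then a reverse max-scan-by-key that broadcasts every key's total.

```cpp
// Sketch of the per-key counting used above, on the host for clarity.
// sorted: 2 5 5 5 7 7 8 9 9  ->  count: 1 3 3 3 2 2 1 2 2
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> sorted = {2, 5, 5, 5, 7, 7, 8, 9, 9};
  std::vector<int> count(sorted.size());
  // Pass 1: inclusive scan by key of the constant 1 -> running occurrence index.
  for (size_t i = 0; i < sorted.size(); ++i)
    count[i] = (i > 0 && sorted[i] == sorted[i - 1]) ? count[i - 1] + 1 : 1;
  // Pass 2: reverse inclusive max-scan by key -> every element gets the total.
  for (size_t i = sorted.size() - 1; i-- > 0;)
    if (sorted[i] == sorted[i + 1] && count[i + 1] > count[i])
      count[i] = count[i + 1];
  for (int c : count) std::printf("%d ", c);  // 1 3 3 3 2 2 1 2 2
  std::printf("\n");
}
```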
@ -343,7 +343,7 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
|
||||
if (input_length == 0)
|
||||
return;
|
||||
|
||||
// "first" row, the beta initialization before eq (10) (t=target_length - differes per batch)
|
||||
// "first" row, the beta initialization before eq (10) (t=target_length - differs per batch)
|
||||
for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) {
|
||||
int64_t s = threadIdx.x + block_s;
|
||||
scalar_t lb;
|
||||
|
@ -816,7 +816,7 @@ const auto erfcx_string = jiterator_stringify(
|
||||
with the usual checks for overflow etcetera.
|
||||
|
||||
Performance-wise, it seems to be substantially faster than either
|
||||
the SLATEC DERFC function [or an erfcx function derived therefrom]
|
||||
the SLATEC DERFC function [or an erfcx function derived there from]
|
||||
or Cody's CALERF function (from netlib.org/specfun), while
|
||||
retaining near machine precision in accuracy.
|
||||
*/
|
||||
|
@ -370,7 +370,7 @@ struct vectorized {
|
||||
|
||||
#ifdef USE_ROCM
|
||||
// This is similar to vectorized policy above, but this one supports
|
||||
// heterogenous input tensor types as templated parameters.
|
||||
// heterogeneous input tensor types as templated parameters.
|
||||
// Its use should be limited to frequently used heterogeneous data types
|
||||
// as each instantiation will generate a separate kernel, leading to code
|
||||
// bloating if applied to all combinations supported in PyTorch. Assumption: all
|
||||
|
@ -309,7 +309,7 @@ __global__ void sampleMultinomialOnce(
|
||||
} else {
|
||||
// This should address a rare bug where we don't select a valid index. This likely occurs when
|
||||
// due to floating point arithmetic rounding errors, our cumulative sum does not add up to 1, but
|
||||
// and our uniform sample is greater than this value. In this case we likely have unitialized memory
|
||||
// and our uniform sample is greater than this value. In this case we likely have uninitialized memory
|
||||
// in dest[curDist]. So basically we will loop through the distribution and pick the largest index
|
||||
// where the distribution is non-zero. This is obviously terribly inefficient, but due to the
|
||||
// rarity in which this occurs, this should not be an issue.
|
||||
|
@ -1623,7 +1623,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
|
||||
const auto stride = input.sizes()[1];
|
||||
const auto reduction_size = input.numel() / stride;
|
||||
|
||||
// Input is guarunteed to be channels-last compatible
|
||||
// Input is guaranteed to be channels-last compatible
|
||||
at::Tensor grad_input = at::empty_like(input);
|
||||
|
||||
dim3 block;
|
||||
@ -1691,7 +1691,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
|
||||
const auto reduction_size = input.numel() / stride;
|
||||
auto norm_fct = 1.0 / reduction_size;
|
||||
|
||||
// Input is guarunteed to be channels-last compatible
|
||||
// Input is guaranteed to be channels-last compatible
|
||||
at::Tensor grad_input = at::empty_like(input);
|
||||
|
||||
dim3 block;
|
||||
|
@ -37,7 +37,7 @@ namespace at::native {
|
||||
// threshold probability for having non-duplicate keys, then it can be proved that[1]
|
||||
// the number of bits required is: ceil(log2(n - (6 n^2 + 1) / (12 log(q))))
|
||||
//
|
||||
// Then after sort, we lauch a separate kernel that additionally shuffles any islands
|
||||
// Then after sort, we launch a separate kernel that additionally shuffles any islands
|
||||
// of values whose keys matched. The algorithm of this kernel is as follows:
|
||||
// Each thread reads its key and the keys of its neighbors to tell if it's part of an island.
|
||||
// For each island, the first thread in the island sees a key match at index i+1 but not index i-1.
|
||||
|
@ -1088,12 +1088,12 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
|
||||
// load instructions.
|
||||
//
|
||||
// Case 1: "vectorize along input"
|
||||
// This case happens when we are reducing along fastest moving dimesion. In such case, threads
|
||||
// This case happens when we are reducing along fastest moving dimension. In such case, threads
|
||||
// with the same threadIdx.y works on the same reduction cooperatively and will produce results
|
||||
// for the same output. In such case, values in each loaded vector always correspond to the same output.
|
||||
//
|
||||
// Case 2: "vectorize along output"
|
||||
// This case happens when the fastest moving dimesion is not the dimension of reduction. In such case,
|
||||
// This case happens when the fastest moving dimension is not the dimension of reduction. In such case,
|
||||
// threads with different threadIdx.x are independent and will produce results for different outputs.
|
||||
// In such case, values in each loaded vector always correspond to different outputs.
|
||||
if (fastest_moving_stride == sizeof(scalar_t)) {
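The two cases above reduce to whether the fastest-moving input stride equals the element size. A hedged sketch of that dispatch, with a hypothetical enum and helper rather than the actual ReduceConfig code:

```cpp
// Illustrative only (hypothetical names): choose how to vectorize a reduction
// by comparing the fastest-moving input stride, in bytes, to the element size.
#include <cstddef>

enum class VecStrategy { AlongInput, AlongOutput };

template <typename scalar_t>
VecStrategy pick_vectorization(std::size_t fastest_moving_stride_bytes) {
  if (fastest_moving_stride_bytes == sizeof(scalar_t)) {
    // The reduced dimension is contiguous: all lanes of a vector load feed
    // the same output, so threads sharing threadIdx.y cooperate on it.
    return VecStrategy::AlongInput;
  }
  // The contiguous dimension is not reduced: lanes map to different outputs,
  // so threads with different threadIdx.x produce independent results.
  return VecStrategy::AlongOutput;
}
```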
|
||||
|
@ -241,7 +241,7 @@ __global__ void reflection_pad2d_backward_det_out_kernel(
|
||||
const int64_t dist_cols = ::abs(inp_col - (input_dim_x - 1));
|
||||
|
||||
// we were dist_rows after, now we want to be dist_rows before
|
||||
// we were dist_cols before, now we wnat to be dist_cols after
|
||||
// we were dist_cols before, now we want to be dist_cols after
|
||||
const int64_t reflect_tr_out_row = (corner_tr_out_row - dist_rows);
|
||||
const int64_t reflect_tr_out_col = (corner_tr_out_col + dist_cols);
|
||||
const int64_t reflect_tr_out =
|
||||
|
@ -5,7 +5,7 @@
|
||||
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
|
||||
// Two warninngs in Cutlass included header files
|
||||
// Two warnings in Cutlass included header files
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-field-initializers")
|
||||
|
@ -7,7 +7,7 @@
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
// Two warninngs in Cutlass included header files
|
||||
// Two warnings in Cutlass included header files
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-variable")
|
||||
|
@ -19,7 +19,6 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
// TODO: remove this when CUDA <11.6 is no longer supported
|
||||
void topk_out_with_sort(
|
||||
const Tensor& self,
|
||||
int64_t k, int64_t dim, bool largest,
|
||||
@ -31,21 +30,12 @@ void topk_out_with_sort(
|
||||
indices.copy_(sorted_indices.narrow(dim, 0, k));
|
||||
}
|
||||
|
||||
// TODO: remove this when CUDA <11.6 is no longer supported
|
||||
bool disable_sort_for_topk();
|
||||
bool should_use_sort(const Tensor& self, int64_t dim) {
|
||||
#if defined(USE_ROCM)
|
||||
if (self.dtype() == kBool) return false; // Bool sort not supported in ROCm: https://github.com/pytorch/pytorch/issues/139972
|
||||
return (self.numel() >= 10000 && self.numel() == self.size(dim)); // based on the experiments in https://github.com/pytorch/pytorch/pull/146387
|
||||
#else
|
||||
if (disable_sort_for_topk()) return false;
|
||||
// This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/68632
|
||||
if (self.dim() == 0) return false;
|
||||
if (self.dtype() == kBool) return false; // Bool is not support by topk
|
||||
int64_t slice_size = self.size(dim);
|
||||
if (slice_size == 0) return false;
|
||||
int64_t num_slices = self.numel() / slice_size;
|
||||
return num_slices <= 10 && slice_size >= 100000;
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -21,11 +21,6 @@ using namespace at::native;
|
||||
|
||||
namespace at::native {
|
||||
|
||||
// TODO: remove this when CUDA <11.6 is no longer supported
|
||||
bool disable_sort_for_topk() {
|
||||
return CUB_SUPPORTS_SCAN_BY_KEY();
|
||||
}
|
||||
|
||||
namespace sbtopk { // single_block_topk
|
||||
|
||||
template <typename T>
|
||||
@ -418,10 +413,6 @@ __global__ void computeBlockwiseWithinKCounts(
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
#if !CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
return;
|
||||
#endif
|
||||
|
||||
Bitwise desired_digit = at::cuda::Bitfield<Bitwise>::getBitfield(desired, current_bit, RADIX_BITS);
|
||||
|
||||
// if largest, then only threads that has tidx > desired_digit are active
|
||||
@ -477,7 +468,6 @@ __global__ void computeBlockwiseWithinKCounts(
|
||||
}
|
||||
}
|
||||
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
// Assumption: slice_size can not be larger than UINT32_MAX
|
||||
template <typename Bitwise>
|
||||
__global__ void computeBlockwiseKthCounts(
|
||||
@ -609,7 +599,6 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo<const T, IndexType> inpu
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) {
|
||||
// occupancy of this kernel is limited by registers per threads
|
||||
@ -687,16 +676,12 @@ void launch(
|
||||
uint32_t* digit_cum_sum = reinterpret_cast<uint32_t*>(digit_cum_sum_buffer.get());
|
||||
AT_CUDA_CHECK(cudaMemsetAsync(digit_cum_sum, 0, numInputSlices * RADIX_DIGITS * sizeof(uint32_t), stream));
|
||||
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
|
||||
uint32_t* withinKCounts = reinterpret_cast<uint32_t*>(withinKCounts_buffer.get());
|
||||
AT_CUDA_CHECK(cudaMemsetAsync(withinKCounts, 0, num_blocks * sizeof(uint32_t), stream));
|
||||
|
||||
auto kthCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t));
|
||||
uint32_t* kthCounts = reinterpret_cast<uint32_t*>(kthCounts_buffer.get());
|
||||
#else
|
||||
uint32_t* withinKCounts = nullptr;
|
||||
#endif
|
||||
|
||||
Bitwise desiredMask = 0;
|
||||
dim3 grid;
|
||||
@ -743,7 +728,6 @@ void launch(
|
||||
}
|
||||
desired = desired_in;
|
||||
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
computeBlockwiseKthCounts<Bitwise><<<std::min(((int64_t)numInputSlices + 255) / 256, (int64_t)1073741824), 256, 0, stream>>>(
|
||||
desired, counts, num_blocks, blocks_per_slice, kthCounts);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
@ -759,28 +743,6 @@ void launch(
|
||||
topK, topKWithinSliceStride, indices, indicesWithinSliceStride, items_per_thread,
|
||||
blocks_per_slice, kthValues, withinKCounts, kthCounts, num_blocks);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
#else
|
||||
// Find topk values based on kth values
|
||||
{
|
||||
dim3 grid;
|
||||
TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices for topk");
|
||||
int warp_size = at::cuda::warp_size();
|
||||
dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024));
|
||||
sbtopk::gatherTopK<T, IndexType, Dim, /* WithKthValues= */true><<<grid, block, 0, stream>>>(
|
||||
input,
|
||||
inputSliceSize,
|
||||
outputSliceSize,
|
||||
largest,
|
||||
numInputSlices,
|
||||
inputWithinSliceStride,
|
||||
topK,
|
||||
topKWithinSliceStride,
|
||||
indices,
|
||||
indicesWithinSliceStride,
|
||||
kthValues);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace mbtopk
|
||||
@ -788,7 +750,6 @@ void launch(
|
||||
bool should_use_multiblock(int64_t num_slices, int64_t slice_size) {
|
||||
if (num_slices > std::numeric_limits<uint32_t>::max() ||
|
||||
slice_size > std::numeric_limits<uint32_t>::max()) return false;
|
||||
#if CUB_SUPPORTS_SCAN_BY_KEY()
|
||||
// This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/74267
|
||||
return (num_slices <= 20 && slice_size >= 20000) ||
|
||||
(num_slices > 20 && num_slices <= 40 && slice_size >= 10000) ||
|
||||
@ -797,12 +758,6 @@ bool should_use_multiblock(int64_t num_slices, int64_t slice_size) {
|
||||
(num_slices >= 200 && num_slices < 800 && slice_size >= 3000) ||
|
||||
(num_slices >= 800 && num_slices <= 4000 && slice_size >= 800) ||
|
||||
(num_slices > 4000 && slice_size >= 400);
|
||||
#else
|
||||
// This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/71081
|
||||
return (num_slices <= 400 && slice_size >= 5000) ||
|
||||
(num_slices > 400 && num_slices < 4000 && slice_size >= 1000) ||
|
||||
(num_slices >= 4000 && slice_size >= 300);
|
||||
#endif
|
||||
}
|
||||
|
||||
void launch_gather_topk_kernel(
|
||||
|
@ -127,29 +127,6 @@ __global__ void upsample_bilinear2d_nhwc_out_frame(
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef USE_ROCM
|
||||
// Helper function to compute output pixel range that can contribute to input pixel
|
||||
template <typename accscalar_t>
|
||||
__device__ __forceinline__ void compute_output_range(
|
||||
int input_pos,
|
||||
accscalar_t scale,
|
||||
int output_size,
|
||||
bool align_corners,
|
||||
int& min_output,
|
||||
int& max_output) {
|
||||
accscalar_t lo, hi;
|
||||
if (align_corners) {
|
||||
lo = static_cast<accscalar_t>(input_pos - 1) / scale;
|
||||
hi = static_cast<accscalar_t>(input_pos + 1) / scale;
|
||||
} else {
|
||||
lo = (input_pos - static_cast<accscalar_t>(0.5)) / scale - static_cast<accscalar_t>(0.5);
|
||||
hi = (input_pos + static_cast<accscalar_t>(1.5)) / scale - static_cast<accscalar_t>(0.5);
|
||||
}
|
||||
min_output = max(0, static_cast<int>(ceil(lo)));
|
||||
max_output = min(output_size - 1, static_cast<int>(floor(hi)));
|
||||
}
|
||||
#endif
|
||||
|
||||
// Backward (adjoint) operation 1 <- 2 (accumulates)
|
||||
template <typename scalar_t, typename accscalar_t>
|
||||
C10_LAUNCH_BOUNDS_1(1024)
|
||||
@ -164,74 +141,8 @@ __global__ void upsample_bilinear2d_backward_out_frame(
|
||||
const bool align_corners,
|
||||
scalar_t* __restrict__ idata,
|
||||
const scalar_t* __restrict__ odata) {
|
||||
// In C++, integer multiplication, like in standard arithmetic, is generally commutative.
|
||||
const size_t i_numel = nc * width1 * height1;
|
||||
#ifdef USE_ROCM
|
||||
for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < i_numel;
|
||||
index += blockDim.x * gridDim.x) {
|
||||
// Decode input pixel coordinates
|
||||
size_t index_temp = index;
|
||||
const int w1 = index_temp % width1;
|
||||
index_temp /= width1;
|
||||
const int h1 = index_temp % height1;
|
||||
const size_t nc_idx = index_temp / height1;
|
||||
|
||||
accscalar_t grad_sum = 0;
|
||||
|
||||
// Find range of output pixels that could interpolate from this input pixel
|
||||
int h2_min, h2_max, w2_min, w2_max;
|
||||
compute_output_range<accscalar_t>(h1, rheight, height2, align_corners, h2_min, h2_max);
|
||||
compute_output_range<accscalar_t>(w1, rwidth, width2, align_corners, w2_min, w2_max);
|
||||
|
||||
// Iterate over potential output pixels
|
||||
for (int h2 = h2_min; h2 <= h2_max; h2++) {
|
||||
for (int w2 = w2_min; w2 <= w2_max; w2++) {
|
||||
// Compute source coordinates for this output pixel
|
||||
const accscalar_t h1r = area_pixel_compute_source_index<accscalar_t>(
|
||||
rheight, h2, align_corners, /*cubic=*/false);
|
||||
const int h1_base = (int)h1r;
|
||||
const int h1p = (h1_base < height1 - 1) ? 1 : 0;
|
||||
const accscalar_t h1lambda = h1r - h1_base;
|
||||
const accscalar_t h0lambda = static_cast<accscalar_t>(1) - h1lambda;
|
||||
|
||||
const accscalar_t w1r = area_pixel_compute_source_index<accscalar_t>(
|
||||
rwidth, w2, align_corners, /*cubic=*/false);
|
||||
const int w1_base = (int)w1r;
|
||||
const int w1p = (w1_base < width1 - 1) ? 1 : 0;
|
||||
const accscalar_t w1lambda = w1r - w1_base;
|
||||
const accscalar_t w0lambda = static_cast<accscalar_t>(1) - w1lambda;
|
||||
|
||||
// Check if our input pixel participates in this interpolation and accumulate all weights
|
||||
// At boundaries, h1p=0 or w1p=0 causes some sampling positions to collapse
|
||||
// to the same pixel, so we need to accumulate weights from all matching positions
|
||||
accscalar_t weight = 0;
|
||||
|
||||
// Check all four interpolation positions and accumulate weights
|
||||
if (h1 == h1_base && w1 == w1_base) {
|
||||
weight += h0lambda * w0lambda; // top-left
|
||||
}
|
||||
if (h1 == h1_base && w1 == w1_base + w1p) {
|
||||
weight += h0lambda * w1lambda; // top-right (may be same as top-left if w1p=0)
|
||||
}
|
||||
if (h1 == h1_base + h1p && w1 == w1_base) {
|
||||
weight += h1lambda * w0lambda; // bottom-left (may be same as top-left if h1p=0)
|
||||
}
|
||||
if (h1 == h1_base + h1p && w1 == w1_base + w1p) {
|
||||
weight += h1lambda * w1lambda; // bottom-right (may collapse to other positions)
|
||||
}
|
||||
|
||||
if (weight > 0) {
|
||||
const size_t output_idx = nc_idx * height2 * width2 + h2 * width2 + w2;
|
||||
grad_sum += weight * static_cast<accscalar_t>(odata[output_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write accumulated gradient (no atomics needed)
|
||||
idata[index] = static_cast<scalar_t>(grad_sum);
|
||||
}
|
||||
#else
|
||||
const size_t o_numel = nc * width2 * height2;
|
||||
const size_t i_numel = nc * width1 * height1;
|
||||
for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; index < o_numel;
|
||||
index += blockDim.x * gridDim.x) {
|
||||
size_t index_temp = index;
|
||||
@ -280,7 +191,6 @@ __global__ void upsample_bilinear2d_backward_out_frame(
|
||||
static_cast<scalar_t>(h1lambda * w1lambda * d2val),
|
||||
true);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename accscalar_t>
|
||||
@ -477,6 +387,7 @@ static void upsample_bilinear2d_backward_out_cuda_template(
|
||||
// threads are not covering the whole input tensor.
|
||||
grad_input.zero_();
|
||||
|
||||
const size_t num_kernels = nbatch * channels * output_height * output_width;
|
||||
const int num_threads = std::min(
|
||||
at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
@ -486,12 +397,6 @@ static void upsample_bilinear2d_backward_out_cuda_template(
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef USE_ROCM
|
||||
constexpr bool use_input = true;
|
||||
#else
|
||||
constexpr bool use_input = false;
|
||||
#endif
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(
|
||||
at::ScalarType::Half, at::ScalarType::BFloat16,
|
||||
grad_output_.scalar_type(), "upsample_bilinear2d_backward_out_frame", [&] {
|
||||
@ -509,8 +414,6 @@ static void upsample_bilinear2d_backward_out_cuda_template(
|
||||
const accscalar_t rwidth = area_pixel_compute_scale<accscalar_t>(
|
||||
input_width, output_width, align_corners, scales_w);
|
||||
|
||||
const size_t num_kernels = nbatch * channels * output_height * output_width;
|
||||
|
||||
upsample_bilinear2d_backward_nhwc_out_frame<scalar_t, accscalar_t>
|
||||
<<<ceil_div(num_kernels, static_cast<size_t>(num_threads)), num_threads, 0, stream>>>(
|
||||
input_height,
|
||||
@ -541,8 +444,6 @@ static void upsample_bilinear2d_backward_out_cuda_template(
|
||||
const accscalar_t rwidth = area_pixel_compute_scale<accscalar_t>(
|
||||
input_width, output_width, align_corners, scales_w);
|
||||
|
||||
const size_t num_kernels = nbatch * channels * (use_input ? input_height * input_width : output_height * output_width);
|
||||
|
||||
upsample_bilinear2d_backward_out_frame<scalar_t, accscalar_t>
|
||||
<<<ceil_div(num_kernels, static_cast<size_t>(num_threads)),
|
||||
num_threads,
|
||||
|
@ -460,7 +460,7 @@ __global__ void GammaBetaBackwardCUDAKernel2(
|
||||
}
|
||||
}
|
||||
|
||||
// Do warp reduce for the 2st 16 cols in the tile.
|
||||
// Do warp reduce for the 2nd 16 cols in the tile.
|
||||
sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y];
|
||||
sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y];
|
||||
sum1 = cuda_utils::WarpReduceSum<T_ACC>(sum1);
|
||||
|
@ -1532,7 +1532,7 @@ NvrtcFunction jit_pwise_function(
|
||||
|
||||
std::string file_path;
|
||||
if (cache_dir.has_value()) {
|
||||
// Attemps to read from the cache.
|
||||
// Attempts to read from the cache.
|
||||
// Cubin name is <kernel name>_arch<major>.<minor>_nvrtc<major>.<minor>_<ptx or sass>_<program length>_<string hash>
|
||||
// Note that the SHA1 hash used in the file name is NOT the SHA1 hash of the file's contents,
|
||||
// because we hash on the CUDA code, but we save the compiled ptx or sass
|
||||
@ -1556,19 +1556,19 @@ NvrtcFunction jit_pwise_function(
|
||||
ss << "_" << hash_code;
|
||||
file_path = ss.str();
|
||||
|
||||
std::ifstream readin{file_path, std::ios::in | std::ifstream::binary};
|
||||
if (readin.fail()) {
|
||||
std::ifstream read_stream{file_path, std::ios::in | std::ifstream::binary};
|
||||
if (read_stream.fail()) {
|
||||
// NOTE: this does not warn because the file might not exist
|
||||
// TODO: consider if this should explicitly check for the file's existence or not to throw
|
||||
// an informative warning
|
||||
readin.close();
|
||||
read_stream.close();
|
||||
} else {
|
||||
// TODO: try passing the "mapped" file directly to cuModuleLoadCall instead of using an intermediate buffer
|
||||
std::vector<char> buffer(std::istreambuf_iterator<char>(readin), {});
|
||||
std::vector<char> buffer(std::istreambuf_iterator<char>(read_stream), {});
|
||||
AT_CUDA_DRIVER_CHECK(nvrtc.cuModuleLoadData(&(compiled_kernel_.module), buffer.data()));
|
||||
AT_CUDA_DRIVER_CHECK(
|
||||
nvrtc.cuModuleGetFunction(&(compiled_kernel_.function), compiled_kernel_.module, name.c_str()));
|
||||
readin.close();
|
||||
read_stream.close();
|
||||
return compiled_kernel_;
|
||||
}
|
||||
}
|
||||
|
@ -1050,7 +1050,7 @@ void launch_vectorized_layer_norm_kernel(
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
|
||||
#ifdef USE_ROCM
|
||||
// the blocks.x contains the max grid x dimention without invalid configuration error
|
||||
// the blocks.x contains the max grid x dimension without invalid configuration error
|
||||
// Fix invalid configuration https://github.com/pytorch/pytorch/issues/136291
|
||||
// Ensure all elements are processed. Prepare for next round
|
||||
int64_t remaining = M - blocks.x;
|
||||
|
@ -1346,7 +1346,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
|
||||
});
|
||||
|
||||
if (input.dim() > 2) {
|
||||
// if upper=true we need to tranpose and conjugate the result tensor
|
||||
// if upper=true we need to transpose and conjugate the result tensor
|
||||
// because the cholesky decomposition is stored in the lower triangular part
|
||||
if (upper) {
|
||||
input.copy_(result.mH());
|
||||
@ -1857,7 +1857,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
|
||||
|
||||
auto preferred_backend = at::globalContext().linalgPreferredBackend();
|
||||
switch (preferred_backend) {
|
||||
// TODO Investigate whether the following magma bug is still occuring.
|
||||
// TODO Investigate whether the following magma bug is still occurring.
|
||||
// It may be the case that geqrf followed by orgqr is wrong for the magma backend
|
||||
// geqrf_magma currently uses geqrf2_gpu
|
||||
//
|
||||
|
@ -82,7 +82,7 @@ void lu_factor_looped_cusolver(const Tensor& self, const Tensor& pivots, const T
|
||||
#if defined(BUILD_LAZY_CUDA_LINALG)
|
||||
namespace cuda { namespace detail {
|
||||
// This is only used for an old-style dispatches
|
||||
// Please do not add any new entires to it
|
||||
// Please do not add any new entries to it
|
||||
struct LinalgDispatch {
|
||||
Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper);
|
||||
};
|
||||
|
@ -177,7 +177,7 @@ bool use_ragged_in_dense(
|
||||
TORCH_WARN_ONCE(
|
||||
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works with Q, K, V, and output in BSHD memory layout,"
|
||||
"e.g., Q, K, V must be allocated with torch.randn((B, S, H, D).transpose(1, 2)."
|
||||
"Falling back to regualr dense case, which may trigger excessive recompilation.");
|
||||
"Falling back to regular dense case, which may trigger excessive recompilation.");
|
||||
}
|
||||
return all_bshd;
|
||||
}
|
||||
@ -487,9 +487,7 @@ std::unique_ptr<fe::graph::Graph> build_graph(
|
||||
auto scaled_dot_product_flash_attention_options =
|
||||
fe::graph::SDPA_attributes()
|
||||
.set_name("CUDNN_SDPA")
|
||||
.set_is_inference(return_softmaxstats == false)
|
||||
// TODO(eqy): switch to this API once cuDNN FE is upgraded
|
||||
// .set_generate_stats(return_softmaxstats)
|
||||
.set_generate_stats(return_softmaxstats)
|
||||
.set_causal_mask(is_causal)
|
||||
.set_attn_scale(attn_scale);
|
||||
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
|
||||
@ -707,9 +705,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_nestedtensor(
|
||||
auto scaled_dot_product_flash_attention_options =
|
||||
fe::graph::SDPA_attributes()
|
||||
.set_name("CUDNN_SDPA_NESTEDTENSOR")
|
||||
.set_is_inference(return_softmaxstats == false)
|
||||
// TODO(eqy): switch to this API once cuDNN FE is upgraded
|
||||
// .set_generate_stats(return_softmaxstats)
|
||||
.set_generate_stats(return_softmaxstats)
|
||||
.set_causal_mask(is_causal)
|
||||
.set_attn_scale(attn_scale)
|
||||
.set_seq_len_q(SEQ_LEN_Q_)
|
||||
@ -775,7 +771,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_nestedtensor(
|
||||
if (attn_bias.has_value()) {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
|
||||
"attn_bias not yet supported with cuDNN Attention and NestedTensor");
|
||||
scaled_dot_product_flash_attention_options.set_bias(
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)
@@ -1200,7 +1196,7 @@ std::unique_ptr<fe::graph::Graph> build_graph_backward_nestedtensor(
if (attn_bias.has_value()) {
TORCH_CHECK(
false,
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
"attn_bias not yet supported with cuDNN Attention and NestedTensor");
sdpa_backward_options.set_bias(
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(BIAS)
@@ -1868,7 +1864,7 @@ void run_cudnn_SDP_bprop_nestedtensor(
}
TORCH_CHECK(
!attn_bias.has_value(),
"attn_bias not yet supportd with cuDNN Attention and NestedTensor");
"attn_bias not yet supported with cuDNN Attention and NestedTensor");

auto workspace_size = mha_graph.get_workspace_size();
auto workspace_ptr =

@@ -30,7 +30,7 @@ static const std::unordered_map<
};

// This is the heursitic to choose a kernel based on inputs
// This is the heuristic to choose a kernel based on inputs
BGEMMKernel_BFloat16 dispatch_bfloat16_bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
// Optional/future use: directly lookup shape tuples to map to instances
/*

@@ -11,7 +11,7 @@ using S = ck::Sequence<Is...>;
namespace at::native {

void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@@ -471,7 +471,7 @@ void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}

void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

@@ -11,7 +11,7 @@ using S = ck::Sequence<Is...>;
namespace at::native {

void dispatch_float_gemm(CUDABLAS_GEMM_ARGTYPES(float)) {
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

@@ -13,7 +13,7 @@ namespace at::native {

void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
#if 0
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.
@@ -299,7 +299,7 @@ void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
#endif
}
void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
// If any of the shapes cant be tiled, we must use padding.
// If any of the shapes can't be tiled, we must use padding.
bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
// Dispatch to best implementation.
// TODO add more configurations. Optimize.

@@ -545,7 +545,7 @@ kernel void reshape(texture2d_array<half, access::read> in_arr[[texture(0), func
const ushort slices2 = divRoundUp(C2, 4);
const ushort slices1 = divRoundUp(C1, 4);
const ushort n2 = gid.z / slices2; //image index
const ushort s2 = gid.z - n2 * slices2; // slice offest
const ushort s2 = gid.z - n2 * slices2; // slice offset
half4 value;
for (int idx = 0; idx < 4; ++idx){
// we compute the "linear index" of the output element,

@@ -86,4 +86,4 @@ TORCH_LIBRARY_IMPL(aten, Metal, m) {
m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid_"), TORCH_FN(hardsigmoid_));
}

} // namepsace at::native::metal
} // namespace at::native::metal
@@ -147,7 +147,7 @@ static void check_shape_forward(const Tensor& input,
// blocked format will propagate between layers. Input, output will be in blocked format.
//
// For inference case, weight can be prepacked into blocked format by
// (so as to save weight reoder overhead):
// (so as to save weight reorder overhead):
// model = torch.utils.mkldnn.to_mkldnn(model)
//
// For training case, grad_output can be CPU tensor or MKLDNN tensor,

@@ -540,7 +540,7 @@ static void _mkldnn_matmul_i8i8i32_with_primitive(
args.insert({DNNL_ARG_WEIGHTS, expected_weight});
args.insert({DNNL_ARG_DST, dst});
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
// Create primitve and execute
// Create primitive and execute
auto primitive = dnnl::matmul(prim_desc);
primitive.execute(ideep::stream::default_stream(), args);
}

@@ -215,7 +215,7 @@ partition create_sdpa_graph_partition(
// For optional additive mask
std::optional<op> mask_add;

// For optional implicite causal mask
// For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;
@@ -556,7 +556,7 @@ partition create_sdpa_backward_graph_partition(
// For optional additive mask
std::optional<op> mask_add;

// For optional implicite causal mask
// For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;

@@ -34,7 +34,7 @@ namespace at::native::onednn {

/*
oneDNN postops usage:
Currently, oneDNN supports 5 kinds of post ops. More details can be refered
Currently, oneDNN supports 5 kinds of post ops. More details can be referred
to oneDNN doc.
https://oneapi-src.github.io/oneDNN/dev_guide_attributes_post_ops.html#doxid-dev-guide-attributes-post-ops-1dev-guide-attributes-post-ops-eltwise

@@ -345,7 +345,7 @@ class Attr {
dnnl::memory binary_m;
auto binary = ops_params_[i].binary_;
auto md = ops_params_[i].meta_;
// qeury expected_md to achieve peak performance
// query expected_md to achieve peak performance
auto expected_md = pd.query_md(
dnnl::query::exec_arg_md,
DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1);
@@ -399,7 +399,7 @@ static inline void construct_attr_for_unary(
} else {
TORCH_CHECK(
unary_post_op == "none",
"onednn qlinear: unspported unary post op",
"onednn qlinear: unsupported unary post op",
unary_post_op);
}
}

@@ -301,7 +301,7 @@ bool is_onednn_matmul_strides(const at::Tensor& tensor) {
return false;
}

// the overlaped cases are not supported
// the overlapped cases are not supported
dnnl::memory::dims strides = get_onednn_strides(tensor);
int64_t storage_size = 1;
for (size_t dim = 0; dim < tensor_dim; ++dim)
@@ -29,7 +29,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
// agruments results in
// arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {
@@ -42,7 +42,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
// agruments results in
// arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {
@@ -539,7 +539,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,

static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
// Use gather kernel to solve strides for macOS < 15.0
// Starting with macOS 15.0, MPS supports native strides direclty in the kernels
// Starting with macOS 15.0, MPS supports native strides directly in the kernels
if (!is_macOS_15_0_or_newer || !useMPSStridedAPI) {
if ((!src.is_contiguous() || src.storage_offset()) && gatherTensorData) {
Tensor emptyShell = Tensor();
@@ -856,7 +856,7 @@ id<MTLLibrary> MetalShaderLibrary::getLibrary(const std::initializer_list<std::s
break;
}
default:
TORCH_INTERNAL_ASSERT(false, "Unsupported number of paramaters ", nparams);
TORCH_INTERNAL_ASSERT(false, "Unsupported number of parameters ", nparams);
}
return libMap[key] = lib;
}
@@ -1184,9 +1184,9 @@ void MetalKernelFunction::dispatch(uint64_t length, std::optional<uint64_t> grou
}

void MetalKernelFunction::dispatch(c10::ArrayRef<uint64_t> length, c10::OptionalArrayRef<uint64_t> group_size) {
TORCH_CHECK(!length.empty() && length.size() < 4, "Dispatch dimentions must be less than 3 and non-empty");
TORCH_CHECK(!length.empty() && length.size() < 4, "Dispatch dimensions must be less than 3 and non-empty");
TORCH_CHECK(!group_size.has_value() || group_size->size() == length.size(),
"size and group_size must have same number of dimentions");
"size and group_size must have same number of dimensions");
const auto max_tg_size = getMaxThreadsPerThreadgroup();
const auto group_size_length = group_size.has_value() ? group_size->size() : 0;
auto tg_size = MTLSizeMake(group_size_length > 0 ? group_size->at(0) : max_tg_size,

@@ -59,7 +59,7 @@ static GridSamplerOffsets find_grid_sampler_offsets(
return offsets;
}

// Mod function which gives postive output when `a` is negative
// Mod function which gives positive output when `a` is negative
static int32_t mod(int32_t a, int32_t b) {
auto r = a % b;
return r + (r < 0 ? b : 0);
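As a quick illustration of why the positive-`mod` helper in the hunk above is needed (a standalone sketch, not part of this patch): C-style `%` keeps the sign of the dividend, so wrapping negative grid coordinates requires the extra correction.

#include <cassert>
#include <cstdint>

// Same logic as the `mod` helper shown above: shift a negative remainder
// back into [0, b) so coordinate wrap-around is always non-negative.
static int32_t mod(int32_t a, int32_t b) {
  auto r = a % b;
  return r + (r < 0 ? b : 0);
}

int main() {
  assert(-3 % 5 == -3);    // built-in % keeps the dividend's sign
  assert(mod(-3, 5) == 2); // wrapped result lies in [0, 5)
  assert(mod(7, 5) == 2);  // unchanged for non-negative inputs
  return 0;
}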
@@ -191,9 +191,9 @@ void grid_sampler_single_element(
int32_t right_indices[3];
opmath_t<T> scales[3];

// For each dimension, find the pair of indices in the cooresponding dimension
// For each dimension, find the pair of indices in the corresponding dimension
// of `input` which surround the grid coordinate in that dimension. We'll do
// this by mapping different coordiante spaces onto each other. There are
// this by mapping different coordinate spaces onto each other. There are
// basically three different coordinate spaces to keep in mind:
//
// * aligned grid space

@@ -137,7 +137,7 @@ kernel void index_put_serial(
constant int64_t* index_strides,
constant uint4& ndim_nindices_numel,
uint thread_index [[thread_position_in_grid]]) {
(void)thread_index; // Suppress unused vairable varning
(void)thread_index; // Suppress unused variable warning
for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) {
index_put_impl(
output,

@@ -112,7 +112,7 @@ kernel void int4pack_mm(constant T *A [[buffer(0)]],
constant uchar *B_ptr = B + ((n * K) / k_pack_factor);

thread float4 result = float4(0.0);
// We multipy group of 4 channels with these scales.
// We multiply group of 4 channels with these scales.
// Because corresponding values from weight matrix are effectively left
// shifted. This is to avoid doing right shift on those values which ends up
// affecting performance. This is the trick applied in MLX kernels.

@@ -372,7 +372,7 @@ struct log1p_functor {
}
template <typename T>
inline enable_if_t<is_complex_v<T>, T> operator()(const T x) {
// TODO: Implement proper log1p algoirthm
// TODO: Implement proper log1p algorithm
auto magnitude = ::precise::sqrt((1.0f + x.x) * (1.0f + x.x) + x.y * x.y);
auto real = ::precise::log(magnitude);
auto imag = (x.x == -1 && x.y == 0) ? 0 : ::precise::atan2(x.y, 1.0 + x.x);

@@ -448,7 +448,7 @@ kernel void upsample_trilinear_backward(

// See Note [ Weights computation for uint8_t and multiplication trick ]
// Essentially fall back to fixed floating point arithmetic during uint8
// interpolation, which is not necesserily more accurate (see example below),
// interpolation, which is not necessarily more accurate (see example below),
// but matches closes to what CPU can deliver
// I.e. mid-point 152+249+172+35 is 152, but algorithm yields 153 as horizontal
// and vertical interpolation is done in separate steps and results are rounded
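A small numeric sketch (illustrative only, not the kernel's actual fixed-point code) of the stepwise-rounding effect the comment above describes, using its 152/249/172/35 example and round-half-up at every step:

#include <cstdio>

// Round-half-up average of two non-negative ints, mimicking uint8
// interpolation that rounds after each axis instead of once at the end.
static int avg_round(int a, int b) { return (a + b + 1) / 2; }

int main() {
  int exact = (152 + 249 + 172 + 35) / 4;  // single-step mid-point: 152
  int h0 = avg_round(152, 249);            // horizontal pass: 201
  int h1 = avg_round(172, 35);             // horizontal pass: 104
  int two_step = avg_round(h0, h1);        // vertical pass:   153
  std::printf("exact=%d two_step=%d\n", exact, two_step);
  return 0;
}

Rounding per axis lands one off from the exact mid-point, which is why the note warns that the fixed-point path is not necessarily more accurate, only CPU-matching.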
@@ -196,6 +196,28 @@ bool use_metal_mm(const Tensor& self, const Tensor& other, const Tensor& output)
other.size(0) > max_stride_size || other.size(1) > max_stride_size);
}

void map_mps_decomposition_error_code_to_blas(const Tensor& status) {
const auto& status_flat = status.view(-1);

for (const auto i : c10::irange(status_flat.size(0))) {
int code = status_flat[i].item<int>();
switch (code) {
case MPSMatrixDecompositionStatusSuccess:
status_flat[i] = 0;
break;
case MPSMatrixDecompositionStatusNonPositiveDefinite:
case MPSMatrixDecompositionStatusSingular:
status_flat[i] = 2;
break;
case MPSMatrixDecompositionStatusFailure:
status_flat[i] = -1;
break;
default:
TORCH_INTERNAL_ASSERT(false, "Unknown MPSMatrixDecompositionStatus enum value: ", code);
}
}
}

} // anonymous namespace

static void linalg_lu_factor_ex_out_mps_impl(const Tensor& A,
@@ -487,6 +509,9 @@ static void linalg_solve_out_mps_impl(const Tensor& A,
"mpsmatrixdecompositionstatus for details.");
}
}

map_mps_decomposition_error_code_to_blas(info);

if (!left) {
// If this was a right solve, transpose the result back
result.copy_(result_t.transpose(-2, -1).contiguous());

@@ -1282,7 +1282,7 @@ static void all_any_common_impl_mps(const Tensor& input_t,
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);

auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
// reductionOrWithTensor:axis: will throw an internal assert if number of dimentions is more than 4
// reductionOrWithTensor:axis: will throw an internal assert if number of dimensions is more than 4
// See https://github.com/pytorch/pytorch/issues/95538
MPSGraphTensor* outputTensor = nil;
if (input_t.ndimension() > 4) {
@@ -1352,7 +1352,7 @@ TORCH_IMPL_FUNC(any_all_out_mps)(const Tensor& input_t, const Tensor& output_t)
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
// reductionOrWithTensor:axes: will throw an internal assert if number of dimentions is more than 4
// reductionOrWithTensor:axes: will throw an internal assert if number of dimensions is more than 4
// See https://github.com/pytorch/pytorch/issues/95538
if (input_t.dim() > 4) {
castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil];
@@ -1400,7 +1400,7 @@ TORCH_IMPL_FUNC(all_all_out_mps)(const Tensor& input_t, const Tensor& output_t)
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
// reductionAndWithTensor:axes: will throw an internal assert if number of dimentions is more than 4
// reductionAndWithTensor:axes: will throw an internal assert if number of dimensions is more than 4
// See https://github.com/pytorch/pytorch/issues/95538
if (input_t.ndimension() > 4) {
castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil];
@@ -1370,6 +1370,7 @@
dispatch:
SparseCPU: bmm_sparse_cpu
SparseCUDA: bmm_sparse_cuda
SparseMPS: bmm_sparse_mps
NestedTensorCPU: bmm_nested
NestedTensorCUDA: bmm_nested_cuda
tags: core
@@ -1385,6 +1386,7 @@
MTIA: bmm_out_mtia
SparseCPU: bmm_out_sparse_cpu
SparseCUDA: bmm_out_sparse_cuda
SparseMPS: bmm_out_sparse_mps
SparseCsrCUDA: bmm_out_sparse_csr_cuda

- func: bmm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
@@ -4173,7 +4175,7 @@
structured_delegate: mm.out
variants: function, method
dispatch:
SparseCPU, SparseCUDA: _sparse_mm
SparseCPU, SparseCUDA, SparseMPS: _sparse_mm
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm
tags: core

@@ -7112,6 +7114,7 @@
MTIA: addmm_out_mtia
SparseCPU: addmm_out_sparse_dense_cpu
SparseCUDA: addmm_out_sparse_dense_cuda
SparseMPS: addmm_out_sparse_dense_mps
SparseCsrCPU: addmm_out_sparse_compressed_cpu
SparseCsrCUDA: addmm_out_sparse_compressed_cuda

@@ -7121,6 +7124,7 @@
dispatch:
SparseCPU: addmm_sparse_dense_cpu
SparseCUDA: addmm_sparse_dense_cuda
SparseMPS: addmm_sparse_dense_mps
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: addmm_sparse_compressed_dense
tags: core

@@ -41,7 +41,7 @@ Tensor pad_tensor_to_shape(
const Tensor& t,
IntArrayRef goal_shape,
double value = 0) {
std::vector<int64_t> padd;
std::vector<int64_t> padding;
auto tup = t.sizes();
TORCH_CHECK(
t.dim() == (int64_t)(goal_shape.size()),
@@ -51,10 +51,10 @@ Tensor pad_tensor_to_shape(
goal_shape.size(),
" of goal shape.");
for (int64_t i = static_cast<int64_t>(tup.size()) - 1; i >= 0; i--) {
padd.push_back(0);
padd.push_back(goal_shape[i] - tup[i]);
padding.push_back(0);
padding.push_back(goal_shape[i] - tup[i]);
}
Tensor new_tensor = at::constant_pad_nd(t, IntArrayRef(padd), value);
Tensor new_tensor = at::constant_pad_nd(t, IntArrayRef(padding), value);
new_tensor = new_tensor.reshape(goal_shape);
return new_tensor;
}

@@ -53,7 +53,7 @@ C10_ALWAYS_INLINE std::pair<int64_t, int64_t> _check_nested_layer_norm_inputs(
normalized_shape);

// Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input
// Also, compute M and N considering the idiosyncracies of NestedTensors
// Also, compute M and N considering the idiosyncrasies of NestedTensors
int64_t N = 1;
for (const auto i: c10::irange(normalized_ndim)) {
TORCH_CHECK(

@@ -95,7 +95,7 @@ std::vector<Tensor> chunk_nested_tensor(const Tensor& self, int64_t chunks, int6
for (const auto split_idx : c10::irange(chunks)) {
auto new_sizes = sizes.clone();
auto new_strides = strides.clone();
// This copys offsets so we are safe to move
// This copies offsets so we are safe to move
auto new_offsets = offsets.clone();
int64_t *size_ptr = new_sizes.data_ptr<int64_t>();
int64_t *new_offsets_ptr = new_offsets.data_ptr<int64_t>();

@@ -1,5 +1,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/SparseTensorUtils.h>
#include <ATen/ExpandUtils.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/sparse/SparseStubs.h>
#include <ATen/native/sparse/SparseBinaryOpIntersectionCommon.h>
@@ -18,6 +19,8 @@
#include <ATen/ops/ones_like.h>
#include <ATen/ops/argsort.h>
#include <ATen/ops/result_type.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/addmm_native.h>
#include <ATen/ops/copy_sparse_to_sparse.h>
#include <ATen/ops/mul.h>
#endif
||||
@ -33,6 +36,305 @@ static auto& lib = MetalShaderLibrary::getBundledLibrary();
|
||||
#include <ATen/native/mps/Mul_metallib.h>
|
||||
#endif
|
||||
|
||||
static Tensor& s_addmm_out_sparse_dense_mps(
|
||||
Tensor& r,
|
||||
const Tensor& t,
|
||||
const SparseTensor& sparse_,
|
||||
const Tensor& dense,
|
||||
const Scalar& beta,
|
||||
const Scalar& alpha) {
|
||||
TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: sparse_dim must be 2, got ", sparse_.sparse_dim());
|
||||
TORCH_CHECK(sparse_.dense_dim() == 0, "addmm: sparse values must be 0-dense-dim, got ", sparse_.dense_dim());
|
||||
TORCH_CHECK(dense.dim() == 2, "addmm: 'dense' must be 2D, got ", dense.dim());
|
||||
TORCH_CHECK(t.dim() == 2, "addmm: 't' must be 2D, got ", t.dim());
|
||||
|
||||
const int64_t I = sparse_.size(0);
|
||||
const int64_t J = sparse_.size(1);
|
||||
const int64_t K = dense.size(1);
|
||||
|
||||
TORCH_CHECK(dense.size(0) == J,
|
||||
"addmm: dense (mat2) dim0 must be ", J, ", got ", dense.size(0));
|
||||
TORCH_CHECK(t.size(0) == I && t.size(1) == K,
|
||||
"addmm: 't' shape must be (", I, ", ", K, "), got (", t.size(0), ", ", t.size(1), ")");
|
||||
|
||||
r.resize_({I, K});
|
||||
|
||||
auto sparse = sparse_.coalesce();
|
||||
const int64_t nnz = sparse._nnz();
|
||||
|
||||
if (nnz == 0 || I == 0 || K == 0) {
|
||||
at::mul_out(r, t, beta);
|
||||
return r;
|
||||
}
|
||||
|
||||
const auto v_dtype = sparse._values().scalar_type();
|
||||
const auto d_dtype = dense.scalar_type();
|
||||
const auto t_dtype = t.scalar_type();
|
||||
auto compute_dtype = c10::promoteTypes(c10::promoteTypes(v_dtype, d_dtype), t_dtype);
|
||||
|
||||
TORCH_CHECK(canCast(compute_dtype, r.scalar_type()),
|
||||
"Can't convert computed type ", compute_dtype, " to output ", r.scalar_type());
|
||||
|
||||
auto indices2d = sparse._indices().contiguous();
|
||||
auto values = sparse._values().to(compute_dtype);
|
||||
auto dense_c = dense.to(compute_dtype).contiguous();
|
||||
auto t_c = t.to(compute_dtype).contiguous();
|
||||
|
||||
const bool out_needs_cast = (r.scalar_type() != compute_dtype) || !r.is_contiguous();
|
||||
Tensor out_buf = out_needs_cast
|
||||
? at::empty({I, K}, r.options().dtype(compute_dtype))
|
||||
: r;
|
||||
auto out_contig = out_buf.contiguous();
|
||||
|
||||
auto device = r.device();
|
||||
auto stream = getCurrentMPSStream();
|
||||
|
||||
const float alpha_f = alpha.to<float>();
|
||||
const float beta_f = beta.to<float>();
|
||||
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
const std::string func = "spmm_addmm_coo_" + mps::scalarToMetalTypeString(values);
|
||||
auto pso = lib.getPipelineStateForFunc(func);
|
||||
auto enc = stream->commandEncoder();
|
||||
[enc setComputePipelineState:pso];
|
||||
|
||||
const uint32_t tew = pso.threadExecutionWidth;
|
||||
const uint32_t gridX = static_cast<uint32_t>(K);
|
||||
const uint32_t gridZ = static_cast<uint32_t>(I);
|
||||
const uint32_t tgW = std::min<uint32_t>(gridX, tew);
|
||||
|
||||
MTLSize grid = MTLSizeMake(gridX, 1, gridZ);
|
||||
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
|
||||
|
||||
mtl_setArgs(enc,
|
||||
indices2d,
|
||||
values,
|
||||
dense_c,
|
||||
t_c,
|
||||
out_contig,
|
||||
std::array<uint32_t, 3>{static_cast<uint32_t>(I),
|
||||
static_cast<uint32_t>(J),
|
||||
static_cast<uint32_t>(K)},
|
||||
std::array<float, 2>{alpha_f, beta_f},
|
||||
static_cast<uint32_t>(nnz));
|
||||
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
|
||||
}
|
||||
});
|
||||
|
||||
if (out_needs_cast) {
|
||||
r.copy_(out_contig.to(r.scalar_type()));
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
static void build_batch_ptr_mps(
|
||||
const Tensor& indices_dim0,
|
||||
int64_t B,
|
||||
Tensor& batch_ptr
|
||||
) {
|
||||
// Builds an array of pointers which point to each batches elements. Example:
|
||||
// idx_b = [0, 0, 0, 1, 1, 2, 2, 2, 2] // 9 non-zero elements
|
||||
// └─────┘ └──┘ └─────────┘
|
||||
// batch 0 batch 1 batch 2
|
||||
// batch_ptr = [0, 3, 5, 9]
|
||||
// │ │ │ └─ end of batch 2 (total nnz)
|
||||
// │ │ └──── batch 2 starts at index 5
|
||||
// │ └─────── batch 1 starts at index 3
|
||||
// └────────── batch 0 starts at index 0
|
||||
TORCH_CHECK(indices_dim0.is_mps() && batch_ptr.is_mps(), "MPS device expected");
|
||||
auto device = indices_dim0.device();
|
||||
auto stream = getCurrentMPSStream();
|
||||
|
||||
const int64_t nnz = indices_dim0.numel();
|
||||
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
auto pso = lib.getPipelineStateForFunc("build_batch_ptr_from_sorted_batches");
|
||||
auto enc = stream->commandEncoder();
|
||||
[enc setComputePipelineState:pso];
|
||||
|
||||
const uint32_t tew = pso.threadExecutionWidth;
|
||||
const uint32_t Q = static_cast<uint32_t>(B + 1);
|
||||
const uint32_t tgW = std::min<uint32_t>(Q, tew);
|
||||
MTLSize grid = MTLSizeMake(Q, 1, 1);
|
||||
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
|
||||
|
||||
mtl_setArgs(enc,
|
||||
indices_dim0,
|
||||
batch_ptr,
|
||||
std::array<uint32_t, 2>{static_cast<uint32_t>(nnz),
|
||||
static_cast<uint32_t>(B)});
|
||||
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
static void build_row_ptr_per_batch_mps(
|
||||
const Tensor& rows,
|
||||
const Tensor& batch_ptr,
|
||||
int64_t B,
|
||||
int64_t I,
|
||||
Tensor& row_ptr
|
||||
) {
|
||||
// Build per-batch CSR-style row pointer arrays from row indices sorted by batch
|
||||
// Given:
|
||||
// rows: 1-D array of length nnz with row ids in [0, I), sorted within each batch
|
||||
// batch_ptr: length B+1, where [batch_ptr[b], batch_ptr[b+1]) is the subrange for batch b
|
||||
// Produces:
|
||||
// - row_ptr: shape [B, I+1]
|
||||
//
|
||||
// Example (B = 2, I = 4):
|
||||
// rows = [0, 0, 1, 3, 0, 2, 2] // 7 non-zero elements
|
||||
// └─── batch 0 ──┘ └─ batch 1 ─┘
|
||||
// batch_ptr = [0, 4, 7]
|
||||
// │ │ └─ end of batch 1 (total nnz)
|
||||
// │ └──── end of batch 0/start of batch 1
|
||||
// └─────── start of batch 0
|
||||
//
|
||||
// per-batch row pointers (I+1 entries each):
|
||||
// row_ptr[0] = [0, 2, 3, 3, 4]
|
||||
// row_ptr[1] = [0, 1, 1, 3, 3]
|
||||
// laid out in memory: [0, 2, 3, 3, 4, 0, 1, 1, 3, 3]
|
||||
TORCH_CHECK(rows.is_mps() && batch_ptr.is_mps() && row_ptr.is_mps(), "MPS device expected");
|
||||
auto stream = getCurrentMPSStream();
|
||||
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
auto pso = lib.getPipelineStateForFunc("build_row_ptr_from_sorted_rows_by_batch");
|
||||
auto enc = stream->commandEncoder();
|
||||
[enc setComputePipelineState:pso];
|
||||
|
||||
const uint32_t tew = pso.threadExecutionWidth;
|
||||
const uint32_t Qx = static_cast<uint32_t>(I + 1);
|
||||
const uint32_t Qy = static_cast<uint32_t>(B);
|
||||
const uint32_t tgW = std::min<uint32_t>(Qx, tew);
|
||||
|
||||
MTLSize grid = MTLSizeMake(Qx, Qy, 1);
|
||||
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
|
||||
|
||||
mtl_setArgs(enc,
|
||||
rows,
|
||||
batch_ptr,
|
||||
row_ptr,
|
||||
std::array<uint32_t, 2>{static_cast<uint32_t>(I),
|
||||
static_cast<uint32_t>(B)});
|
||||
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
|
||||
}
|
||||
});
|
||||
}
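The two helpers above build CSR-style boundary arrays with per-element binary searches over the coalesced (sorted) COO indices. A host-side C++ sketch of the same idea for batch_ptr (illustrative only, not part of this patch; row_ptr applies the identical search to the row ids inside each batch's sub-range):

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative CPU equivalent of build_batch_ptr_mps: for sorted batch ids,
// batch_ptr[b] is the first position whose batch id is >= b, and
// batch_ptr[B] is the total number of non-zeros.
std::vector<int64_t> build_batch_ptr_cpu(const std::vector<int64_t>& idx_b, int64_t B) {
  std::vector<int64_t> batch_ptr(B + 1);
  for (int64_t b = 0; b <= B; ++b) {
    batch_ptr[b] = std::lower_bound(idx_b.begin(), idx_b.end(), b) - idx_b.begin();
  }
  return batch_ptr;  // idx_b = {0,0,0,1,1,2,2,2,2}, B = 3  ->  {0, 3, 5, 9}
}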
|
||||
|
||||
Tensor& bmm_out_sparse_mps(const SparseTensor& self_, const Tensor& mat2_, Tensor& result_) {
|
||||
TORCH_CHECK(result_.is_mps(), "bmm_sparse: expected 'out' to be MPS, got ", result_.device());
|
||||
TORCH_CHECK(self_.is_mps(), "bmm_sparse: expected 'self' to be MPS, got ", self_.device());
|
||||
TORCH_CHECK(mat2_.is_mps(), "bmm_sparse: expected 'mat2' to be MPS, got ", mat2_.device());
|
||||
|
||||
TORCH_CHECK(self_.dense_dim() == 0, "bmm_sparse: Tensor 'self' must have 0 dense dims, but has ", self_.dense_dim());
|
||||
TORCH_CHECK(self_.sparse_dim() == 3, "bmm_sparse: Tensor 'self' must have 3 sparse dims, but has ", self_.sparse_dim());
|
||||
TORCH_CHECK(mat2_.dim() == 3, "bmm_sparse: Tensor 'mat2' must have 3 dims, but has ", mat2_.dim());
|
||||
|
||||
TORCH_CHECK(self_.size(0) == mat2_.size(0), "bmm_sparse: 'self.size(0)' and 'mat2.size(0)' must match");
|
||||
TORCH_CHECK(self_.size(2) == mat2_.size(1), "bmm_sparse: 'self.size(2)' and 'mat2.size(1)' must match");
|
||||
|
||||
const int64_t B = self_.size(0);
|
||||
const int64_t I = self_.size(1);
|
||||
const int64_t J = self_.size(2);
|
||||
const int64_t K = mat2_.size(2);
|
||||
|
||||
auto self = self_.coalesce();
|
||||
const int64_t nnz = self._nnz();
|
||||
if (nnz == 0) {
|
||||
return result_.zero_();
|
||||
}
|
||||
|
||||
const auto computeDtype = at::kFloat;
|
||||
|
||||
auto indices = self._indices();
|
||||
auto values = self._values();
|
||||
|
||||
auto values_c = values.scalar_type() == computeDtype ? values : values.to(computeDtype);
|
||||
auto mat2_c = mat2_.scalar_type() == computeDtype ? mat2_ : mat2_.to(computeDtype);
|
||||
auto mat2_contig = mat2_c.contiguous();
|
||||
|
||||
auto idx_b = indices.select(0, 0).contiguous();
|
||||
auto idx_i = indices.select(0, 1).contiguous();
|
||||
auto idx_j = indices.select(0, 2).contiguous();
|
||||
|
||||
// builds an array of pointers of where the batch_idx's pointer starts and ends
|
||||
// look in function for better explanation
|
||||
auto batch_ptr = at::empty({B + 1}, at::device(result_.device()).dtype(kLong));
|
||||
build_batch_ptr_mps(idx_b, B, batch_ptr);
|
||||
// build row_ptr per batch: for each (b, i) get [start, end) into rows/cols/vals
|
||||
auto row_ptr = at::empty({B * (I + 1)}, at::device(result_.device()).dtype(kLong));
|
||||
build_row_ptr_per_batch_mps(idx_i, batch_ptr, B, I, row_ptr);
|
||||
|
||||
const bool out_needs_cast = (result_.scalar_type() != computeDtype) || !result_.is_contiguous();
|
||||
Tensor out_buf = out_needs_cast
|
||||
? at::empty({B, I, K}, result_.options().dtype(computeDtype))
|
||||
: result_;
|
||||
auto out_contig = out_buf.contiguous();
|
||||
|
||||
auto stream = getCurrentMPSStream();
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
auto pso = lib.getPipelineStateForFunc("spmm_bmm_coo_rows_grouped_" + mps::scalarToMetalTypeString(values));
|
||||
auto enc = stream->commandEncoder();
|
||||
[enc setComputePipelineState:pso];
|
||||
|
||||
const uint32_t tew = pso.threadExecutionWidth;
|
||||
const uint32_t tgW = std::min<uint32_t>((uint32_t)K, tew);
|
||||
|
||||
// One threadgroup per (row i, batch b), lanes cover K
|
||||
MTLSize grid = MTLSizeMake(tgW, (uint32_t)I, (uint32_t)B);
|
||||
MTLSize tgs = MTLSizeMake(tgW, 1, 1);
|
||||
|
||||
mtl_setArgs(enc,
|
||||
idx_i,
|
||||
idx_j,
|
||||
values_c,
|
||||
mat2_contig,
|
||||
out_contig,
|
||||
row_ptr,
|
||||
std::array<uint32_t, 4>{(uint32_t)B, (uint32_t)I, (uint32_t)J, (uint32_t)K});
|
||||
[enc dispatchThreads:grid threadsPerThreadgroup:tgs];
|
||||
}
|
||||
});
|
||||
if (out_needs_cast) {
|
||||
result_.copy_(out_contig.to(result_.scalar_type()));
|
||||
}
|
||||
return result_;
|
||||
}
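As a plain reference for what the spmm_bmm_coo_rows_grouped kernel ultimately computes, batched COO sparse-times-dense is just a loop over the stored non-zeros (a naive CPU sketch, not part of this patch):

#include <cstdint>
#include <vector>

// Naive reference: out[b, i, k] += v * dense[b, j, k] for every stored (b, i, j, v).
// idx_b/idx_i/idx_j/vals hold the nnz COO entries of the [B, I, J] sparse tensor;
// dense is [B, J, K] row-major; out is [B, I, K] row-major, zero-initialised.
void bmm_coo_reference(const std::vector<int64_t>& idx_b,
                       const std::vector<int64_t>& idx_i,
                       const std::vector<int64_t>& idx_j,
                       const std::vector<float>& vals,
                       const std::vector<float>& dense,
                       std::vector<float>& out,
                       int64_t I, int64_t J, int64_t K) {
  for (size_t p = 0; p < vals.size(); ++p) {
    const int64_t b = idx_b[p], i = idx_i[p], j = idx_j[p];
    for (int64_t k = 0; k < K; ++k) {
      out[(b * I + i) * K + k] += vals[p] * dense[(b * J + j) * K + k];
    }
  }
}

The Metal kernel produces the same sums, but each (batch, row) threadgroup walks only its [row_ptr[b,i], row_ptr[b,i+1]) slice instead of scanning all non-zeros.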
|
||||
|
||||
Tensor bmm_sparse_mps(const Tensor& self, const Tensor& mat2) {
|
||||
Tensor result = at::zeros({self.size(0), self.size(1), mat2.size(2)}, mat2.options());
|
||||
return bmm_out_sparse_mps(self, mat2, result);
|
||||
}
|
||||
|
||||
Tensor& addmm_out_sparse_dense_mps(
|
||||
const Tensor& self,
|
||||
const SparseTensor& mat1,
|
||||
const Tensor& mat2,
|
||||
const Scalar& beta,
|
||||
const Scalar& alpha,
|
||||
Tensor& result) {
|
||||
c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out");
|
||||
return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha);
|
||||
}
|
||||
|
||||
Tensor addmm_sparse_dense_mps(
|
||||
const Tensor& self,
|
||||
const SparseTensor& mat1,
|
||||
const Tensor& mat2,
|
||||
const Scalar& beta,
|
||||
const Scalar& alpha
|
||||
) {
|
||||
c10::MaybeOwned<Tensor> b_self = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm_out");
|
||||
Tensor result = at::empty({0}, self.options());
|
||||
return s_addmm_out_sparse_dense_mps(result, *b_self, mat1, mat2, beta, alpha);
|
||||
}
|
||||
|
||||
static SparseTensor& mul_out_dense_sparse_mps(
|
||||
const Tensor& dense,
|
||||
const Tensor& sparse,
|
||||
|
@ -1,10 +1,105 @@
|
||||
#include <metal_stdlib>
|
||||
#include <c10/metal/indexing.h>
|
||||
#include <c10/metal/utils.h>
|
||||
using namespace c10::metal;
|
||||
using namespace metal;
|
||||
|
||||
inline uint lower_bound_i64(device const long* arr, uint lo, uint hi, long key) {
|
||||
uint l = lo, r = hi;
|
||||
while (l < r) {
|
||||
uint m = (l + r) >> 1;
|
||||
long v = arr[m];
|
||||
if (v < key) {
|
||||
l = m + 1;
|
||||
} else {
|
||||
r = m;
|
||||
}
|
||||
}
|
||||
return l;
|
||||
}
|
||||
|
||||
template <typename T> struct MulAccum { using type = float; };
|
||||
template <> struct MulAccum<float2> { using type = float2; };
|
||||
inline uint upper_bound_i64(device const long* arr, uint lo, uint hi, long key) {
|
||||
uint l = lo, r = hi;
|
||||
while (l < r) {
|
||||
uint m = (l + r) >> 1;
|
||||
long v = arr[m];
|
||||
if (v <= key) {
|
||||
l = m + 1;
|
||||
} else {
|
||||
r = m;
|
||||
}
|
||||
}
|
||||
return l;
|
||||
}
|
||||
|
||||
kernel void build_row_ptr_from_sorted_rows_by_batch(
|
||||
device const long* rows [[buffer(0)]],
|
||||
device const long* batch_ptr [[buffer(1)]],
|
||||
device long* row_ptr [[buffer(2)]],
|
||||
constant uint2& dims [[buffer(3)]],
|
||||
uint3 tid [[thread_position_in_grid]])
|
||||
{
|
||||
const uint I = dims.x;
|
||||
const uint B = dims.y;
|
||||
|
||||
const uint i = tid.x;
|
||||
const uint b = tid.y;
|
||||
|
||||
if (b >= B || i > I) return;
|
||||
|
||||
const uint base = (uint)batch_ptr[b];
|
||||
const uint lim = (uint)batch_ptr[b + 1];
|
||||
|
||||
const ulong out_base = (ulong)b * (ulong)(I + 1);
|
||||
|
||||
if (i == I) {
|
||||
row_ptr[out_base + (ulong)I] = (long)lim;
|
||||
} else {
|
||||
const long key = (long)i;
|
||||
const uint pos = lower_bound_i64(rows, base, lim, key);
|
||||
row_ptr[out_base + (ulong)i] = (long)pos;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
kernel void spmm_bmm_coo_rows_grouped(
|
||||
device const long* rows [[buffer(0)]],
|
||||
device const long* cols [[buffer(1)]],
|
||||
device const T* vals [[buffer(2)]],
|
||||
device const T* dense [[buffer(3)]],
|
||||
device T* out [[buffer(4)]],
|
||||
device const long* row_ptr [[buffer(5)]],
|
||||
constant uint4& dims [[buffer(6)]],
|
||||
uint3 tid [[thread_position_in_grid]],
|
||||
uint3 ltid [[thread_position_in_threadgroup]],
|
||||
uint3 tptg [[threads_per_threadgroup]])
|
||||
{
|
||||
const uint B = dims.x;
|
||||
const uint I = dims.y;
|
||||
const uint J = dims.z;
|
||||
const uint K = dims.w;
|
||||
|
||||
const uint b = tid.z;
|
||||
const uint i = tid.y;
|
||||
const uint lane = ltid.x;
|
||||
const uint tgW = tptg.x;
|
||||
|
||||
const ulong rp_base = (ulong)b * (ulong)(I + 1);
|
||||
const uint start = (uint)row_ptr[rp_base + (ulong)i];
|
||||
const uint end = (uint)row_ptr[rp_base + (ulong)i + 1];
|
||||
|
||||
for (uint k = lane; k < K; k += tgW) {
|
||||
auto acc = static_cast<accum_t<T>>(T(0));
|
||||
for (uint p = start; p < end; ++p) {
|
||||
const uint c = (uint)cols[p];
|
||||
const auto v = static_cast<accum_t<T>>(vals[p]);
|
||||
const uint d_off = ((b * J) + c) * K + k;
|
||||
const auto d = static_cast<accum_t<T>>(dense[d_off]);
|
||||
acc += mul(v, d);
|
||||
}
|
||||
const uint y_off = ((b * I) + i) * K + k;
|
||||
out[y_off] = static_cast<T>(acc);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
kernel void dense_sparse_mul_kernel(
|
||||
@ -32,10 +127,9 @@ kernel void dense_sparse_mul_kernel(
|
||||
ulong dense_idx = (ulong)key * (ulong)view_cols + (ulong)col;
|
||||
ulong val_idx = (ulong)i * (ulong)view_cols + (ulong)col;
|
||||
|
||||
using accum_t = typename MulAccum<T>::type;
|
||||
const accum_t a = static_cast<accum_t>(values[val_idx]);
|
||||
const accum_t b = static_cast<accum_t>(dense[dense_idx]);
|
||||
out_values[val_idx] = static_cast<T>(a * b);
|
||||
const auto a = static_cast<accum_t<T>>(values[val_idx]);
|
||||
const auto b = static_cast<accum_t<T>>(dense[dense_idx]);
|
||||
out_values[val_idx] = static_cast<T>(mul(a, b));
|
||||
}
|
||||
|
||||
kernel void intersect_binary_search(
|
||||
@ -120,6 +214,76 @@ kernel void fused_gather_mul_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
kernel void build_batch_ptr_from_sorted_batches(
|
||||
device const long* batches [[buffer(0)]],
|
||||
device long* batch_ptr [[buffer(1)]],
|
||||
constant uint2& nnz_B [[buffer(2)]],
|
||||
uint3 tid [[thread_position_in_grid]])
|
||||
{
|
||||
uint b = tid.x;
|
||||
uint nnz = nnz_B.x;
|
||||
uint batch = nnz_B.y;
|
||||
|
||||
if (b == batch) {
|
||||
batch_ptr[b] = (long)nnz;
|
||||
return;
|
||||
}
|
||||
|
||||
uint lo = 0;
|
||||
uint hi = nnz;
|
||||
long key = (long)b;
|
||||
while (lo < hi) {
|
||||
uint mid = (lo + hi) >> 1;
|
||||
long v = batches[mid];
|
||||
if (v < key) lo = mid + 1;
|
||||
else hi = mid;
|
||||
}
|
||||
batch_ptr[b] = (long)lo;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
kernel void spmm_addmm_coo(
|
||||
device const long* indices2d [[buffer(0)]],
|
||||
device const T* vals [[buffer(1)]],
|
||||
device const T* dense [[buffer(2)]],
|
||||
device const T* t_in [[buffer(3)]],
|
||||
device T* out [[buffer(4)]],
|
||||
constant uint3& dims [[buffer(5)]],
|
||||
constant float2& alpha_beta [[buffer(6)]],
|
||||
constant uint& nnz [[buffer(7)]],
|
||||
uint3 tid [[thread_position_in_grid]])
|
||||
{
|
||||
const uint K = dims.z;
|
||||
const uint k = tid.x;
|
||||
const uint i = tid.z;
|
||||
const float alpha = alpha_beta.x;
|
||||
const float beta = alpha_beta.y;
|
||||
|
||||
device const long* rows = indices2d;
|
||||
device const long* cols = indices2d + nnz;
|
||||
|
||||
const uint start = lower_bound_i64(rows, 0u, nnz, (long)i);
|
||||
const uint end = upper_bound_i64(rows, 0u, nnz, (long)i);
|
||||
|
||||
// accumulator is float for scalar/half/bfloat and float2 for float2
|
||||
auto acc = static_cast<accum_t<T>>(T(0));
|
||||
|
||||
for (uint p = start; p < end; ++p) {
|
||||
const uint c = (uint)cols[p];
|
||||
const auto v = static_cast<accum_t<T>>(vals[p]);
|
||||
const uint dense_off = c * K + k;
|
||||
const auto d = static_cast<accum_t<T>>(dense[dense_off]);
|
||||
acc += mul(v, d);
|
||||
}
|
||||
|
||||
const uint off = i * K + k;
|
||||
const auto base = (beta != 0.0f) ? (static_cast<accum_t<T>>(t_in[off]) * beta) : static_cast<accum_t<T>>(T(0));
|
||||
const auto y = base + alpha * acc;
|
||||
out[off] = static_cast<T>(y);
|
||||
}
|
||||
|
||||
|
||||
#define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE) \
|
||||
template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void \
|
||||
dense_sparse_mul_kernel<DTYPE>( \
|
||||
@ -151,6 +315,36 @@ INSTANTIATE_DENSE_SPARSE_MUL(float2);
|
||||
constant uint2& dims_output [[buffer(8)]], \
|
||||
uint3 gid [[thread_position_in_grid]]);
|
||||
|
||||
INSTANTIATE_FUSED_GATHER_MUL(float);
|
||||
INSTANTIATE_FUSED_GATHER_MUL(half);
|
||||
INSTANTIATE_FUSED_GATHER_MUL(bfloat);
|
||||
INSTANTIATE_FOR_FLOAT_TYPES(INSTANTIATE_FUSED_GATHER_MUL);
|
||||
|
||||
|
||||
#define INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED(DTYPE) \
|
||||
template [[host_name("spmm_bmm_coo_rows_grouped_" #DTYPE)]] kernel void \
|
||||
spmm_bmm_coo_rows_grouped<DTYPE>( \
|
||||
device const long* rows [[buffer(0)]], \
|
||||
device const long* cols [[buffer(1)]], \
|
||||
device const DTYPE* vals [[buffer(2)]], \
|
||||
device const DTYPE* dense [[buffer(3)]], \
|
||||
device DTYPE* out [[buffer(4)]], \
|
||||
device const long* row_ptr [[buffer(5)]], \
|
||||
constant uint4& dims [[buffer(6)]], \
|
||||
uint3 tid [[thread_position_in_grid]], \
|
||||
uint3 ltid [[thread_position_in_threadgroup]], \
|
||||
uint3 tptg [[threads_per_threadgroup]]);
|
||||
|
||||
INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED);
|
||||
|
||||
#define INSTANTIATE_SPMM_ADDMM_COO(DTYPE) \
|
||||
template [[host_name("spmm_addmm_coo_" #DTYPE)]] kernel void \
|
||||
spmm_addmm_coo<DTYPE>( \
|
||||
device const long* indices2d [[buffer(0)]], \
|
||||
device const DTYPE* vals [[buffer(1)]], \
|
||||
device const DTYPE* dense [[buffer(2)]], \
|
||||
device const DTYPE* t_in [[buffer(3)]], \
|
||||
device DTYPE* out [[buffer(4)]], \
|
||||
constant uint3& dims [[buffer(5)]], \
|
||||
constant float2& alpha_beta [[buffer(6)]], \
|
||||
constant uint& nnz [[buffer(7)]], \
|
||||
uint3 tid [[thread_position_in_grid]]);
|
||||
|
||||
INSTANTIATE_FOR_ALL_TYPES(INSTANTIATE_SPMM_ADDMM_COO);
|
||||
|
@@ -1751,8 +1751,8 @@ def maybe_snapshot_memory(should_snapshot_memory, suffix):
f"{output_filename.rstrip('.csv')}_{suffix}.pickle",
)
)
except Exception as e:
log.error("Failed to save memory snapshot, %s", e)
except Exception:
log.exception("Failed to save memory snapshot")

torch.cuda.memory._record_memory_history(enabled=None)

@@ -2284,9 +2284,11 @@ class BenchmarkRunner:
)
):
is_same = False
except Exception:
except Exception as e:
# Sometimes torch.allclose may throw RuntimeError
is_same = False
exception_string = str(e)
accuracy_status = f"fail_exception: {exception_string}"
return record_status(accuracy_status, dynamo_start_stats=start_stats)

if not is_same:
accuracy_status = "eager_two_runs_differ"
@@ -2403,9 +2405,11 @@ class BenchmarkRunner:
force_max_multiplier=force_max_multiplier,
):
is_same = False
except Exception:
except Exception as e:
# Sometimes torch.allclose may throw RuntimeError
is_same = False
exception_string = str(e)
accuracy_status = f"fail_exception: {exception_string}"
return record_status(accuracy_status, dynamo_start_stats=start_stats)

if not is_same:
if self.args.skip_accuracy_check:

@@ -124,7 +124,7 @@ with open(MODELS_FILENAME) as fh:
continue
batch_size = int(batch_size)
BATCH_SIZE_KNOWN_MODELS[model_name] = batch_size
assert len(BATCH_SIZE_KNOWN_MODELS)
assert BATCH_SIZE_KNOWN_MODELS


try:

@@ -296,8 +296,8 @@ class OperatorInputsLoader:
for key in self.operator_db.keys():
try:
op = eval(key)
except AttributeError as ae:
log.warning("Evaluating an op name into an OpOverload: %s", ae)
except AttributeError:
log.warning("Evaluating an op name into an OpOverload", exc_info=True)
continue
yield op

@@ -85,7 +85,7 @@ class WeightOnlyInt8QuantHandler:
cur_state_dict[f"{fqn}.weight"] = int8_weight
cur_state_dict[f"{fqn}.scales"] = scales.to(mod.weight.dtype)
elif isinstance(mod, ConditionalFeedForward):
for weight_idx in range(0, 3):
for weight_idx in range(3):
weight_name = f"w{weight_idx + 1}"
scales_name = f"scales{weight_idx + 1}"
weight = getattr(mod, weight_name)
|
@ -1,5 +1,4 @@
|
||||
#include <c10/core/AllocatorConfig.h>
|
||||
#include <c10/core/DeviceType.h>
|
||||
#include <c10/util/env.h>
|
||||
|
||||
namespace c10::CachingAllocator {
|
||||
|
@ -4,7 +4,6 @@
|
||||
#include <c10/core/SymNodeImpl.h>
|
||||
#include <c10/util/intrusive_ptr.h>
|
||||
#include <c10/util/safe_numerics.h>
|
||||
#include <functional>
|
||||
|
||||
namespace c10 {
|
||||
|
||||
|
@ -9,7 +9,6 @@
|
||||
#include <c10/core/impl/TorchDispatchModeTLS.h>
|
||||
#include <c10/util/Logging.h>
|
||||
#include <c10/util/accumulate.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <optional>
|
||||
|
||||
#include <utility>
|
||||
|
@ -1,9 +1,5 @@
|
||||
#include <c10/core/TensorOptions.h>
|
||||
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/core/Layout.h>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
namespace c10 {
|
||||
|
@ -2,7 +2,6 @@
|
||||
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/core/StorageImpl.h>
|
||||
#include <c10/core/alignment.h>
|
||||
#include <c10/core/impl/COWDeleter.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/ParallelGuard.h>
|
||||
|
@ -1,5 +1,4 @@
|
||||
#include <c10/core/DispatchKey.h>
|
||||
#include <c10/core/SafePyObject.h>
|
||||
#include <c10/core/impl/LocalDispatchKeySet.h>
|
||||
#include <c10/core/impl/TorchDispatchModeTLS.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
@ -1260,6 +1260,9 @@ class DeviceCachingAllocator {
|
||||
// thread local compile context for each device
|
||||
static thread_local std::stack<std::string> compile_context;
|
||||
|
||||
// thread local user metadata for annotating allocations
|
||||
static thread_local std::string user_metadata;
|
||||
|
||||
public:
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
|
||||
explicit DeviceCachingAllocator(c10::DeviceIndex id)
|
||||
@ -1302,6 +1305,14 @@ class DeviceCachingAllocator {
|
||||
}
|
||||
}
|
||||
|
||||
void setUserMetadata(const std::string& metadata) {
|
||||
user_metadata = metadata;
|
||||
}
|
||||
|
||||
std::string getUserMetadata() {
|
||||
return user_metadata;
|
||||
}
|
||||
|
||||
bool checkPoolLiveAllocations(
|
||||
MempoolId_t mempool_id,
|
||||
const std::unordered_set<void*>& expected_live_allocations) const {
|
||||
@ -3682,7 +3693,8 @@ class DeviceCachingAllocator {
|
||||
mempool_id,
|
||||
getApproximateTime(),
|
||||
record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr,
|
||||
compile_string);
|
||||
compile_string,
|
||||
user_metadata);
|
||||
|
||||
// Callbacks should not include any Pytorch call
|
||||
for (const auto& cb : trace_trackers_) {
|
||||
@ -3737,6 +3749,7 @@ static void uncached_delete(void* ptr) {
|
||||
|
||||
static void local_raw_delete(void* ptr);
|
||||
thread_local std::stack<std::string> DeviceCachingAllocator::compile_context;
|
||||
thread_local std::string DeviceCachingAllocator::user_metadata;
|
||||
#ifdef __cpp_lib_hardware_interference_size
|
||||
using std::hardware_destructive_interference_size;
|
||||
#else
|
||||
@ -3934,6 +3947,18 @@ class NativeCachingAllocator : public CUDAAllocator {
|
||||
device_allocator[device]->popCompileContext();
|
||||
}
|
||||
|
||||
void setUserMetadata(const std::string& metadata) override {
|
||||
c10::DeviceIndex device = 0;
|
||||
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
|
||||
device_allocator[device]->setUserMetadata(metadata);
|
||||
}
|
||||
|
||||
std::string getUserMetadata() override {
|
||||
c10::DeviceIndex device = 0;
|
||||
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
|
||||
return device_allocator[device]->getUserMetadata();
|
||||
}
|
||||
|
||||
bool isHistoryEnabled() override {
|
||||
c10::DeviceIndex device = 0;
|
||||
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
|
||||
|
@ -118,7 +118,8 @@ struct TraceEntry {
|
||||
MempoolId_t mempool,
|
||||
approx_time_t time,
|
||||
std::shared_ptr<GatheredContext> context = nullptr,
|
||||
std::string compile_context = "")
|
||||
std::string compile_context = "",
|
||||
std::string user_metadata = "")
|
||||
: action_(action),
|
||||
device_(device),
|
||||
addr_(addr),
|
||||
@ -126,7 +127,8 @@ struct TraceEntry {
|
||||
stream_(stream),
|
||||
size_(size),
|
||||
mempool_(std::move(mempool)),
|
||||
compile_context_(std::move(compile_context)) {
|
||||
compile_context_(std::move(compile_context)),
|
||||
user_metadata_(std::move(user_metadata)) {
|
||||
time_.approx_t_ = time;
|
||||
}
|
||||
Action action_;
|
||||
@ -138,6 +140,7 @@ struct TraceEntry {
|
||||
MempoolId_t mempool_;
|
||||
trace_time_ time_{};
|
||||
std::string compile_context_;
|
||||
std::string user_metadata_;
|
||||
};
|
||||
|
||||
// Calls made by record_function will save annotations
|
||||
@ -297,6 +300,10 @@ class CUDAAllocator : public DeviceAllocator {
|
||||
const std::vector<std::pair<std::string, std::string>>& /*md*/) {}
|
||||
virtual void pushCompileContext(std::string& md) {}
|
||||
virtual void popCompileContext() {}
|
||||
virtual void setUserMetadata(const std::string& metadata) {}
|
||||
virtual std::string getUserMetadata() {
|
||||
return "";
|
||||
}
|
||||
virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
|
||||
|
||||
// Attached AllocatorTraceTracker callbacks will be called while the
|
||||
@ -536,6 +543,14 @@ inline void enablePeerAccess(
|
||||
get()->enablePeerAccess(dev, dev_to_access);
|
||||
}
|
||||
|
||||
inline void setUserMetadata(const std::string& metadata) {
|
||||
get()->setUserMetadata(metadata);
|
||||
}
|
||||
|
||||
inline std::string getUserMetadata() {
|
||||
return get()->getUserMetadata();
|
||||
}
|
||||
|
||||
} // namespace c10::cuda::CUDACachingAllocator
|
||||
|
||||
namespace c10::cuda {
|
||||
|
@ -1,8 +1,6 @@
|
||||
#include <c10/cuda/CUDADeviceAssertionHost.h>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
#include <c10/cuda/CUDAFunctions.h>
|
||||
#include <c10/util/Backtrace.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/env.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
@ -4,7 +4,6 @@
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/util/UniqueVoidPtr.h>
|
||||
#include <c10/util/flat_hash_map.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
@ -1,7 +1,6 @@
|
||||
#include <c10/cuda/CUDAMiscFunctions.h>
|
||||
#include <c10/util/env.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
|
||||
namespace c10::cuda {
|
||||
|