Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-23 06:34:55 +08:00
Compare commits: cslpull88...dev/joona/ (316 commits)
Author | SHA1 | Date | |
---|---|---|---|
b9caa336a0 | |||
dab7d646d5 | |||
7647c398ff | |||
d67cc58181 | |||
dddaadac6c | |||
02169364e1 | |||
c30042fbeb | |||
6700175531 | |||
de8a8653c0 | |||
86335e9135 | |||
14e3f3c062 | |||
9852c6d236 | |||
6354271178 | |||
12902f6ecf | |||
3decb676aa | |||
8d68a02905 | |||
28330a8a39 | |||
eaba287adb | |||
f5f1d0a753 | |||
5bc238c73e | |||
79223114db | |||
7cfd23636c | |||
0d1d69fd25 | |||
21a64d57b1 | |||
1a74952925 | |||
a130ed828a | |||
eb0fe02933 | |||
d270e2d240 | |||
16b37b309f | |||
13ee85ca5e | |||
94d2471d1f | |||
5ca46be15e | |||
9a04cfbeff | |||
66db61f0d1 | |||
c025f7becc | |||
8c4e1148b8 | |||
e20ee39558 | |||
74fd1bf965 | |||
5d964a5eb7 | |||
118d7e1480 | |||
dd47f6f623 | |||
e05ea2b179 | |||
ad75b09d89 | |||
a2cb9b7331 | |||
451eaf0ff2 | |||
09519eb195 | |||
5314ae2660 | |||
da587de9cb | |||
82a4df2d5f | |||
18a9030952 | |||
03f23d07b4 | |||
8c738c9270 | |||
7ddacaf40a | |||
183c32fd3b | |||
3ab12e2596 | |||
596e93b506 | |||
f96e8041b1 | |||
7cf9c81918 | |||
49e0b88aab | |||
ee8c5cc1cc | |||
ce4d146f56 | |||
0226fcaacf | |||
4cde5096c4 | |||
443c015393 | |||
4ae6d7c18f | |||
3084b7b5c0 | |||
5c3d0a2ded | |||
c608b17f60 | |||
444b52ff40 | |||
160c228a4b | |||
0d15122092 | |||
6a3edfcc1e | |||
356f14e7b7 | |||
34dc8f69a1 | |||
cd9ee49a69 | |||
26e5572dd2 | |||
693897df42 | |||
3bf6be457d | |||
492f064f15 | |||
29408ea81a | |||
02dcb07765 | |||
5c38aa72c0 | |||
5134ba7458 | |||
e48ee2cf50 | |||
eb38ee21ba | |||
8057b72763 | |||
7b17918dc9 | |||
66c45f3ed9 | |||
0a9d55d2ee | |||
4ca65d3323 | |||
c932b39739 | |||
1f15973657 | |||
fc88ba260f | |||
bf8d0e3107 | |||
3a1239a248 | |||
4f9f1775d8 | |||
5e0788befb | |||
440f8f57af | |||
e004d539da | |||
c4b84a46a9 | |||
bc1b8f094d | |||
f65a564fa2 | |||
386b313028 | |||
6d7cbc20d2 | |||
ca16956b20 | |||
67735d1ee8 | |||
6e13f5eb38 | |||
23b1486185 | |||
9902b349cb | |||
5a9ac83e94 | |||
1adf28a5c0 | |||
c18052da0e | |||
c0d2f991b1 | |||
e889252493 | |||
6546c6186d | |||
1d9fefff19 | |||
7ec17b49cf | |||
146921007a | |||
a71e5509bc | |||
136e28f616 | |||
39a61795e3 | |||
b4feec9782 | |||
d81731615f | |||
e2f9a83b85 | |||
70a65a8bd5 | |||
689d278543 | |||
9b764491e3 | |||
cbc6b30a24 | |||
5b368de7f7 | |||
09a5e88bef | |||
a4e6a0b240 | |||
4ab232d0c4 | |||
2032f107d7 | |||
5f7d956362 | |||
a13c118994 | |||
21241bfeee | |||
73a6fc6e30 | |||
09287e3af4 | |||
16c3b8f87c | |||
9c6dff4941 | |||
0eb425a563 | |||
011cae9570 | |||
dfb2b661f7 | |||
5a69e0ebbe | |||
5e145861f2 | |||
c35b953531 | |||
dced0d6d9f | |||
c0436c5701 | |||
60e8dc4374 | |||
e6c3f58584 | |||
90e12cf63d | |||
44c08f4984 | |||
b6186353c6 | |||
b7eb7256fb | |||
c1ae78be92 | |||
defb515306 | |||
31c4e0d37d | |||
53290ca00b | |||
16f5155992 | |||
37144be03d | |||
3bdc54ed18 | |||
2196f32475 | |||
cfc227ad43 | |||
20cab91a12 | |||
a6fae2e811 | |||
042f2f7746 | |||
fd494dd426 | |||
8334cb2fb9 | |||
e72ed4717e | |||
3bebc09be9 | |||
a2db22e6bb | |||
eac5e12548 | |||
18479c5f70 | |||
f7c0c06692 | |||
b53d97c7be | |||
6c1da66407 | |||
d7c97e7245 | |||
be9f4ffe88 | |||
692faa9bc6 | |||
32f3af72b7 | |||
ebab5c85c4 | |||
3d734d837b | |||
c92227c41a | |||
e6a0221fc6 | |||
a6b9d444fb | |||
d42b0c8f22 | |||
941d094dd1 | |||
b1a934741e | |||
0c661f3e1a | |||
2c7e314803 | |||
ead4407f57 | |||
2f5b40c099 | |||
993b5647ab | |||
2ab26806f1 | |||
b1612569f6 | |||
dc0e818738 | |||
06e414d7fe | |||
a681260caf | |||
95e976a63f | |||
306ac44eaa | |||
a7643baceb | |||
a4030e37be | |||
22e1fb6faa | |||
2a4890e315 | |||
3ce433aef2 | |||
7f2d20e687 | |||
32fd29c1ea | |||
5eebd9315a | |||
a15aabc975 | |||
b143426db3 | |||
13ba0a2e5c | |||
8520ce5f78 | |||
196748d491 | |||
177e4f4218 | |||
3988b3468b | |||
04118d8617 | |||
24482e5c68 | |||
c0ec599f27 | |||
7074de43c0 | |||
771dcce11d | |||
de74aafff4 | |||
ad29a2c0dc | |||
3a9e33dca8 | |||
a086882d72 | |||
84ae6b7d6b | |||
60a097a071 | |||
13bae39e22 | |||
4ef6c05f65 | |||
d6b9bd3e60 | |||
d0591f4658 | |||
b5dea061c8 | |||
041960a1ce | |||
67c7924ea1 | |||
217ba7b2ab | |||
758d515d98 | |||
60d98b4cfb | |||
590a3e9f8a | |||
764ee6e3f9 | |||
67f98a99a4 | |||
e020a8755a | |||
7ffb3b201c | |||
f946bf88c4 | |||
66da3b3b2a | |||
41e653456e | |||
e40a0a9359 | |||
c05a7adb36 | |||
5f57be7571 | |||
29d72c1100 | |||
3b1a334c0f | |||
07689a38bf | |||
06a7dc21c1 | |||
d9a18173fa | |||
d8543e3162 | |||
ad01fc194d | |||
e162414963 | |||
9e5a797771 | |||
b46a1b9e2d | |||
9688014820 | |||
8f6e73f068 | |||
1e57ef08fa | |||
614b86d602 | |||
0b96dfb736 | |||
62b221d5cc | |||
66dd4577b1 | |||
cc28634172 | |||
c83cdf068b | |||
28ccfba248 | |||
b2386bdca1 | |||
bdfc8d9f96 | |||
70779dded8 | |||
ea231300d1 | |||
8f66995459 | |||
144fde4fd2 | |||
43f4947d44 | |||
65e1c34061 | |||
830247c355 | |||
4262755b5a | |||
3825607144 | |||
3c8f71ff93 | |||
fc890b55b5 | |||
058a69d91a | |||
6c5920d515 | |||
116fd474da | |||
a5d70cf545 | |||
7fe819d917 | |||
f63571060c | |||
38fead8f7c | |||
24a223c49d | |||
e4920a1364 | |||
bc5ecf83d7 | |||
e55c0f59e5 | |||
a4cf9653ee | |||
9c0b03020b | |||
034717a029 | |||
9c38b00999 | |||
8efe547046 | |||
82d00acfee | |||
098431a29d | |||
be660ea2d3 | |||
52c7c89ea4 | |||
1efd341d15 | |||
a096f2899d | |||
dbeb8a1691 | |||
b1f72e2984 | |||
bb3c2408f4 | |||
2c99f17a32 | |||
0043dcd79e | |||
2e2fb668fa | |||
9d24f945ba | |||
ecbd715363 | |||
58f2477a26 | |||
43dcb4bb61 | |||
50d1e37079 | |||
b99ef1a02e | |||
8a5c8e5db9 | |||
c7328dff7f |
@@ -1,5 +1,5 @@
-0.6b
+0.7b
 manylinux_2_17
 rocm6.2
-7f07e8a1cb1f99627eb6d77f5c0e9295c775f3c7
-e4ab195d2bd19e939c675a13280c29714c6ef9f2cf420690da150fa0cac043b1
+9be04068c3c0857a4cfd17d7e39e71d0423ebac2
+3e9e1959d23b93d78a08fcc5f868125dc3854dece32fd9458be9ef4467982291
@@ -108,10 +108,10 @@ ENV CMAKE_C_COMPILER cc
 ENV CMAKE_CXX_COMPILER c++
 COPY ./common/install_triton.sh install_triton.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
+COPY ci_commit_pins/triton.txt triton.txt
 COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
-RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
+RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt

 # Install AOTriton (Early fail)
 COPY ./aotriton_version.txt aotriton_version.txt
@@ -1 +0,0 @@
-21eae954efa5bf584da70324b640288c3ee7aede
@@ -1 +1 @@
-1b2f15840e0d70eec50d84c7a0575cb835524def
+91b14bf5593cf58a8541f3e6b9125600a867d4ef
@@ -1 +1 @@
-dedb7bdf339a3546896d4820366ca562c586bfa0
+5fe38ffd73c2ac6ed6323b554205186696631c6f
@@ -4,12 +4,12 @@ set -ex

 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

-TARBALL='aotriton.tar.bz2'
+TARBALL='aotriton.tar.gz'
 # This read command always returns with exit code 1
 read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
 ARCH=$(uname -m)
 AOTRITON_INSTALL_PREFIX="$1"
-AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.bz2"
+AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.gz"

 cd "${AOTRITON_INSTALL_PREFIX}"
 # Must use -L to follow redirects
@@ -12,10 +12,7 @@ conda_reinstall() {
 as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
 }

-if [ -n "${ROCM_VERSION}" ]; then
-TRITON_REPO="https://github.com/openai/triton"
-TRITON_TEXT_FILE="triton-rocm"
-elif [ -n "${XPU_VERSION}" ]; then
+if [ -n "${XPU_VERSION}" ]; then
 TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
 TRITON_TEXT_FILE="triton-xpu"
 else
@@ -30,9 +30,14 @@ dill==0.3.7
 #Pinned versions: 0.3.7
 #test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py

-expecttest==0.1.6
+expecttest==0.2.1
 #Description: method for writing tests where test framework auto populates
 # the expected output based on previous runs
 #Pinned versions: 0.2.1
 #test that import:

+fbscribelogger==0.1.6
+#Description: write to scribe from authenticated jobs on CI
+#Pinned versions: 0.1.6
+#test that import:
+
@@ -332,3 +337,8 @@ onnxscript==0.1.0.dev20240817
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
+
+parameterized==0.8.1
+#Description: Parameterizes unittests, both the tests themselves and the entire testing class
+#Pinned versions:
+#test that import:
@@ -1 +1 @@
-3.0.0
+3.1.0
@@ -100,10 +100,10 @@ ARG TRITON
 # try to reach out to S3, which docker build runners don't have access
 COPY ./common/install_triton.sh install_triton.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
+COPY ci_commit_pins/triton.txt triton.txt
 COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
-RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
+RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt

 # Install AOTriton
 COPY ./aotriton_version.txt aotriton_version.txt
@@ -596,6 +596,9 @@ test_single_dynamo_benchmark() {

 test_inductor_micro_benchmark() {
 TEST_REPORTS_DIR=$(pwd)/test/test-reports
+if [[ "${TEST_CONFIG}" == *cpu* ]]; then
+test_inductor_set_cpu_affinity
+fi
 python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
 }

@@ -43,6 +43,9 @@ python -m pip install z3-solver==4.12.2.0
 # Install tlparse for test\dynamo\test_structured_trace.py UTs.
 python -m pip install tlparse==0.3.25

+# Install parameterized
+python -m pip install parameterized==0.8.1
+
 run_tests() {
 # Run nvidia-smi if available
 for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
@@ -119,6 +119,11 @@ fi
 # Test the package
 /builder/check_binary.sh

+if [[ "\$GPU_ARCH_TYPE" != *s390x* && "\$GPU_ARCH_TYPE" != *xpu* && "\$GPU_ARCH_TYPE" != *rocm* && "$PACKAGE_TYPE" != libtorch ]]; then
+# Exclude s390, xpu, rocm and libtorch builds from smoke testing
+python /builder/test/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled
+fi
+
 # Clean temp files
 cd /builder && git clean -ffdx

@@ -90,7 +90,7 @@ fi
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
 TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
 if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
-TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt)
+TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
 TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
 fi
 if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
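To make the effect of the pin-file switch above concrete, here is a small Python rendering (editorial, not part of the diff) of how the nightly ROCm Triton requirement string is assembled. The version and commit values are placeholders, and the real script also appends ${TRITON_CONSTRAINT}, which is omitted here.

```python
# Placeholder values standing in for TRITON_VERSION and the contents of
# .ci/docker/ci_commit_pins/triton.txt; both are assumptions for illustration.
triton_version = "3.1.0"
pinned_commit = "91b14bf5593cf58a8541f3e6b9125600a867d4ef"

triton_shorthash = pinned_commit[:10]  # mirrors `cut -c1-10 .../triton.txt`
triton_requirement = f"pytorch-triton-rocm=={triton_version}+{triton_shorthash}"
print(triton_requirement)  # pytorch-triton-rocm==3.1.0+91b14bf559
```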
.github/merge_rules.yaml (vendored), 12 changed lines
@@ -86,6 +86,18 @@
 - pull
 - inductor

+- name: OSS CI / pytorchbot / slow tests
+patterns:
+- test/slow_tests.json
+approved_by:
+- pytorchbot
+ignore_flaky_failures: false
+mandatory_checks_name:
+- EasyCLA
+- Lint
+- pull
+- slow
+
 - name: OSS CI /pytorchbot / Executorch
 patterns:
 - .ci/docker/ci_commit_pins/executorch.txt
.github/pytorch-probot.yml (vendored), 1 changed line
@@ -9,6 +9,7 @@ ciflow_push_tags:
 - ciflow/inductor-rocm
 - ciflow/inductor-perf-compare
 - ciflow/inductor-micro-benchmark
+- ciflow/inductor-micro-benchmark-cpu-x86
 - ciflow/inductor-cu124
 - ciflow/linux-aarch64
 - ciflow/mps
@@ -1,6 +1,7 @@
 boto3==1.19.12
 hypothesis==6.56.4
-expecttest==0.1.6
+expecttest==0.2.1
+fbscribelogger==0.1.6
 librosa>=0.6.2
 mpmath==1.3.0
 networkx==2.8.7
@@ -30,3 +31,4 @@ optree==0.12.1
 # NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in
 # which the stringify metadata is wrong when escaping double quote
 protobuf==3.20.2
+parameterized==0.8.1
.github/scripts/build_triton_wheel.py (vendored), 4 changed lines
@@ -15,9 +15,7 @@ REPO_DIR = SCRIPT_DIR.parent.parent

 def read_triton_pin(device: str = "cuda") -> str:
     triton_file = "triton.txt"
-    if device == "rocm":
-        triton_file = "triton-rocm.txt"
-    elif device == "xpu":
+    if device == "xpu":
         triton_file = "triton-xpu.txt"
     with open(REPO_DIR / ".ci" / "docker" / "ci_commit_pins" / triton_file) as f:
         return f.read().strip()
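As a side note on the hunk above: after this change, both the CUDA and ROCm builds resolve to the same triton.txt pin. A hypothetical use of the function, assuming build_triton_wheel.py is importable from .github/scripts:

```python
from build_triton_wheel import read_triton_pin  # hypothetical local import

read_triton_pin()        # reads .ci/docker/ci_commit_pins/triton.txt
read_triton_pin("rocm")  # same file now; the triton-rocm.txt pin was removed
read_triton_pin("xpu")   # reads .ci/docker/ci_commit_pins/triton-xpu.txt
```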
.github/scripts/generate_binary_build_matrix.py (vendored), 47 changed lines
@ -325,6 +325,7 @@ def generate_wheels_matrix(
|
||||
os: str,
|
||||
arches: Optional[List[str]] = None,
|
||||
python_versions: Optional[List[str]] = None,
|
||||
use_split_build: bool = False,
|
||||
) -> List[Dict[str, str]]:
|
||||
package_type = "wheel"
|
||||
if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
|
||||
@ -371,7 +372,17 @@ def generate_wheels_matrix(
|
||||
) and python_version == "3.13":
|
||||
continue
|
||||
|
||||
if use_split_build and (
|
||||
arch_version not in ["12.4", "12.1", "11.8", "cpu"] or os != "linux"
|
||||
):
|
||||
raise RuntimeError(
|
||||
"Split build is only supported on linux with cuda 12.4, 12.1, 11.8, and cpu.\n"
|
||||
f"Currently attempting to build on arch version {arch_version} and os {os}.\n"
|
||||
"Please modify the matrix generation to exclude this combination."
|
||||
)
|
||||
|
||||
# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
|
||||
|
||||
if (
|
||||
arch_version in ["12.4", "12.1", "11.8"]
|
||||
and os == "linux"
|
||||
@ -385,6 +396,7 @@ def generate_wheels_matrix(
|
||||
"desired_cuda": translate_desired_cuda(
|
||||
gpu_arch_type, gpu_arch_version
|
||||
),
|
||||
"use_split_build": "True" if use_split_build else "False",
|
||||
"devtoolset": (
|
||||
"cxx11-abi" if arch_version == "cuda-aarch64" else ""
|
||||
),
|
||||
@ -400,7 +412,8 @@ def generate_wheels_matrix(
|
||||
),
|
||||
}
|
||||
)
|
||||
if arch_version != "cuda-aarch64":
|
||||
# Special build building to use on Colab. Python 3.10 for 12.1 CUDA
|
||||
if python_version == "3.10" and arch_version == "12.1":
|
||||
ret.append(
|
||||
{
|
||||
"python_version": python_version,
|
||||
@ -409,40 +422,16 @@ def generate_wheels_matrix(
|
||||
"desired_cuda": translate_desired_cuda(
|
||||
gpu_arch_type, gpu_arch_version
|
||||
),
|
||||
"use_split_build": "True",
|
||||
"use_split_build": "True" if use_split_build else "False",
|
||||
"devtoolset": "",
|
||||
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
|
||||
"package_type": package_type,
|
||||
"pytorch_extra_install_requirements": (
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version] # fmt: skip
|
||||
if os != "linux-aarch64"
|
||||
else ""
|
||||
),
|
||||
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-split".replace( # noqa: B950
|
||||
"pytorch_extra_install_requirements": "",
|
||||
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
|
||||
".", "_"
|
||||
),
|
||||
}
|
||||
)
|
||||
# Special build building to use on Colab. Python 3.10 for 12.1 CUDA
|
||||
if python_version == "3.10" and arch_version == "12.1":
|
||||
ret.append(
|
||||
{
|
||||
"python_version": python_version,
|
||||
"gpu_arch_type": gpu_arch_type,
|
||||
"gpu_arch_version": gpu_arch_version,
|
||||
"desired_cuda": translate_desired_cuda(
|
||||
gpu_arch_type, gpu_arch_version
|
||||
),
|
||||
"use_split_build": "False",
|
||||
"devtoolset": "",
|
||||
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
|
||||
"package_type": package_type,
|
||||
"pytorch_extra_install_requirements": "",
|
||||
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
|
||||
".", "_"
|
||||
),
|
||||
}
|
||||
)
|
||||
else:
|
||||
ret.append(
|
||||
{
|
||||
@ -452,6 +441,7 @@ def generate_wheels_matrix(
|
||||
"desired_cuda": translate_desired_cuda(
|
||||
gpu_arch_type, gpu_arch_version
|
||||
),
|
||||
"use_split_build": "True" if use_split_build else "False",
|
||||
"devtoolset": (
|
||||
"cxx11-abi" if arch_version == "cpu-cxx11-abi" else ""
|
||||
),
|
||||
@ -467,6 +457,7 @@ def generate_wheels_matrix(
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
|
.github/scripts/generate_ci_workflows.py (vendored), 38 changed lines
@ -61,6 +61,7 @@ class BinaryBuildWorkflow:
|
||||
# Mainly for macos
|
||||
cross_compile_arm64: bool = False
|
||||
macos_runner: str = "macos-14-xlarge"
|
||||
use_split_build: bool = False
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
if self.abi_version:
|
||||
@ -69,12 +70,20 @@ class BinaryBuildWorkflow:
|
||||
)
|
||||
else:
|
||||
self.build_environment = f"{self.os}-binary-{self.package_type}"
|
||||
if self.use_split_build:
|
||||
# added to distinguish concurrency groups
|
||||
self.build_environment += "-split"
|
||||
|
||||
def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
|
||||
output_file_path = (
|
||||
GITHUB_DIR
|
||||
/ f"workflows/generated-{self.build_environment}-{self.branches}.yml"
|
||||
)
|
||||
if self.use_split_build:
|
||||
output_file_path = (
|
||||
GITHUB_DIR
|
||||
/ f"workflows/generated-{self.build_environment}-{self.branches}"
|
||||
)
|
||||
with open(output_file_path, "w") as output_file:
|
||||
GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file
|
||||
output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"])
|
||||
@ -110,6 +119,20 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
|
||||
isolated_workflow=True,
|
||||
),
|
||||
),
|
||||
BinaryBuildWorkflow(
|
||||
os=OperatingSystem.LINUX,
|
||||
package_type="manywheel",
|
||||
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
|
||||
OperatingSystem.LINUX,
|
||||
use_split_build=True,
|
||||
arches=["11.8", "12.1", "12.4", "cpu"],
|
||||
),
|
||||
ciflow_config=CIFlowConfig(
|
||||
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
|
||||
isolated_workflow=True,
|
||||
),
|
||||
use_split_build=True,
|
||||
),
|
||||
BinaryBuildWorkflow(
|
||||
os=OperatingSystem.LINUX,
|
||||
package_type="conda",
|
||||
@ -162,6 +185,21 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
|
||||
),
|
||||
branches="main",
|
||||
),
|
||||
BinaryBuildWorkflow(
|
||||
os=OperatingSystem.LINUX,
|
||||
package_type="manywheel",
|
||||
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
|
||||
OperatingSystem.LINUX,
|
||||
arches=["11.8", "12.1", "12.4"],
|
||||
python_versions=["3.9"],
|
||||
use_split_build=True,
|
||||
),
|
||||
ciflow_config=CIFlowConfig(
|
||||
labels={LABEL_CIFLOW_PERIODIC},
|
||||
),
|
||||
branches="main",
|
||||
use_split_build=True,
|
||||
),
|
||||
BinaryBuildWorkflow(
|
||||
os=OperatingSystem.LINUX,
|
||||
package_type="libtorch",
|
||||
|
.github/scripts/runner_determinator.py (vendored), 350 changed lines
@ -3,49 +3,94 @@
|
||||
"""
|
||||
This runner determinator is used to determine which set of runners to run a
|
||||
GitHub job on. It uses the first comment of a GitHub issue (by default
|
||||
https://github.com/pytorch/test-infra/issues/5132) as a user list to determine
|
||||
which users will get their jobs to run on experimental runners. This user list
|
||||
is also a comma separated list of additional features or experiments which the
|
||||
user could be opted in to.
|
||||
https://github.com/pytorch/test-infra/issues/5132) to define the configuration
|
||||
of which runners should be used to run which job.
|
||||
|
||||
The configuration has two parts, the settings and a list of opted-in users,
|
||||
separated by a line containing "---". If the line is not present, the
|
||||
settings are considered to be empty with only the second part, the user
|
||||
list, defined.
|
||||
|
||||
The first part is a YAML block that defines the rollout settings. This can be
|
||||
used to define any settings that are needed to determine which runners to use.
|
||||
Its fields are defined by the RolloutSettings class below.
|
||||
|
||||
The second part is a list of users who are explicitly opted in to the LF fleet.
|
||||
The user list is also a comma separated list of additional features or
|
||||
experiments which the user could be opted in to.
|
||||
|
||||
The user list has the following rules:
|
||||
|
||||
- Users are GitHub usernames with the @ prefix
|
||||
- If the first line is a "*" then all users will use the new runners
|
||||
- If the first line is a "!" then all users will use the old runners
|
||||
- Users are GitHub usernames, which must start with the @ prefix
|
||||
- Each user is also a comma-separated list of features/experiments to enable
|
||||
- A "#" prefix indicates the user is opted out of the new runners but is opting
|
||||
into features/experiments.
|
||||
- A "#" prefix opts the user out of all experiments
|
||||
|
||||
Example user list:
|
||||
Example config:
|
||||
# A list of experiments that can be opted into.
|
||||
# This defines the behavior they'll induce when opted into.
|
||||
# Expected syntax is:
|
||||
# [experiment_name]: # Name of the experiment. Also used for the label prefix.
|
||||
# rollout_perc: [int] # % of workflows to run with this experiment when users are not opted in.
|
||||
|
||||
@User1
|
||||
@User2,amz2023
|
||||
#@UserOptOutOfNewRunner,amz2023
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 25
|
||||
|
||||
---
|
||||
|
||||
# Opt-ins:
|
||||
# Users can opt into the LF fleet by adding their GitHub username to this list
|
||||
# and specifying experiments to enable in a comma-separated list.
|
||||
# Experiments should be from the above list.
|
||||
|
||||
@User1,lf,split_build
|
||||
@User2,lf
|
||||
@User3,split_build
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from argparse import ArgumentParser
|
||||
from logging import LogRecord
|
||||
from typing import Any, Iterable
|
||||
from typing import Any, Dict, Iterable, List, NamedTuple, Tuple
|
||||
|
||||
import yaml
|
||||
from github import Auth, Github
|
||||
from github.Issue import Issue
|
||||
|
||||
|
||||
WORKFLOW_LABEL_META = "" # use meta runners
|
||||
DEFAULT_LABEL_PREFIX = "" # use meta runners
|
||||
WORKFLOW_LABEL_LF = "lf." # use runners from the linux foundation
|
||||
WORKFLOW_LABEL_LF_CANARY = "lf.c." # use canary runners from the linux foundation
|
||||
|
||||
RUNNER_AMI_LEGACY = ""
|
||||
RUNNER_AMI_AMZ2023 = "amz2023"
|
||||
|
||||
GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
|
||||
GH_OUTPUT_KEY_AMI = "runner-ami"
|
||||
GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
|
||||
|
||||
|
||||
SETTING_EXPERIMENTS = "experiments"
|
||||
|
||||
LF_FLEET_EXPERIMENT = "lf"
|
||||
CANARY_FLEET_SUFFIX = ".c"
|
||||
|
||||
|
||||
class Experiment(NamedTuple):
|
||||
rollout_perc: float = (
|
||||
0 # Percentage of workflows to experiment on when user is not opted-in.
|
||||
)
|
||||
|
||||
# Add more fields as needed
|
||||
|
||||
|
||||
class Settings(NamedTuple):
|
||||
"""
|
||||
Settings for the experiments that can be opted into.
|
||||
"""
|
||||
|
||||
experiments: Dict[str, Experiment] = {}
|
||||
|
||||
|
||||
class ColorFormatter(logging.Formatter):
|
||||
"""Color codes the log messages based on the log level"""
|
||||
|
||||
@ -172,85 +217,180 @@ def is_exception_branch(branch: str) -> bool:
|
||||
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
|
||||
|
||||
|
||||
def get_fleet(rollout_state: str, workflow_requestors: Iterable[str]) -> str:
|
||||
"""
|
||||
Determines if the job should run on the LF fleet or the Meta fleet
|
||||
|
||||
Returns:
|
||||
The appropriate label prefix for the runner, corresponding to the fleet to use.
|
||||
This gets prefixed to the very start of the runner label.
|
||||
"""
|
||||
|
||||
def load_yaml(yaml_text: str) -> Any:
|
||||
try:
|
||||
if rollout_state[0] == "!":
|
||||
log.info("LF Workflows are disabled for everyone. Using meta runners.")
|
||||
return WORKFLOW_LABEL_META
|
||||
elif rollout_state[0] == "*":
|
||||
log.info("LF Workflows are enabled for everyone. Using LF runners.")
|
||||
return WORKFLOW_LABEL_LF
|
||||
else:
|
||||
all_opted_in_users = {
|
||||
usr_raw.strip("\n\t@ ").split(",")[0]
|
||||
for usr_raw in rollout_state.split()
|
||||
}
|
||||
opted_in_requestors = {
|
||||
usr for usr in workflow_requestors if usr in all_opted_in_users
|
||||
}
|
||||
if opted_in_requestors:
|
||||
log.info(
|
||||
f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
|
||||
)
|
||||
return WORKFLOW_LABEL_LF
|
||||
else:
|
||||
log.info(
|
||||
f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners."
|
||||
)
|
||||
return WORKFLOW_LABEL_META
|
||||
|
||||
except Exception as e:
|
||||
log.error(
|
||||
f"Failed to get determine workflow type. Falling back to meta runners. Exception: {e}"
|
||||
)
|
||||
return WORKFLOW_LABEL_META
|
||||
data = yaml.safe_load(yaml_text)
|
||||
return data
|
||||
except yaml.YAMLError as exc:
|
||||
log.exception("Error loading YAML")
|
||||
raise
|
||||
|
||||
|
||||
def get_optin_feature(
|
||||
rollout_state: str, workflow_requestors: Iterable[str], feature: str, fallback: str
|
||||
def extract_settings_user_opt_in_from_text(rollout_state: str) -> Tuple[str, str]:
|
||||
"""
|
||||
Extracts the text with settings, if any, and the opted in users from the rollout state.
|
||||
|
||||
If the issue body contains "---" then the text above that is the settings
|
||||
and the text below is the list of opted in users.
|
||||
|
||||
If it doesn't contain "---" then the settings are empty and the rest is the users.
|
||||
"""
|
||||
rollout_state_parts = rollout_state.split("---")
|
||||
if len(rollout_state_parts) >= 2:
|
||||
return rollout_state_parts[0], rollout_state_parts[1]
|
||||
else:
|
||||
return "", rollout_state
|
||||
|
||||
|
||||
class UserOptins(Dict[str, List[str]]):
|
||||
"""
|
||||
Dictionary of users with a list of features they have opted into
|
||||
"""
|
||||
|
||||
|
||||
def parse_user_opt_in_from_text(user_optin_text: str) -> UserOptins:
|
||||
"""
|
||||
Parse the user opt-in text into a key value pair of username and the list of features they have opted into
|
||||
|
||||
Users are GitHub usernames with the @ prefix. Each user is also a comma-separated list of features/experiments to enable.
|
||||
- Example line: "@User1,lf,split_build"
|
||||
- A "#" prefix indicates the user is opted out of all experiments
|
||||
|
||||
|
||||
"""
|
||||
optins = UserOptins()
|
||||
for user in user_optin_text.split("\n"):
|
||||
user = user.strip("\r\n\t -")
|
||||
if not user or not user.startswith("@"):
|
||||
# Not a valid user. Skip
|
||||
continue
|
||||
|
||||
if user:
|
||||
usr_name = user.split(",")[0].strip("@")
|
||||
optins[usr_name] = [exp.strip(" ") for exp in user.split(",")[1:]]
|
||||
|
||||
return optins
|
||||
|
||||
|
||||
def parse_settings_from_text(settings_text: str) -> Settings:
|
||||
"""
|
||||
Parse the experiments from the issue body into a list of ExperimentSettings
|
||||
"""
|
||||
try:
|
||||
if settings_text:
|
||||
# Escape the backtick as well so that we can have the settings in a code block on the GH issue
|
||||
# for easy reading
|
||||
# Note: Using ascii for the backtick so that the cat step in _runner-determinator.yml doesn't choke on
|
||||
# the backtick character in shell commands.
|
||||
backtick = chr(96) # backtick character
|
||||
settings_text = settings_text.strip(f"\r\n\t{backtick} ")
|
||||
settings = load_yaml(settings_text)
|
||||
|
||||
# For now we just load experiments. We can expand this if/when we add more settings
|
||||
experiments = {}
|
||||
|
||||
for exp_name, exp_settings in settings.get(SETTING_EXPERIMENTS).items():
|
||||
valid_settings = {}
|
||||
for setting in exp_settings:
|
||||
if setting not in Experiment._fields:
|
||||
log.warning(
|
||||
f"Unexpected setting in experiment: {setting} = {exp_settings[setting]}"
|
||||
)
|
||||
else:
|
||||
valid_settings[setting] = exp_settings[setting]
|
||||
|
||||
experiments[exp_name] = Experiment(**valid_settings)
|
||||
return Settings(experiments)
|
||||
|
||||
except Exception:
|
||||
log.exception("Failed to parse settings")
|
||||
|
||||
return Settings()
|
||||
|
||||
|
||||
def parse_settings(rollout_state: str) -> Settings:
|
||||
"""
|
||||
Parse settings, if any, from the rollout state.
|
||||
|
||||
If the issue body contains "---" then the text above that is the settings
|
||||
and the text below is the list of opted in users.
|
||||
|
||||
If it doesn't contain "---" then the settings are empty and the default values are used.
|
||||
"""
|
||||
settings_text, _ = extract_settings_user_opt_in_from_text(rollout_state)
|
||||
return parse_settings_from_text(settings_text)
|
||||
|
||||
|
||||
def parse_users(rollout_state: str) -> UserOptins:
|
||||
"""
|
||||
Parse users from the rollout state.
|
||||
|
||||
"""
|
||||
_, users_text = extract_settings_user_opt_in_from_text(rollout_state)
|
||||
return parse_user_opt_in_from_text(users_text)
|
||||
|
||||
|
||||
def is_user_opted_in(user: str, user_optins: UserOptins, experiment_name: str) -> bool:
|
||||
"""
|
||||
Check if a user is opted into an experiment
|
||||
"""
|
||||
return experiment_name in user_optins.get(user, [])
|
||||
|
||||
|
||||
def get_runner_prefix(
|
||||
rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False
|
||||
) -> str:
|
||||
"""
|
||||
Used to dynamically opt in jobs to specific runner-type variants.
|
||||
settings = parse_settings(rollout_state)
|
||||
user_optins = parse_users(rollout_state)
|
||||
|
||||
Returns:
|
||||
The runner-type's variant name if the user has opted in to the feature, otherwise returns an empty string.
|
||||
This variant name is prefixed to the runner-type in the label.
|
||||
"""
|
||||
try:
|
||||
userlist = {u.lstrip("#").strip("\n\t@ ") for u in rollout_state.split()}
|
||||
all_opted_in_users = set()
|
||||
for user in userlist:
|
||||
for i in user.split(","):
|
||||
if i == feature:
|
||||
all_opted_in_users.add(user.split(",")[0])
|
||||
opted_in_requestors = {
|
||||
usr for usr in workflow_requestors if usr in all_opted_in_users
|
||||
}
|
||||
fleet_prefix = ""
|
||||
prefixes = []
|
||||
for experiment_name, experiment_settings in settings.experiments.items():
|
||||
enabled = False
|
||||
|
||||
if opted_in_requestors:
|
||||
# Is any workflow_requestor opted in to this experiment?
|
||||
opted_in_users = [
|
||||
requestor
|
||||
for requestor in workflow_requestors
|
||||
if is_user_opted_in(requestor, user_optins, experiment_name)
|
||||
]
|
||||
|
||||
if opted_in_users:
|
||||
log.info(
|
||||
f"Feature {feature} is enabled for {', '.join(opted_in_requestors)}. Using feature {feature}."
|
||||
f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
|
||||
)
|
||||
return feature
|
||||
else:
|
||||
log.info(
|
||||
f"Feature {feature} is disabled for {', '.join(workflow_requestors)}. Using fallback \"{fallback}\"."
|
||||
)
|
||||
return fallback
|
||||
enabled = True
|
||||
elif experiment_settings.rollout_perc:
|
||||
# If no user is opted in, then we randomly enable the experiment based on the rollout percentage
|
||||
if random.uniform(0, 100) <= experiment_settings.rollout_perc:
|
||||
log.info(
|
||||
f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}."
|
||||
)
|
||||
enabled = True
|
||||
|
||||
except Exception as e:
|
||||
if enabled:
|
||||
label = experiment_name
|
||||
if experiment_name == LF_FLEET_EXPERIMENT:
|
||||
# We give some special treatment to the "lf" experiment since determines the fleet we use
|
||||
# - If it's enabled, then we always list its prefix first
|
||||
# - If we're in the canary branch, then we append ".c" to the lf prefix
|
||||
if is_canary:
|
||||
label += CANARY_FLEET_SUFFIX
|
||||
fleet_prefix = label
|
||||
else:
|
||||
prefixes.append(label)
|
||||
|
||||
if len(prefixes) > 1:
|
||||
log.error(
|
||||
f'Failed to determine if user has opted-in to feature {feature}. Using fallback "{fallback}". Exception: {e}'
|
||||
f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}"
|
||||
)
|
||||
return fallback
|
||||
prefixes = prefixes[:1]
|
||||
|
||||
# Fleet always comes first
|
||||
if fleet_prefix:
|
||||
prefixes.insert(0, fleet_prefix)
|
||||
|
||||
return ".".join(prefixes) + "." if prefixes else ""
|
||||
|
||||
|
||||
def get_rollout_state_from_issue(github_token: str, repo: str, issue_num: int) -> str:
|
||||
@ -268,9 +408,10 @@ def main() -> None:
|
||||
args = parse_args()
|
||||
|
||||
if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
|
||||
log.info(f"Exception branch: '{args.github_branch}', using meta runners")
|
||||
label_type = WORKFLOW_LABEL_META
|
||||
runner_ami = RUNNER_AMI_LEGACY
|
||||
log.info(
|
||||
f"Exception branch: '{args.github_branch}', using Meta runners and no experiments."
|
||||
)
|
||||
runner_label_prefix = DEFAULT_LABEL_PREFIX
|
||||
else:
|
||||
try:
|
||||
rollout_state = get_rollout_state_from_issue(
|
||||
@ -285,35 +426,18 @@ def main() -> None:
|
||||
args.github_branch,
|
||||
)
|
||||
|
||||
label_type = get_fleet(
|
||||
rollout_state,
|
||||
(
|
||||
args.github_issue_owner,
|
||||
username,
|
||||
),
|
||||
)
|
||||
runner_ami = get_optin_feature(
|
||||
rollout_state=rollout_state,
|
||||
workflow_requestors=(
|
||||
args.github_issue_owner,
|
||||
username,
|
||||
),
|
||||
feature=RUNNER_AMI_AMZ2023,
|
||||
fallback=RUNNER_AMI_LEGACY,
|
||||
is_canary = args.github_repo == "pytorch/pytorch-canary"
|
||||
|
||||
runner_label_prefix = get_runner_prefix(
|
||||
rollout_state, (args.github_issue_owner, username), is_canary
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
log.error(
|
||||
f"Failed to get issue. Falling back to meta runners. Exception: {e}"
|
||||
f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
|
||||
)
|
||||
label_type = WORKFLOW_LABEL_META
|
||||
runner_ami = RUNNER_AMI_LEGACY
|
||||
|
||||
# For Canary builds use canary runners
|
||||
if args.github_repo == "pytorch/pytorch-canary" and label_type == WORKFLOW_LABEL_LF:
|
||||
label_type = WORKFLOW_LABEL_LF_CANARY
|
||||
|
||||
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
|
||||
set_github_output(GH_OUTPUT_KEY_AMI, runner_ami)
|
||||
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
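For readers skimming the runner_determinator.py change above, here is a minimal, hypothetical sketch of how the new rollout-state format is resolved into a runner-label prefix using the get_runner_prefix function introduced in that file. It assumes the module is importable from .github/scripts/ and that PyYAML is installed; the usernames and rollout percentages below are made up.

```python
import runner_determinator as rd  # hypothetical local import from .github/scripts/

# A made-up issue body: YAML settings, a "---" separator, then the opt-in list.
rollout_state = """
experiments:
  lf:
    rollout_perc: 25
  split_build:
    rollout_perc: 0
---
@User1,lf,split_build
@User2,lf
"""

# "lf" selects the Linux Foundation fleet and always sorts first in the prefix.
print(rd.get_runner_prefix(rollout_state, ["User1"]))  # -> "lf.split_build."
print(rd.get_runner_prefix(rollout_state, ["User2"]))  # -> "lf."
# Users not in the list can still be pulled in by the 25% rollout of "lf".
print(rd.get_runner_prefix(rollout_state, ["User3"]))  # -> "lf." or "" (random)
```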
@@ -51,6 +51,8 @@ def main() -> None:

    for platform_image in platform_images: # type: ignore[attr-defined]
        for arch in platform_image.keys(): # type: ignore[attr-defined]
+           if arch == "cpu-s390x":
+               continue
            tag_image(
                platform_image[arch], # type: ignore[index]
                default_tag,
.github/scripts/test_runner_determinator.py (vendored, new file), 237 lines
@ -0,0 +1,237 @@
|
||||
from unittest import main, TestCase
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import runner_determinator as rd
|
||||
|
||||
|
||||
class TestRunnerDeterminatorIssueParser(TestCase):
|
||||
def test_parse_settings(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 25
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
|
||||
settings = rd.parse_settings(settings_text)
|
||||
|
||||
self.assertTupleEqual(
|
||||
rd.Experiment(rollout_perc=25),
|
||||
settings.experiments["lf"],
|
||||
"lf settings not parsed correctly",
|
||||
)
|
||||
self.assertTupleEqual(
|
||||
rd.Experiment(rollout_perc=0),
|
||||
settings.experiments["otherExp"],
|
||||
"otherExp settings not parsed correctly",
|
||||
)
|
||||
|
||||
def test_parse_settings_in_code_block(self) -> None:
|
||||
settings_text = """
|
||||
|
||||
```
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 25
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
|
||||
settings = rd.parse_settings(settings_text)
|
||||
|
||||
self.assertTupleEqual(
|
||||
rd.Experiment(rollout_perc=25),
|
||||
settings.experiments["lf"],
|
||||
"lf settings not parsed correctly",
|
||||
)
|
||||
self.assertTupleEqual(
|
||||
rd.Experiment(rollout_perc=0),
|
||||
settings.experiments["otherExp"],
|
||||
"otherExp settings not parsed correctly",
|
||||
)
|
||||
|
||||
def test_parse_users(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 0
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
|
||||
users = rd.parse_users(settings_text)
|
||||
self.assertDictEqual(
|
||||
{"User1": ["lf"], "User2": ["lf", "otherExp"]},
|
||||
users,
|
||||
"Users not parsed correctly",
|
||||
)
|
||||
|
||||
def test_parse_users_without_settings(self) -> None:
|
||||
settings_text = """
|
||||
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
|
||||
users = rd.parse_users(settings_text)
|
||||
self.assertDictEqual(
|
||||
{"User1": ["lf"], "User2": ["lf", "otherExp"]},
|
||||
users,
|
||||
"Users not parsed correctly",
|
||||
)
|
||||
|
||||
|
||||
class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
|
||||
def test_opted_in_user(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 0
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User1"])
|
||||
self.assertEqual("lf.", prefix, "Runner prefix not correct for User1")
|
||||
|
||||
def test_opted_in_user_two_experiments(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 0
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User2"])
|
||||
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for User2")
|
||||
|
||||
@patch("random.uniform", return_value=50)
|
||||
def test_opted_out_user(self, mock_uniform: Mock) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 25
|
||||
otherExp:
|
||||
rollout_perc: 25
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User3"])
|
||||
self.assertEqual("", prefix, "Runner prefix not correct for user")
|
||||
|
||||
@patch("random.uniform", return_value=10)
|
||||
def test_opted_out_user_was_pulled_in_by_rollout(self, mock_uniform: Mock) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 25
|
||||
otherExp:
|
||||
rollout_perc: 25
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
|
||||
# User3 is opted out, but is pulled into both experiments by the 10% rollout
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User3"])
|
||||
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
|
||||
|
||||
def test_lf_prefix_always_comes_first(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
lf:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,otherExp,lf
|
||||
|
||||
"""
|
||||
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User2"])
|
||||
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
|
||||
|
||||
def test_ignores_commented_users(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 0
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
#@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User1"])
|
||||
self.assertEqual("", prefix, "Runner prefix not correct for user")
|
||||
|
||||
def test_ignores_extra_experiments(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 0
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
foo:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf,otherExp,foo
|
||||
|
||||
"""
|
||||
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User1"])
|
||||
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
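A hypothetical way to exercise the new test module locally, assuming PyYAML and PyGithub are installed so that runner_determinator imports cleanly:

```python
import subprocess
import sys

# Run the new unit tests from the repository root (path shown in the header above).
subprocess.run(
    [sys.executable, "test_runner_determinator.py"],
    cwd=".github/scripts",
    check=True,
)
```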
.github/templates/upload.yml.j2 (vendored), 2 changed lines
@@ -45,7 +45,7 @@
 {%- if is_windows %}
 # This is a dummy value for libtorch to work correctly with our batch scripts
 # without this value pip does not get installed for some reason
-DESIRED_PYTHON: "3.8"
+DESIRED_PYTHON: "3.9"
 {%- endif %}

 {%- else %}
.github/workflows/_runner-determinator.yml (vendored), 350 changed lines
@ -62,49 +62,94 @@ jobs:
|
||||
"""
|
||||
This runner determinator is used to determine which set of runners to run a
|
||||
GitHub job on. It uses the first comment of a GitHub issue (by default
|
||||
https://github.com/pytorch/test-infra/issues/5132) as a user list to determine
|
||||
which users will get their jobs to run on experimental runners. This user list
|
||||
is also a comma separated list of additional features or experiments which the
|
||||
user could be opted in to.
|
||||
https://github.com/pytorch/test-infra/issues/5132) to define the configuration
|
||||
of which runners should be used to run which job.
|
||||
|
||||
The configuration has two parts, the settings and a list of opted-in users,
|
||||
separated by a line containing "---". If the line is not present, the
|
||||
settings are considered to be empty with only the second part, the user
|
||||
list, defined.
|
||||
|
||||
The first part is a YAML block that defines the rollout settings. This can be
|
||||
used to define any settings that are needed to determine which runners to use.
|
||||
Its fields are defined by the RolloutSettings class below.
|
||||
|
||||
The second part is a list of users who are explicitly opted in to the LF fleet.
|
||||
The user list is also a comma separated list of additional features or
|
||||
experiments which the user could be opted in to.
|
||||
|
||||
The user list has the following rules:
|
||||
|
||||
- Users are GitHub usernames with the @ prefix
|
||||
- If the first line is a "*" then all users will use the new runners
|
||||
- If the first line is a "!" then all users will use the old runners
|
||||
- Users are GitHub usernames, which must start with the @ prefix
|
||||
- Each user is also a comma-separated list of features/experiments to enable
|
||||
- A "#" prefix indicates the user is opted out of the new runners but is opting
|
||||
into features/experiments.
|
||||
- A "#" prefix opts the user out of all experiments
|
||||
|
||||
Example user list:
|
||||
Example config:
|
||||
# A list of experiments that can be opted into.
|
||||
# This defines the behavior they'll induce when opted into.
|
||||
# Expected syntax is:
|
||||
# [experiment_name]: # Name of the experiment. Also used for the label prefix.
|
||||
# rollout_perc: [int] # % of workflows to run with this experiment when users are not opted in.
|
||||
|
||||
@User1
|
||||
@User2,amz2023
|
||||
#@UserOptOutOfNewRunner,amz2023
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 25
|
||||
|
||||
---
|
||||
|
||||
# Opt-ins:
|
||||
# Users can opt into the LF fleet by adding their GitHub username to this list
|
||||
# and specifying experiments to enable in a comma-separated list.
|
||||
# Experiments should be from the above list.
|
||||
|
||||
@User1,lf,split_build
|
||||
@User2,lf
|
||||
@User3,split_build
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from argparse import ArgumentParser
|
||||
from logging import LogRecord
|
||||
from typing import Any, Iterable
|
||||
from typing import Any, Dict, Iterable, List, NamedTuple, Tuple
|
||||
|
||||
import yaml
|
||||
from github import Auth, Github
|
||||
from github.Issue import Issue
|
||||
|
||||
|
||||
WORKFLOW_LABEL_META = "" # use meta runners
|
||||
DEFAULT_LABEL_PREFIX = "" # use meta runners
|
||||
WORKFLOW_LABEL_LF = "lf." # use runners from the linux foundation
|
||||
WORKFLOW_LABEL_LF_CANARY = "lf.c." # use canary runners from the linux foundation
|
||||
|
||||
RUNNER_AMI_LEGACY = ""
|
||||
RUNNER_AMI_AMZ2023 = "amz2023"
|
||||
|
||||
GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
|
||||
GH_OUTPUT_KEY_AMI = "runner-ami"
|
||||
GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
|
||||
|
||||
|
||||
SETTING_EXPERIMENTS = "experiments"
|
||||
|
||||
LF_FLEET_EXPERIMENT = "lf"
|
||||
CANARY_FLEET_SUFFIX = ".c"
|
||||
|
||||
|
||||
class Experiment(NamedTuple):
|
||||
rollout_perc: float = (
|
||||
0 # Percentage of workflows to experiment on when user is not opted-in.
|
||||
)
|
||||
|
||||
# Add more fields as needed
|
||||
|
||||
|
||||
class Settings(NamedTuple):
|
||||
"""
|
||||
Settings for the experiments that can be opted into.
|
||||
"""
|
||||
|
||||
experiments: Dict[str, Experiment] = {}
|
||||
|
||||
|
||||
class ColorFormatter(logging.Formatter):
|
||||
"""Color codes the log messages based on the log level"""
|
||||
|
||||
@ -231,85 +276,180 @@ jobs:
|
||||
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
|
||||
|
||||
|
||||
def get_fleet(rollout_state: str, workflow_requestors: Iterable[str]) -> str:
|
||||
"""
|
||||
Determines if the job should run on the LF fleet or the Meta fleet
|
||||
|
||||
Returns:
|
||||
The appropriate label prefix for the runner, corresponding to the fleet to use.
|
||||
This gets prefixed to the very start of the runner label.
|
||||
"""
|
||||
|
||||
def load_yaml(yaml_text: str) -> Any:
|
||||
try:
|
||||
if rollout_state[0] == "!":
|
||||
log.info("LF Workflows are disabled for everyone. Using meta runners.")
|
||||
return WORKFLOW_LABEL_META
|
||||
elif rollout_state[0] == "*":
|
||||
log.info("LF Workflows are enabled for everyone. Using LF runners.")
|
||||
return WORKFLOW_LABEL_LF
|
||||
else:
|
||||
all_opted_in_users = {
|
||||
usr_raw.strip("\n\t@ ").split(",")[0]
|
||||
for usr_raw in rollout_state.split()
|
||||
}
|
||||
opted_in_requestors = {
|
||||
usr for usr in workflow_requestors if usr in all_opted_in_users
|
||||
}
|
||||
if opted_in_requestors:
|
||||
log.info(
|
||||
f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
|
||||
)
|
||||
return WORKFLOW_LABEL_LF
|
||||
else:
|
||||
log.info(
|
||||
f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners."
|
||||
)
|
||||
return WORKFLOW_LABEL_META
|
||||
|
||||
except Exception as e:
|
||||
log.error(
|
||||
f"Failed to get determine workflow type. Falling back to meta runners. Exception: {e}"
|
||||
)
|
||||
return WORKFLOW_LABEL_META
|
||||
data = yaml.safe_load(yaml_text)
|
||||
return data
|
||||
except yaml.YAMLError as exc:
|
||||
log.exception("Error loading YAML")
|
||||
raise
|
||||
|
||||
|
||||
def get_optin_feature(
|
||||
rollout_state: str, workflow_requestors: Iterable[str], feature: str, fallback: str
|
||||
def extract_settings_user_opt_in_from_text(rollout_state: str) -> Tuple[str, str]:
|
||||
"""
|
||||
Extracts the text with settings, if any, and the opted in users from the rollout state.
|
||||
|
||||
If the issue body contains "---" then the text above that is the settings
|
||||
and the text below is the list of opted in users.
|
||||
|
||||
If it doesn't contain "---" then the settings are empty and the rest is the users.
|
||||
"""
|
||||
rollout_state_parts = rollout_state.split("---")
|
||||
if len(rollout_state_parts) >= 2:
|
||||
return rollout_state_parts[0], rollout_state_parts[1]
|
||||
else:
|
||||
return "", rollout_state
|
||||
|
||||
|
||||
class UserOptins(Dict[str, List[str]]):
|
||||
"""
|
||||
Dictionary of users with a list of features they have opted into
|
||||
"""
|
||||
|
||||
|
||||
def parse_user_opt_in_from_text(user_optin_text: str) -> UserOptins:
|
||||
"""
|
||||
Parse the user opt-in text into a key value pair of username and the list of features they have opted into
|
||||
|
||||
Users are GitHub usernames with the @ prefix. Each user is also a comma-separated list of features/experiments to enable.
|
||||
- Example line: "@User1,lf,split_build"
|
||||
- A "#" prefix indicates the user is opted out of all experiments
|
||||
|
||||
|
||||
"""
|
||||
optins = UserOptins()
|
||||
for user in user_optin_text.split("\n"):
|
||||
user = user.strip("\r\n\t -")
|
||||
if not user or not user.startswith("@"):
|
||||
# Not a valid user. Skip
|
||||
continue
|
||||
|
||||
if user:
|
||||
usr_name = user.split(",")[0].strip("@")
|
||||
optins[usr_name] = [exp.strip(" ") for exp in user.split(",")[1:]]
|
||||
|
||||
return optins
|
||||
|
||||
|
||||
def parse_settings_from_text(settings_text: str) -> Settings:
|
||||
"""
|
||||
Parse the experiments from the issue body into a list of ExperimentSettings
|
||||
"""
|
||||
try:
|
||||
if settings_text:
|
||||
# Escape the backtick as well so that we can have the settings in a code block on the GH issue
|
||||
# for easy reading
|
||||
# Note: Using ascii for the backtick so that the cat step in _runner-determinator.yml doesn't choke on
|
||||
# the backtick character in shell commands.
|
||||
backtick = chr(96) # backtick character
|
||||
settings_text = settings_text.strip(f"\r\n\t{backtick} ")
|
||||
settings = load_yaml(settings_text)
|
||||
|
||||
# For now we just load experiments. We can expand this if/when we add more settings
|
||||
experiments = {}
|
||||
|
||||
for exp_name, exp_settings in settings.get(SETTING_EXPERIMENTS).items():
|
||||
valid_settings = {}
|
||||
for setting in exp_settings:
|
||||
if setting not in Experiment._fields:
|
||||
log.warning(
|
||||
f"Unexpected setting in experiment: {setting} = {exp_settings[setting]}"
|
||||
)
|
||||
else:
|
||||
valid_settings[setting] = exp_settings[setting]
|
||||
|
||||
experiments[exp_name] = Experiment(**valid_settings)
|
||||
return Settings(experiments)
|
||||
|
||||
except Exception:
|
||||
log.exception("Failed to parse settings")
|
||||
|
||||
return Settings()
|
||||
|
||||
|
||||
def parse_settings(rollout_state: str) -> Settings:
|
||||
"""
|
||||
Parse settings, if any, from the rollout state.
|
||||
|
||||
If the issue body contains "---" then the text above that is the settings
|
||||
and the text below is the list of opted in users.
|
||||
|
||||
If it doesn't contain "---" then the settings are empty and the default values are used.
|
||||
"""
|
||||
settings_text, _ = extract_settings_user_opt_in_from_text(rollout_state)
|
||||
return parse_settings_from_text(settings_text)
|
||||
|
||||
|
||||
def parse_users(rollout_state: str) -> UserOptins:
|
||||
"""
|
||||
Parse users from the rollout state.
|
||||
|
||||
"""
|
||||
_, users_text = extract_settings_user_opt_in_from_text(rollout_state)
|
||||
return parse_user_opt_in_from_text(users_text)
|
||||
|
||||
|
||||
def is_user_opted_in(user: str, user_optins: UserOptins, experiment_name: str) -> bool:
|
||||
"""
|
||||
Check if a user is opted into an experiment
|
||||
"""
|
||||
return experiment_name in user_optins.get(user, [])
|
||||
|
||||
|
||||
def get_runner_prefix(
|
||||
rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False
|
||||
) -> str:
|
||||
"""
|
||||
Used to dynamically opt in jobs to specific runner-type variants.
|
||||
settings = parse_settings(rollout_state)
|
||||
user_optins = parse_users(rollout_state)
|
||||
|
||||
Returns:
|
||||
The runner-type's variant name if the user has opted in to the feature, otherwise returns an empty string.
|
||||
This variant name is prefixed to the runner-type in the label.
|
||||
"""
|
||||
try:
|
||||
userlist = {u.lstrip("#").strip("\n\t@ ") for u in rollout_state.split()}
|
||||
all_opted_in_users = set()
|
||||
for user in userlist:
|
||||
for i in user.split(","):
|
||||
if i == feature:
|
||||
all_opted_in_users.add(user.split(",")[0])
|
||||
opted_in_requestors = {
|
||||
usr for usr in workflow_requestors if usr in all_opted_in_users
|
||||
}
|
||||
fleet_prefix = ""
|
||||
prefixes = []
|
||||
for experiment_name, experiment_settings in settings.experiments.items():
|
||||
enabled = False
|
||||
|
||||
if opted_in_requestors:
|
||||
# Is any workflow_requestor opted in to this experiment?
|
||||
opted_in_users = [
|
||||
requestor
|
||||
for requestor in workflow_requestors
|
||||
if is_user_opted_in(requestor, user_optins, experiment_name)
|
||||
]
|
||||
|
||||
if opted_in_users:
|
||||
log.info(
|
||||
f"Feature {feature} is enabled for {', '.join(opted_in_requestors)}. Using feature {feature}."
|
||||
f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
|
||||
)
|
||||
return feature
|
||||
else:
|
||||
log.info(
|
||||
f"Feature {feature} is disabled for {', '.join(workflow_requestors)}. Using fallback \"{fallback}\"."
|
||||
)
|
||||
return fallback
|
||||
enabled = True
|
||||
elif experiment_settings.rollout_perc:
|
||||
# If no user is opted in, then we randomly enable the experiment based on the rollout percentage
|
||||
if random.uniform(0, 100) <= experiment_settings.rollout_perc:
|
||||
log.info(
|
||||
f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}."
|
||||
)
|
||||
enabled = True
|
||||
|
||||
except Exception as e:
|
||||
if enabled:
|
||||
label = experiment_name
|
||||
if experiment_name == LF_FLEET_EXPERIMENT:
|
||||
# We give some special treatment to the "lf" experiment since determines the fleet we use
|
||||
# - If it's enabled, then we always list its prefix first
|
||||
# - If we're in the canary branch, then we append ".c" to the lf prefix
|
||||
if is_canary:
|
||||
label += CANARY_FLEET_SUFFIX
|
||||
fleet_prefix = label
|
||||
else:
|
||||
prefixes.append(label)
|
||||
|
||||
if len(prefixes) > 1:
|
||||
log.error(
|
||||
f'Failed to determine if user has opted-in to feature {feature}. Using fallback "{fallback}". Exception: {e}'
|
||||
f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}"
|
||||
)
|
||||
return fallback
|
||||
prefixes = prefixes[:1]
|
||||
|
||||
# Fleet always comes first
|
||||
if fleet_prefix:
|
||||
prefixes.insert(0, fleet_prefix)
|
||||
|
||||
return ".".join(prefixes) + "." if prefixes else ""
|
||||
|
||||
|

def get_rollout_state_from_issue(github_token: str, repo: str, issue_num: int) -> str:

@@ -327,9 +467,10 @@ jobs:

     args = parse_args()

     if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
-        log.info(f"Exception branch: '{args.github_branch}', using meta runners")
-        label_type = WORKFLOW_LABEL_META
-        runner_ami = RUNNER_AMI_LEGACY
+        log.info(
+            f"Exception branch: '{args.github_branch}', using Meta runners and no experiments."
+        )
+        runner_label_prefix = DEFAULT_LABEL_PREFIX
     else:
         try:
             rollout_state = get_rollout_state_from_issue(

@@ -344,35 +485,18 @@ jobs:

                 args.github_branch,
             )

-            label_type = get_fleet(
-                rollout_state,
-                (
-                    args.github_issue_owner,
-                    username,
-                ),
-            )
-            runner_ami = get_optin_feature(
-                rollout_state=rollout_state,
-                workflow_requestors=(
-                    args.github_issue_owner,
-                    username,
-                ),
-                feature=RUNNER_AMI_AMZ2023,
-                fallback=RUNNER_AMI_LEGACY,
-            )
+            is_canary = args.github_repo == "pytorch/pytorch-canary"
+
+            runner_label_prefix = get_runner_prefix(
+                rollout_state, (args.github_issue_owner, username), is_canary
+            )

         except Exception as e:
             log.error(
-                f"Failed to get issue. Falling back to meta runners. Exception: {e}"
+                f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
            )
-            label_type = WORKFLOW_LABEL_META
-            runner_ami = RUNNER_AMI_LEGACY
-
-    # For Canary builds use canary runners
-    if args.github_repo == "pytorch/pytorch-canary" and label_type == WORKFLOW_LABEL_LF:
-        label_type = WORKFLOW_LABEL_LF_CANARY
-
-    set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
-    set_github_output(GH_OUTPUT_KEY_AMI, runner_ami)
+    set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)
if __name__ == "__main__":
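The label-type output written above is consumed by the workflow changes that follow, which simply prepend it to an existing runner label in `runs-on`. A small sketch of that concatenation is below; the prefix values shown are hypothetical, and the real value comes from `get_runner_prefix()` via the `get-label-type` job output.

```python
# Illustration only: how the emitted label-type output prefixes a base runner label.
def compose_runs_on(label_prefix: str, base_label: str) -> str:
    # Matches the workflow expression:
    #   runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
    return f"{label_prefix}{base_label}"

print(compose_runs_on("", "linux.9xlarge.ephemeral"))     # linux.9xlarge.ephemeral
print(compose_runs_on("lf.", "linux.9xlarge.ephemeral"))  # lf.linux.9xlarge.ephemeral
print(compose_runs_on("lf.c.", "linux.2xlarge"))          # lf.c.linux.2xlarge
```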
18  .github/workflows/build-libtorch-images.yml  vendored
@ -29,9 +29,19 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
build-docker-cuda:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: linux.9xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
|
||||
strategy:
|
||||
matrix:
|
||||
cuda_version: ["12.4", "12.1", "11.8"]
|
||||
@ -66,7 +76,8 @@ jobs:
|
||||
.ci/docker/libtorch/build.sh libtorch-cxx11-builder:cuda${{matrix.cuda_version}}
|
||||
build-docker-rocm:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: linux.9xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
|
||||
strategy:
|
||||
matrix:
|
||||
rocm_version: ["6.1", "6.2"]
|
||||
@ -101,7 +112,8 @@ jobs:
|
||||
.ci/docker/libtorch/build.sh libtorch-cxx11-builder:rocm${{matrix.rocm_version}}
|
||||
build-docker-cpu:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: linux.9xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
|
||||
steps:
|
||||
- name: Checkout PyTorch
|
||||
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
|
||||
|
39  .github/workflows/build-manywheel-images.yml  vendored
@ -33,9 +33,19 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
build-docker-cuda:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: am2.linux.9xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
|
||||
strategy:
|
||||
matrix:
|
||||
cuda_version: ["12.4", "12.1", "11.8"]
|
||||
@ -73,7 +83,8 @@ jobs:
|
||||
# NOTE: manylinux_2_28 are still experimental, see https://github.com/pytorch/pytorch/issues/123649
|
||||
build-docker-cuda-manylinux_2_28:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: linux.9xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
|
||||
strategy:
|
||||
matrix:
|
||||
cuda_version: ["12.4", "12.1", "11.8"]
|
||||
@ -110,7 +121,8 @@ jobs:
|
||||
.ci/docker/manywheel/build.sh manylinux2_28-builder:cuda${{matrix.cuda_version}}
|
||||
build-docker-cuda-aarch64:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: linux.arm64.2xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
|
||||
strategy:
|
||||
matrix:
|
||||
cuda_version: ["12.4"]
|
||||
@ -143,7 +155,8 @@ jobs:
|
||||
.ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}}
|
||||
build-docker-rocm:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: am2.linux.9xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
|
||||
strategy:
|
||||
matrix:
|
||||
rocm_version: ["6.1", "6.2"]
|
||||
@ -178,7 +191,8 @@ jobs:
|
||||
.ci/docker/manywheel/build.sh manylinux-builder:rocm${{matrix.rocm_version}}
|
||||
build-docker-cpu:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: am2.linux.9xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
|
||||
steps:
|
||||
- name: Checkout PyTorch
|
||||
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
|
||||
@ -207,7 +221,8 @@ jobs:
|
||||
.ci/docker/manywheel/build.sh manylinux-builder:cpu
|
||||
build-docker-cpu-manylinux_2_28:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: linux.9xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
|
||||
env:
|
||||
GPU_ARCH_TYPE: cpu-manylinux_2_28
|
||||
steps:
|
||||
@ -238,7 +253,8 @@ jobs:
|
||||
.ci/docker/manywheel/build.sh manylinux2_28-builder:cpu
|
||||
build-docker-cpu-aarch64:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: linux.arm64.2xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
|
||||
env:
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
steps:
|
||||
@ -269,7 +285,8 @@ jobs:
|
||||
.ci/docker/manywheel/build.sh manylinuxaarch64-builder:cpu-aarch64
|
||||
build-docker-cpu-aarch64-2_28:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: linux.arm64.2xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
|
||||
env:
|
||||
GPU_ARCH_TYPE: cpu-aarch64-2_28
|
||||
steps:
|
||||
@ -303,7 +320,8 @@ jobs:
|
||||
.ci/docker/manywheel/build.sh manylinux2_28_aarch64-builder:cpu-aarch64
|
||||
build-docker-cpu-cxx11-abi:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: linux.9xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
|
||||
env:
|
||||
GPU_ARCH_TYPE: cpu-cxx11-abi
|
||||
steps:
|
||||
@ -334,7 +352,8 @@ jobs:
|
||||
.ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi
|
||||
build-docker-xpu:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: linux.9xlarge.ephemeral
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
|
||||
env:
|
||||
GPU_ARCH_TYPE: xpu
|
||||
steps:
|
||||
|
21  .github/workflows/build-triton-wheel.yml  vendored
@ -13,7 +13,6 @@ on:
|
||||
- .github/scripts/build_triton_wheel.py
|
||||
- .github/ci_commit_pins/triton.txt
|
||||
- .ci/docker/ci_commit_pins/triton.txt
|
||||
- .ci/docker/ci_commit_pins/triton-rocm.txt
|
||||
- .ci/docker/ci_commit_pins/triton-xpu.txt
|
||||
pull_request:
|
||||
paths:
|
||||
@ -21,7 +20,6 @@ on:
|
||||
- .github/scripts/build_triton_wheel.py
|
||||
- .github/ci_commit_pins/triton.txt
|
||||
- .ci/docker/ci_commit_pins/triton.txt
|
||||
- .ci/docker/ci_commit_pins/triton-rocm.txt
|
||||
- .ci/docker/ci_commit_pins/triton-xpu.txt
|
||||
|
||||
concurrency:
|
||||
@ -29,9 +27,19 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
build-wheel:
|
||||
name: "Build Triton Wheel"
|
||||
runs-on: [self-hosted, linux.2xlarge]
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@ -120,7 +128,7 @@ jobs:
|
||||
fi
|
||||
docker exec -t "${container_name}" chown -R 1000.1000 /artifacts
|
||||
|
||||
- uses: actions/upload-artifact@v3
|
||||
- uses: actions/upload-artifact@v4.4.0
|
||||
with:
|
||||
name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }}
|
||||
if-no-files-found: error
|
||||
@ -201,7 +209,8 @@ jobs:
|
||||
|
||||
build-conda:
|
||||
name: "Build Triton Conda"
|
||||
runs-on: [self-hosted, linux.2xlarge]
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@ -253,7 +262,7 @@ jobs:
|
||||
docker exec -t "${container_name}" python /pytorch/.github/scripts/build_triton_wheel.py --build-conda --py-version="${PY_VERS}" $RELEASE
|
||||
docker exec -t "${container_name}" chown -R 1000.1000 /artifacts
|
||||
|
||||
- uses: actions/upload-artifact@v3
|
||||
- uses: actions/upload-artifact@v4.4.0
|
||||
with:
|
||||
name: pytorch-triton-conda-${{ matrix.py_vers }}
|
||||
if-no-files-found: error
|
||||
|
17  .github/workflows/create_release.yml  vendored
@ -16,6 +16,15 @@ on:
|
||||
paths: [.github/workflows/create_release.yml]
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
release:
|
||||
if: ${{ github.repository == 'pytorch/pytorch' }}
|
||||
name: Create Release
|
||||
@ -63,7 +72,7 @@ jobs:
|
||||
files: ${{env.PT_RELEASE_FILE}}
|
||||
- name: Upload source distribution to GHA artifacts for release tags
|
||||
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
|
||||
uses: actions/upload-artifact@v2
|
||||
uses: actions/upload-artifact@v4.4.0
|
||||
with:
|
||||
name: ${{ env.PT_RELEASE_FILE }}
|
||||
path: ${{ env.PT_RELEASE_FILE }}
|
||||
@ -73,12 +82,14 @@ jobs:
|
||||
|
||||
upload_source_code_to_s3:
|
||||
if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
|
||||
runs-on: linux.2xlarge
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
|
||||
environment: sourcecode-upload
|
||||
name: Upload source code to S3 for release tags
|
||||
permissions:
|
||||
id-token: write
|
||||
needs: release
|
||||
needs:
|
||||
- get-label-type
|
||||
- release
|
||||
steps:
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
with:
|
||||
|
12  .github/workflows/docker-builds.yml  vendored
@ -30,8 +30,18 @@ env:
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
docker-build:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
needs: get-label-type
|
||||
timeout-minutes: 240
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@ -68,7 +78,7 @@ jobs:
|
||||
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
|
||||
runner: linux.arm64.m7g.4xlarge
|
||||
timeout-minutes: 600
|
||||
runs-on: [self-hosted, "${{ matrix.runner }}"]
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}"
|
||||
env:
|
||||
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }}
|
||||
steps:
|
||||
|
18  .github/workflows/docker-release.yml  vendored
@ -34,9 +34,19 @@ env:
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
generate-matrix:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
runs-on: [self-hosted, linux.large]
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.large"
|
||||
outputs:
|
||||
matrix: ${{ steps.generate-matrix.outputs.matrix }}
|
||||
steps:
|
||||
@ -54,10 +64,12 @@ jobs:
|
||||
|
||||
build:
|
||||
if: ${{ github.repository == 'pytorch/pytorch' }}
|
||||
runs-on: [self-hosted, linux.2xlarge]
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
|
||||
environment: ${{ (github.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
timeout-minutes: 240
|
||||
needs: generate-matrix
|
||||
needs:
|
||||
- generate-matrix
|
||||
- get-label-type
|
||||
strategy:
|
||||
matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
|
||||
fail-fast: false
|
||||
|
20  .github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml  generated  vendored
@ -58,6 +58,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
@ -81,6 +82,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -103,6 +105,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cpu-aarch64
|
||||
secrets:
|
||||
@ -125,6 +128,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
|
||||
DESIRED_DEVTOOLSET: cxx11-abi
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
@ -149,6 +153,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
|
||||
DESIRED_DEVTOOLSET: cxx11-abi
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda-aarch64
|
||||
secrets:
|
||||
@ -170,6 +175,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
@ -193,6 +199,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -215,6 +222,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cpu-aarch64
|
||||
secrets:
|
||||
@ -237,6 +245,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
|
||||
DESIRED_DEVTOOLSET: cxx11-abi
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
@ -261,6 +270,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
|
||||
DESIRED_DEVTOOLSET: cxx11-abi
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda-aarch64
|
||||
secrets:
|
||||
@ -282,6 +292,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
@ -305,6 +316,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -327,6 +339,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cpu-aarch64
|
||||
secrets:
|
||||
@ -349,6 +362,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
|
||||
DESIRED_DEVTOOLSET: cxx11-abi
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
@ -373,6 +387,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
|
||||
DESIRED_DEVTOOLSET: cxx11-abi
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda-aarch64
|
||||
secrets:
|
||||
@ -394,6 +409,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
@ -417,6 +433,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cpu-aarch64
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
@ -439,6 +456,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cpu-aarch64
|
||||
secrets:
|
||||
@ -461,6 +479,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
|
||||
DESIRED_DEVTOOLSET: cxx11-abi
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
@ -485,6 +504,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
|
||||
DESIRED_DEVTOOLSET: cxx11-abi
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda-aarch64
|
||||
secrets:
|
||||
|
147  .github/workflows/generated-linux-binary-manywheel-main.yml  generated  vendored
@ -54,6 +54,7 @@ jobs:
|
||||
GPU_ARCH_VERSION: 11.8
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda11_8
|
||||
@ -77,6 +78,7 @@ jobs:
|
||||
GPU_ARCH_VERSION: 11.8
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
@ -85,53 +87,6 @@ jobs:
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
manywheel-py3_9-cuda11_8-split-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu118
|
||||
GPU_ARCH_VERSION: 11.8
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda11_8-split
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda11_8-split-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-cuda11_8-split-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu118
|
||||
GPU_ARCH_VERSION: 11.8
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda11_8-split
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
manywheel-py3_9-cuda12_1-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -146,6 +101,7 @@ jobs:
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_1
|
||||
@ -169,6 +125,7 @@ jobs:
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
@ -177,53 +134,6 @@ jobs:
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
manywheel-py3_9-cuda12_1-split-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_1-split
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda12_1-split-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-cuda12_1-split-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_1-split
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
manywheel-py3_9-cuda12_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -238,6 +148,7 @@ jobs:
|
||||
GPU_ARCH_VERSION: 12.4
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_4
|
||||
@ -261,6 +172,7 @@ jobs:
|
||||
GPU_ARCH_VERSION: 12.4
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
@ -268,50 +180,3 @@ jobs:
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
manywheel-py3_9-cuda12_4-split-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu124
|
||||
GPU_ARCH_VERSION: 12.4
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_4-split
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda12_4-split-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-cuda12_4-split-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu124
|
||||
GPU_ARCH_VERSION: 12.4
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_4-split
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
1179  .github/workflows/generated-linux-binary-manywheel-nightly.yml  generated  vendored
File diff suppressed because it is too large
182  .github/workflows/generated-linux-binary-manywheel-split-main  vendored  Normal file
@ -0,0 +1,182 @@
|
||||
# @generated DO NOT EDIT MANUALLY
|
||||
|
||||
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
|
||||
# Generation script: .github/scripts/generate_ci_workflows.py
|
||||
name: linux-binary-manywheel-split
|
||||
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- 'ciflow/periodic/*'
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
# Needed for conda builds
|
||||
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
|
||||
ANACONDA_USER: pytorch
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
BINARY_ENV_FILE: /tmp/env
|
||||
BUILD_ENVIRONMENT: linux-binary-manywheel-split
|
||||
BUILDER_ROOT: /builder
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
|
||||
PYTORCH_ROOT: /pytorch
|
||||
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
SKIP_ALL_TESTS: 0
|
||||
concurrency:
|
||||
group: linux-binary-manywheel-split-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
manywheel-py3_9-cuda11_8-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu118
|
||||
GPU_ARCH_VERSION: 11.8
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda11_8
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda11_8-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-cuda11_8-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu118
|
||||
GPU_ARCH_VERSION: 11.8
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda11_8
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
manywheel-py3_9-cuda12_1-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_1
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda12_1-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-cuda12_1-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_1
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
manywheel-py3_9-cuda12_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu124
|
||||
GPU_ARCH_VERSION: 12.4
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_4
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cuda12_4-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_9-cuda12_4-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu124
|
||||
GPU_ARCH_VERSION: 12.4
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_4
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
1516  .github/workflows/generated-linux-binary-manywheel-split-nightly  vendored  Normal file
File diff suppressed because it is too large
15  .github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml  generated  vendored
@ -58,6 +58,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runs_on: linux.s390x
|
||||
ALPINE_IMAGE: "docker.io/s390x/alpine"
|
||||
@ -81,6 +82,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cpu-s390x
|
||||
build_environment: linux-s390x-binary-manywheel
|
||||
@ -103,6 +105,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cpu-s390x
|
||||
secrets:
|
||||
@ -124,6 +127,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runs_on: linux.s390x
|
||||
ALPINE_IMAGE: "docker.io/s390x/alpine"
|
||||
@ -147,6 +151,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cpu-s390x
|
||||
build_environment: linux-s390x-binary-manywheel
|
||||
@ -169,6 +174,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cpu-s390x
|
||||
secrets:
|
||||
@ -190,6 +196,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runs_on: linux.s390x
|
||||
ALPINE_IMAGE: "docker.io/s390x/alpine"
|
||||
@ -213,6 +220,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cpu-s390x
|
||||
build_environment: linux-s390x-binary-manywheel
|
||||
@ -235,6 +243,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cpu-s390x
|
||||
secrets:
|
||||
@ -256,6 +265,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runs_on: linux.s390x
|
||||
ALPINE_IMAGE: "docker.io/s390x/alpine"
|
||||
@ -279,6 +289,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cpu-s390x
|
||||
build_environment: linux-s390x-binary-manywheel
|
||||
@ -301,6 +312,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cpu-s390x
|
||||
secrets:
|
||||
@ -322,6 +334,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runs_on: linux.s390x
|
||||
ALPINE_IMAGE: "docker.io/s390x/alpine"
|
||||
@ -345,6 +358,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cpu-s390x
|
||||
build_environment: linux-s390x-binary-manywheel
|
||||
@ -367,6 +381,7 @@ jobs:
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu-s390x
|
||||
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cpu-s390x
|
||||
secrets:
|
||||
|
2  .github/workflows/generated-macos-arm64-binary-libtorch-cxx11-abi-nightly.yml  generated  vendored
@ -49,7 +49,7 @@ jobs:
|
||||
DESIRED_DEVTOOLSET: cxx11-abi
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
# NOTE: These environment variables are put here so that they can be applied on every job equally
|
||||
# They are also here because setting them at a workflow level doesn't give us access to the
|
||||
|
4  .github/workflows/generated-windows-binary-libtorch-debug-main.yml  generated  vendored
@ -51,7 +51,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -169,7 +169,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
|
24  .github/workflows/generated-windows-binary-libtorch-debug-nightly.yml  generated  vendored
@ -58,7 +58,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -176,7 +176,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -290,7 +290,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: libtorch-cpu-shared-with-deps-debug
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -316,7 +316,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -435,7 +435,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -550,7 +550,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: libtorch-cuda11_8-shared-with-deps-debug
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -576,7 +576,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -695,7 +695,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -810,7 +810,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: libtorch-cuda12_1-shared-with-deps-debug
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -836,7 +836,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -955,7 +955,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -1070,7 +1070,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: libtorch-cuda12_4-shared-with-deps-debug
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
4  .github/workflows/generated-windows-binary-libtorch-release-main.yml  generated  vendored
@ -51,7 +51,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -169,7 +169,7 @@ jobs:
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.8"
|
||||
DESIRED_PYTHON: "3.9"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
|
24  .github/workflows/generated-windows-binary-libtorch-release-nightly.yml  (generated, vendored)
@@ -58,7 +58,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@@ -176,7 +176,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@@ -290,7 +290,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cpu-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -316,7 +316,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@@ -435,7 +435,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@@ -550,7 +550,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda11_8-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -576,7 +576,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@@ -695,7 +695,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@@ -810,7 +810,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_1-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -836,7 +836,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@@ -955,7 +955,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
shell: bash
@@ -1070,7 +1070,7 @@ jobs:
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
DESIRED_PYTHON: "3.9"
build_name: libtorch-cuda12_4-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
11  .github/workflows/inductor-cu124.yml  (vendored)
@@ -18,11 +18,22 @@ concurrency:
permissions: read-all

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
# Should be synced with the one in inductor.yml, but this doesn't run inductor_timm
name: cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
40  .github/workflows/inductor-micro-benchmark-x86.yml  (vendored, new file)
@@ -0,0 +1,40 @@
name: inductor-micro-benchmark-x86

on:
schedule:
- cron: 0 7 * * *
push:
tags:
- ciflow/inductor-micro-benchmark-cpu-x86/*
workflow_dispatch:


concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions: read-all

jobs:
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
# Use metal host for benchmark jobs
test-matrix: |
{ include: [
{ config: "inductor-micro-benchmark-cpu-x86", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
]}

linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
with:
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha
timeout-minutes: 720
11  .github/workflows/inductor-micro-benchmark.yml  (vendored)
@@ -16,10 +16,21 @@ concurrency:
permissions: read-all

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
11  .github/workflows/inductor-perf-compare.yml  (vendored)
@@ -13,10 +13,21 @@ concurrency:
permissions: read-all

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
@@ -68,10 +68,21 @@ concurrency:
permissions: read-all

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
@@ -50,10 +50,21 @@ concurrency:
permissions: read-all

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-jammy-aarch64-py3_10-inductor-build:
name: linux-jammy-aarch64-py3.10-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.arm64.m7g.4xlarge
build-environment: linux-jammy-aarch64-py3.10
docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
@@ -48,10 +48,21 @@ concurrency:
permissions: read-all

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
test-matrix: |
11  .github/workflows/inductor-perf-test-nightly.yml  (vendored)
@@ -66,10 +66,21 @@ concurrency:
permissions: read-all

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
13  .github/workflows/inductor-periodic.yml  (vendored)
@@ -18,10 +18,21 @@ concurrency:
permissions: read-all

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-focal-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build:
name: cuda12.1-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
@@ -60,7 +71,9 @@ jobs:
linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
11  .github/workflows/inductor-rocm.yml  (vendored)
@@ -22,10 +22,21 @@ concurrency:
permissions: read-all

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-focal-rocm6_1-py3_8-inductor-build:
name: rocm6.1-py3.8-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm6.1-py3.8
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |
2  .github/workflows/lint.yml  (vendored)
@@ -223,7 +223,7 @@ jobs:
cache: pip
- name: Install dependencies
run: |
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.1.* numpy==1.24.*
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.2.* fbscribelogger==0.1.* numpy==1.24.*
pip install torch --pre --index-url https://download.pytorch.org/whl/nightly/cpu/
- name: Run run_test.py (nonretryable)
run: |
18  .github/workflows/periodic.yml  (vendored)
@@ -57,8 +57,10 @@ jobs:
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-test:
@@ -87,8 +89,10 @@ jobs:
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}

@@ -333,8 +337,10 @@ jobs:
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
8  .github/workflows/rocm.yml  (vendored)
@@ -3,18 +3,12 @@ name: rocm
on:
push:
branches:
# - main
- main
- release/*
tags:
- ciflow/rocm/*
workflow_dispatch:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 0,8,16 * * 1-5
- cron: 45 4 * * 0,6
- cron: 45 4,12,20 * * 1-5
- cron: 45 12 * * 0,6
- cron: 29 8 * * * # about 1:29am PDT

concurrency:
19  .github/workflows/slow.yml  (vendored)
@@ -56,12 +56,14 @@ jobs:
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 6, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 7, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 8, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}

linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-test:
@@ -87,8 +89,9 @@ jobs:
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}

linux-focal-cuda12_1-py3_10-gcc9-sm86-test:
@@ -10,8 +10,18 @@ permissions:
contents: read

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

index:
runs-on: linux.g5.4xlarge.nvidia.gpu # 1 GPU A10G 24GB each
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" # 1 GPU A10G 24GB each
environment: target-determinator-env
steps:
- name: Clone PyTorch
11  .github/workflows/torchbench.yml  (vendored)
@@ -11,10 +11,21 @@ concurrency:
cancel-in-progress: true

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

linux-focal-cuda12_1-py3_10-gcc9-torchbench-build-gcp:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
6  .github/workflows/trunk.yml  (vendored)
@@ -266,8 +266,10 @@ jobs:
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
4  .github/workflows/upload-test-stats.yml  (vendored)
@@ -2,7 +2,7 @@ name: Upload test stats

on:
workflow_run:
workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, inductor-micro-benchmark, inductor-cu124, inductor-rocm]
workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, inductor-micro-benchmark, inductor-micro-benchmark-x86, inductor-cu124, inductor-rocm]
types:
- completed

@@ -96,7 +96,7 @@ jobs:
python3 -m tools.stats.check_disabled_tests --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}"

- name: Upload gpt-fast benchmark results to Rockset
if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success' && github.event.workflow_run.name == 'inductor-micro-benchmark'
if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success' && contains('inductor-micro-benchmark', github.event.workflow_run.name)
env:
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
@@ -138,7 +138,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'numpy==1.24.3 ; python_version == "3.8"',
'numpy==1.26.0 ; python_version >= "3.9"',
'expecttest==0.1.6',
'expecttest==0.2.1',
'mypy==1.10.0',
'sympy==1.12.1 ; python_version == "3.8"',
'sympy==1.13.0 ; python_version >= "3.9"',
@@ -210,6 +210,8 @@ include_patterns = [
'aten/src/ATen/native/nested/*.h',
'c10/**/*.cpp',
'c10/**/*.h',
'caffe2/**/*.cc',
'caffe2/**/*.h',
'torch/*.h',
'torch/csrc/*.h',
'torch/csrc/*.cpp',
1  BUCK.oss
@@ -65,7 +65,6 @@ cxx_library(
"caffe2/serialize/file_adapter.cc",
"caffe2/serialize/inline_container.cc",
"caffe2/serialize/istream_adapter.cc",
"caffe2/serialize/read_adapter_interface.cc",
],
visibility = ["PUBLIC"],
deps = [
@@ -332,6 +332,7 @@ intern_build_aten_ops(
"@fbgemm",
"@mkl",
"@sleef",
"@mkl_dnn//:mkl-dnn",
],
)

@@ -472,7 +473,6 @@ filegroup(
"caffe2/serialize/file_adapter.cc",
"caffe2/serialize/inline_container.cc",
"caffe2/serialize/istream_adapter.cc",
"caffe2/serialize/read_adapter_interface.cc",
],
)
@@ -57,7 +57,6 @@ nn/qat/ @jerryzh168
# Docker
/.ci/docker/ @jeffdaily
/.ci/docker/ci_commit_pins/triton.txt @desertfire @Chillee @eellison @shunting314 @bertmaher @jeffdaily @jataylo @jithunnair-amd @pruthvistony
/.ci/docker/ci_commit_pins/triton-rocm.txt @jeffdaily @jataylo @jithunnair-amd @pruthvistony
/.ci/docker/ci_commit_pins/triton-xpu.txt @EikanWang @gujinghui

# Github Actions
@@ -50,6 +50,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:

| PyTorch version | Python | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- |
| 2.5 | >=3.9, <=3.12, (3.13 experimental) | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 |
| 2.4 | >=3.8, <=3.12 | CUDA 11.8, CUDA 12.1, CUDNN 9.1.0.70 | CUDA 12.4, CUDNN 9.1.0.70 | ROCm 6.1 |
| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |
@@ -299,6 +299,15 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)

#define AT_DISPATCH_CASE_FLOATING_TYPES_AND5( \
SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \
AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \
AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)

#define AT_DISPATCH_FLOATING_TYPES_AND4( \
SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
AT_DISPATCH_SWITCH( \
@@ -307,6 +316,26 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
AT_DISPATCH_CASE_FLOATING_TYPES_AND4( \
SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))

#define AT_DISPATCH_FLOATING_TYPES_AND5( \
SCALARTYPE1, \
SCALARTYPE2, \
SCALARTYPE3, \
SCALARTYPE4, \
SCALARTYPE5, \
TYPE, \
NAME, \
...) \
AT_DISPATCH_SWITCH( \
TYPE, \
NAME, \
AT_DISPATCH_CASE_FLOATING_TYPES_AND5( \
SCALARTYPE1, \
SCALARTYPE2, \
SCALARTYPE3, \
SCALARTYPE4, \
SCALARTYPE5, \
__VA_ARGS__))

#define AT_DISPATCH_CASE_COMPLEX_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::ComplexDouble, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::ComplexFloat, __VA_ARGS__)
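Aside: a minimal usage sketch of the new AT_DISPATCH_FLOATING_TYPES_AND5 macro. The op name, the tensor, and the five extra scalar types chosen below are illustrative assumptions and not part of this diff; the macro dispatches the lambda over float/double plus the five listed types, with scalar_t bound inside the lambda.

// Hedged usage sketch; "my_op", `self`, and the lambda body are placeholders.
AT_DISPATCH_FLOATING_TYPES_AND5(
    at::ScalarType::Half,
    at::ScalarType::BFloat16,
    at::ScalarType::Float8_e4m3fn,
    at::ScalarType::Float8_e5m2,
    at::ScalarType::Float8_e4m3fnuz,
    self.scalar_type(),
    "my_op",
    [&] {
      scalar_t* data = self.data_ptr<scalar_t>();
      // ... kernel body operating on data ...
    });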
@@ -707,7 +707,12 @@ bool are_all_mutations_under_no_grad_or_inference_mode(const Tensor& functional_
}

bool isFunctionalTensor(const at::Tensor& tensor) {
return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize);
return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize);
}

bool isBaseTensor(const at::Tensor& tensor) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isFunctionalTensor(tensor));
return unsafeGetFunctionalWrapper(tensor)->isBaseTensor();
}

bool isFunctionalTensor(const std::optional<Tensor>& t) {
@@ -165,6 +165,12 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
was_storage_changed_ = true;
}

// A FunctionalTensor is considered a base if its not a view of another
// tensor.
bool isBaseTensor() const {
return view_metas_.empty();
}

c10::SymInt get_storage_size(bool before) {
return functional_storage_impl()->get_storage_size(before);
}
@@ -290,6 +296,8 @@ TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
return functional_impl;
}

TORCH_API bool isBaseTensor(const at::Tensor& tensor);

TORCH_API bool isFunctionalTensor(const at::Tensor& tensor);
TORCH_API bool isFunctionalTensor(const std::optional<Tensor>& t);
TORCH_API bool isFunctionalTensor(
@@ -69,7 +69,7 @@ thread_local std::array<at::ScalarType, at::COMPILE_TIME_MAX_DEVICE_TYPES>
at::ScalarType::Undefined, // Vulkan
at::ScalarType::Undefined, // Metal
at::kHalf, // XPU
at::ScalarType::Undefined, // MPS
at::kHalf, // MPS
at::ScalarType::Undefined, // Meta (tensors with no data)
at::kBFloat16, // HPU / HABANA
at::ScalarType::Undefined, // SX-Aurora / NEC
@@ -206,6 +206,118 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
TORCH_FN((&at::autocast::binary_cross_entropy_banned)));
}

TORCH_LIBRARY_IMPL(_, AutocastMPS, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}

TORCH_LIBRARY_IMPL(aten, AutocastMPS, m) {
// lower_precision_fp
KERNEL_MPS2(_convolution, deprecated, lower_precision_fp)
KERNEL_MPS(_convolution, lower_precision_fp)
KERNEL_MPS(conv1d, lower_precision_fp)
KERNEL_MPS(conv2d, lower_precision_fp)
KERNEL_MPS(conv_tbc, lower_precision_fp)
KERNEL_MPS(conv_transpose1d, lower_precision_fp)
KERNEL_MPS2(conv_transpose2d, input, lower_precision_fp)
KERNEL_MPS(convolution, lower_precision_fp)
KERNEL_MPS(_mps_convolution, lower_precision_fp)
KERNEL_MPS(prelu, lower_precision_fp)
KERNEL_MPS(addmm, lower_precision_fp)
KERNEL_MPS(addmv, lower_precision_fp)
KERNEL_MPS(addr, lower_precision_fp)
KERNEL_MPS(matmul, lower_precision_fp)
KERNEL_MPS(einsum, lower_precision_fp)
KERNEL_MPS(mm, lower_precision_fp)
KERNEL_MPS(mv, lower_precision_fp)
KERNEL_MPS(linear, lower_precision_fp)
KERNEL_MPS(addbmm, lower_precision_fp)
KERNEL_MPS(baddbmm, lower_precision_fp)
KERNEL_MPS(bmm, lower_precision_fp)
KERNEL_MPS(chain_matmul, lower_precision_fp)
KERNEL_MPS(linalg_multi_dot, lower_precision_fp)
KERNEL_MPS(lstm_cell, lower_precision_fp)

// fp32
KERNEL_MPS(acos, fp32)
KERNEL_MPS(asin, fp32)
KERNEL_MPS(cosh, fp32)
KERNEL_MPS(erfinv, fp32)
KERNEL_MPS(exp, fp32)
KERNEL_MPS(expm1, fp32)
KERNEL_MPS(log, fp32)
KERNEL_MPS(log10, fp32)
KERNEL_MPS(log2, fp32)
KERNEL_MPS(log1p, fp32)
KERNEL_MPS(reciprocal, fp32)
KERNEL_MPS(rsqrt, fp32)
KERNEL_MPS(sinh, fp32)
KERNEL_MPS(tan, fp32)
KERNEL_MPS2(pow, Tensor_Scalar, fp32)
KERNEL_MPS2(pow, Tensor_Tensor, fp32)
KERNEL_MPS2(pow, Scalar, fp32)
KERNEL_MPS(softplus, fp32)
KERNEL_MPS(layer_norm, fp32)
KERNEL_MPS(native_layer_norm, fp32)
KERNEL_MPS(group_norm, fp32)
KERNEL_MPS2(frobenius_norm, dim, fp32)
KERNEL_MPS(nuclear_norm, fp32)
KERNEL_MPS2(nuclear_norm, dim, fp32)
KERNEL_MPS(batch_norm, fp32)
KERNEL_MPS(cosine_similarity, fp32)
KERNEL_MPS(poisson_nll_loss, fp32)
KERNEL_MPS(cosine_embedding_loss, fp32)
KERNEL_MPS(nll_loss, fp32)
KERNEL_MPS(nll_loss2d, fp32)
KERNEL_MPS(hinge_embedding_loss, fp32)
KERNEL_MPS(kl_div, fp32)
KERNEL_MPS(l1_loss, fp32)
KERNEL_MPS(smooth_l1_loss, fp32)
KERNEL_MPS(huber_loss, fp32)
KERNEL_MPS(mse_loss, fp32)
KERNEL_MPS(margin_ranking_loss, fp32)
KERNEL_MPS(multilabel_margin_loss, fp32)
KERNEL_MPS(soft_margin_loss, fp32)
KERNEL_MPS(triplet_margin_loss, fp32)
KERNEL_MPS(multi_margin_loss, fp32)
KERNEL_MPS(binary_cross_entropy_with_logits, fp32)
KERNEL_MPS(dist, fp32)
KERNEL_MPS(pdist, fp32)
KERNEL_MPS(cdist, fp32)
KERNEL_MPS(renorm, fp32)
KERNEL_MPS(logsumexp, fp32)

// fp32_set_opt_dtype
KERNEL_MPS(prod, fp32)
KERNEL_MPS2(prod, dim_int, fp32)
KERNEL_MPS2(prod, dim_Dimname, fp32)
KERNEL_MPS2(softmax, int, fp32)
KERNEL_MPS2(softmax, Dimname, fp32)
KERNEL_MPS2(log_softmax, int, fp32)
KERNEL_MPS2(log_softmax, Dimname, fp32)
KERNEL_MPS(cumprod, fp32)
KERNEL_MPS2(cumprod, dimname, fp32)
KERNEL_MPS(cumsum, fp32)
KERNEL_MPS2(cumsum, dimname, fp32)
KERNEL_MPS(linalg_vector_norm, fp32)
KERNEL_MPS(linalg_matrix_norm, fp32)
KERNEL_MPS2(linalg_matrix_norm, str_ord, fp32)
KERNEL_MPS(sum, fp32)
KERNEL_MPS2(sum, dim_IntList, fp32)
KERNEL_MPS2(sum, dim_DimnameList, fp32)
//
// promote
KERNEL_MPS(addcdiv, promote)
KERNEL_MPS(addcmul, promote)
KERNEL_MPS(atan2, promote)
KERNEL_MPS(bilinear, promote)
KERNEL_MPS(cross, promote)
KERNEL_MPS(dot, promote)
KERNEL_MPS(grid_sampler, promote)
KERNEL_MPS(index_put, promote)
KERNEL_MPS(tensordot, promote)
KERNEL_MPS(scatter_add, promote)
}

TORCH_LIBRARY_IMPL(_, AutocastCPU, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
@@ -145,6 +145,8 @@ inline bool is_autocast_eligible(
return tensor.is_xla() && tensor.is_floating_point();
case c10::DeviceType::PrivateUse1:
return tensor.is_privateuseone() && tensor.is_floating_point();
case c10::DeviceType::MPS:
return tensor.is_mps() && tensor.is_floating_point();
default:
return false;
}
@@ -168,6 +170,8 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type(
return DispatchKey::AutocastXLA;
case c10::DeviceType::PrivateUse1:
return DispatchKey::AutocastPrivateUse1;
case c10::DeviceType::MPS:
return DispatchKey::AutocastMPS;
default:
throw std::runtime_error(
"unknown device type for autocast in get_autocast_dispatch_key_from_device_type");
@@ -178,7 +182,7 @@ inline bool is_autocast_available(c10::DeviceType device_type) {
if (device_type == at::kCPU || device_type == at::kCUDA ||
device_type == at::kXPU || device_type == at::kIPU ||
device_type == at::kHPU || device_type == at::kXLA ||
device_type == at::kPrivateUse1) {
device_type == at::kPrivateUse1 || device_type == at::kMPS) {
return true;
} else {
return false;
@@ -745,6 +749,27 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions.
REDISPATCH_SIGNATURE, \
POLICY)

// KERNEL_MPS registration for AutocastMPS
#define KERNEL_MPS(OP, POLICY) \
m.impl( \
TORCH_SELECTIVE_NAME("aten::" #OP), \
&WrapFunction< \
CastPolicy::POLICY, \
DeviceType::MPS, \
decltype(ATEN_FN(OP)), \
decltype(ATEN_FN(OP)), \
&ATEN_FN(OP)>::type::call);

#define KERNEL_MPS2(OP, OVERLOAD, POLICY) \
m.impl( \
TORCH_SELECTIVE_NAME("aten::" #OP "." #OVERLOAD), \
&WrapFunction< \
CastPolicy::POLICY, \
DeviceType::MPS, \
decltype(ATEN_FN2(OP, OVERLOAD)), \
decltype(ATEN_FN2(OP, OVERLOAD)), \
&ATEN_FN2(OP, OVERLOAD)>::type::call);

// Op lists for different policies.
// To make sure other backends can reuse the policy op list.
#define AT_FORALL_LOWER_PRECISION_FP(_) \
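For reference, a registration such as KERNEL_MPS(mm, lower_precision_fp) in the AutocastMPS block above expands roughly as follows. This is a sketch read off the macro definition in this hunk, not verbatim preprocessor output:

// Approximate expansion of KERNEL_MPS(mm, lower_precision_fp):
m.impl(
    TORCH_SELECTIVE_NAME("aten::mm"),
    &WrapFunction<
        CastPolicy::lower_precision_fp,
        DeviceType::MPS,
        decltype(ATEN_FN(mm)),
        decltype(ATEN_FN(mm)),
        &ATEN_FN(mm)>::type::call);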
@@ -228,6 +228,7 @@ namespace c10 {
_(aten, is_autocast_cpu_enabled) \
_(aten, is_autocast_xla_enabled) \
_(aten, get_autocast_dtype) \
_(aten, is_autocast_mps_enabled) \
FORALL_ATEN_BASE_SYMBOLS(_) \
_(onnx, Add) \
_(onnx, Concat) \
@@ -9,7 +9,7 @@
#endif

namespace at::cpu {
bool is_cpu_support_avx2() {
bool is_avx2_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx2();
#else
@@ -17,7 +17,7 @@ bool is_cpu_support_avx2() {
#endif
}

bool is_cpu_support_avx512() {
bool is_avx512_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512vl() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq();
#else
@@ -25,7 +25,7 @@ bool is_cpu_support_avx512() {
#endif
}

bool is_cpu_support_avx512_vnni() {
bool is_avx512_vnni_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx512vnni();
#else
@@ -33,7 +33,15 @@ bool is_cpu_support_avx512_vnni() {
#endif
}

bool is_cpu_support_amx_tile() {
bool is_avx512_bf16_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx512bf16();
#else
return false;
#endif
}

bool is_amx_tile_supported() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_amx_tile();
#else
@@ -42,7 +50,7 @@ bool is_cpu_support_amx_tile() {
}

bool init_amx() {
if (!is_cpu_support_amx_tile()) {
if (!is_amx_tile_supported()) {
return false;
}
@@ -6,14 +6,17 @@

namespace at::cpu {

TORCH_API bool is_cpu_support_avx2();
TORCH_API bool is_cpu_support_avx512();
TORCH_API bool is_avx2_supported();
TORCH_API bool is_avx512_supported();

// Detect if CPU support Vector Neural Network Instruction.
TORCH_API bool is_cpu_support_avx512_vnni();
TORCH_API bool is_avx512_vnni_supported();

// Detect if CPU supports AVX512_BF16 ISA
TORCH_API bool is_avx512_bf16_supported();

// Detect if CPU support Advanced Matrix Extension.
TORCH_API bool is_cpu_support_amx_tile();
TORCH_API bool is_amx_tile_supported();

// Enable the system to use AMX instructions.
TORCH_API bool init_amx();
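A hedged usage sketch of the renamed capability checks above. The header path and the kernel-selection logic are assumptions for illustration; only the at::cpu function names come from this diff:

#include <ATen/cpu/Utils.h>  // assumed location of the declarations above

void pick_cpu_kernel() {
  // Prefer the widest ISA the host supports, falling back in order.
  if (at::cpu::is_amx_tile_supported() && at::cpu::init_amx()) {
    // AMX path
  } else if (at::cpu::is_avx512_bf16_supported()) {
    // AVX512-BF16 path
  } else if (at::cpu::is_avx2_supported()) {
    // AVX2 path
  }
}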
@@ -636,6 +636,21 @@ inline void transpose_mxn<float, 8, 8>(
_mm256_storeu_ps(&dst[7 * ld_dst], th);
}

template<>
inline void transpose_mxn<float, 16, 16>(
const float* src,
int64_t ld_src,
float* dst,
int64_t ld_dst) {
transpose_mxn<float, 8, 8>(
src , ld_src, dst, ld_dst);
transpose_mxn<float, 8, 8>(
src + 8, ld_src, dst + 8 * ld_dst, ld_dst);
transpose_mxn<float, 8, 8>(
src + 8 * ld_src, ld_src, dst + 8, ld_dst);
transpose_mxn<float, 8, 8>(
src + 8 * ld_src + 8, ld_src, dst + 8 * ld_dst + 8, ld_dst);
}
#endif

}} // namespace at::vec::CPU_CAPABILITY
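The 16x16 specialization above builds the transpose out of four 8x8 transposes, using the block identity that transposing [[A, B], [C, D]] gives [[A^T, C^T], [B^T, D^T]]: block (bi, bj) of the source lands at block (bj, bi) of the destination. A plain scalar reference of what the four vectorized calls compute together (illustrative only, not part of the change):

#include <cstdint>

// Reference semantics of the 16x16 tile transpose.
void transpose_16x16_ref(const float* src, int64_t ld_src, float* dst, int64_t ld_dst) {
  for (int i = 0; i < 16; ++i) {
    for (int j = 0; j < 16; ++j) {
      dst[j * ld_dst + i] = src[i * ld_src + j];
    }
  }
}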
@@ -582,8 +582,7 @@ Vectorized<float> inline fmsub(const Vectorized<float>& a, const Vectorized<floa
// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L230-L304
// kernel for transposing mxn where m, n <= 16
// M + (M + 1) / 2 * 2 + (M + 3) / 4 * 4 + (M + 7) / 8 * 8 + 2 * N instructions
template <>
inline void transpose_mxn<float>(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) {
inline void transpose_mxn_16x16(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) {
TORCH_CHECK(M <= 16 && N <= 16, "transpose_mxn<float> expects M, N <= 16.");
// load from src to registers
__m512 input[16];
@@ -667,8 +666,39 @@ inline void transpose_mxn<float>(const float* src, int64_t ld_src, float* dst, i
}
}

template<>
inline void transpose_mxn<float>(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) {
int64_t i = 0;
for (; i < M / 16 * 16; i += 16) {
int64_t j = 0;
for (; j < N / 16 * 16; j += 16) {
transpose_mxn_16x16(
src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, 16, 16);
}
// handle remainder j
int nrem = N - j;
if (nrem > 0) {
transpose_mxn_16x16(
src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, 16, nrem);
}
}
// handle remainder i
int mrem = M - i;
if (mrem > 0) {
int j = 0;
for (; j < N / 16 * 16; j += 16) {
transpose_mxn_16x16(
src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, mrem, 16);
}
// handle remainder j
int nrem = N - j;
transpose_mxn_16x16(
src + i * ld_src + j, ld_src, dst + j * ld_dst + i, ld_dst, mrem, nrem);
}
}

template <typename T, int M, int N,
typename std::enable_if_t<std::is_same<T, float>::value && M <= 16 && N <= 16, int> = 0>
typename std::enable_if_t<std::is_same<T, float>::value, int> = 0>
inline void transpose_mxn(const float* src, int64_t ld_src, float* dst, int64_t ld_dst) {
transpose_mxn<float>(src, ld_src, dst, ld_dst, M, N);
}
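Worked example of the tiling above: with M = 40 and N = 35, the full 16-blocks cover the 32x32 interior (four 16x16 tiles), the column remainder nrem = 3 adds a 16x3 tile to each of the two full row blocks, and the row remainder mrem = 8 adds two 8x16 tiles plus a final 8x3 tile, so every element is transposed exactly once.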
@@ -1408,7 +1408,6 @@ void scaled_gemm(
const void *result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
void* amax_ptr,
bool use_fast_accum) {
#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
const auto computeType = CUBLAS_COMPUTE_32F;
@@ -1421,13 +1420,9 @@
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60200)
// Amax support in ROCm as of 6.2
if (isFloat8Type(result_dtype)) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, amax_ptr);
if (result_scale_ptr != nullptr) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
}
#endif
#ifndef USE_ROCM
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode);
#endif
@@ -140,7 +140,6 @@ void scaled_gemm(
const void* result_scale_ptr,
int64_t result_ld,
ScalarType result_dtype,
void* amax_ptr,
bool use_fast_accum);

#define CUDABLAS_BGEMM_ARGTYPES(Dtype) \
@@ -188,7 +188,10 @@ TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"ROCM_VERSION",
[rocm_version]() { return rocm_version; },
[rocm_version](auto&& k) { return rocm_version == k ? OK : FAIL; });
[rocm_version](auto&& k) {
TUNABLE_LOG1("ROCM_VERSION validation: expect ", k, " to match ", rocm_version);
return rocm_version == k ? OK : FAIL;
});
}
// gfx arch
{
@@ -196,7 +199,10 @@ TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"GCN_ARCH_NAME",
[gcn_arch_name]() { return gcn_arch_name; },
[gcn_arch_name](auto&& k) { return gcn_arch_name == k ? OK : FAIL; });
[gcn_arch_name](auto&& k) {
TUNABLE_LOG1("GCN_ARCH_NAME validation: expect ", k, " to match ", gcn_arch_name);
return gcn_arch_name == k ? OK : FAIL;
});
}
// rocblas
{
@@ -212,7 +218,10 @@ TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"ROCBLAS_VERSION",
[rocblas_version]() { return rocblas_version; },
[rocblas_version](auto&& k) { return rocblas_version == k ? OK : FAIL; });
[rocblas_version](auto&& k) {
TUNABLE_LOG1("ROCBLAS_VERSION validation: expect ", k, " to match ", rocblas_version);
return rocblas_version == k ? OK : FAIL;
});
}
// hipblaslt
{
@@ -226,7 +235,10 @@ TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"HIPBLASLT_VERSION",
[hipblaslt_version]() { return hipblaslt_version; },
[hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; });
[hipblaslt_version](auto&& k) {
TUNABLE_LOG1("HIPBLASLT_VERSION validation: expect ", k, " to match ", hipblaslt_version);
return hipblaslt_version == k ? OK : FAIL;
});
}
#endif
}
@@ -104,7 +104,6 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
params->c_scale_ptr,
params->ldc,
params->c_dtype,
params->amax_ptr,
params->use_fast_accum);
return OK;
}
@@ -23,6 +23,9 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) {
OP_DECOMPOSE(dropout_);
OP_DECOMPOSE(feature_alpha_dropout_);
OP_DECOMPOSE(feature_dropout_);
OP_DECOMPOSE(dropout);
OP_DECOMPOSE(_scaled_dot_product_attention_math);
OP_DECOMPOSE(scaled_dot_product_attention);
}

static void unsupportedData(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
@@ -235,7 +238,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
OP_DECOMPOSE(relu6_);
OP_DECOMPOSE(prelu);
OP_DECOMPOSE2(softmax, int);
OP_DECOMPOSE(scaled_dot_product_attention);
OP_DECOMPOSE(special_gammainc);
OP_DECOMPOSE(special_gammaincc);
OP_DECOMPOSE(special_logit);
@@ -261,7 +263,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
OP_DECOMPOSE(special_xlogy);
OP_DECOMPOSE2(special_xlogy, other_scalar);
OP_DECOMPOSE2(special_xlogy, self_scalar);
OP_DECOMPOSE(_scaled_dot_product_attention_math);

m.impl("split.sizes", native::split_symint);
@@ -386,6 +387,11 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
OP_DECOMPOSE2(to, dtype);
OP_DECOMPOSE2(to, dtype_layout);
OP_DECOMPOSE2(to, other);

// Random ops that are also registered here
OP_DECOMPOSE(dropout);
OP_DECOMPOSE(_scaled_dot_product_attention_math);
OP_DECOMPOSE(scaled_dot_product_attention);
}

} // namespace at::functorch
@@ -496,6 +496,11 @@ _scaled_dot_product_flash_attention_batch_rule(
bool return_debug_mask,
c10::optional<double> scale
) {
if (dropout_p > 0) {
auto maybe_layer = maybeCurrentDynamicLayer();
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
}
auto batch_size = get_bdim_size3(query, query_bdim, key, key_bdim, value, value_bdim);
auto query_ = moveBatchDimToFront(query, query_bdim);
auto key_ = moveBatchDimToFront(key, key_bdim);
@@ -540,6 +545,11 @@ fourOutputs _scaled_dot_product_efficient_attention_batch_rule(
bool is_causal,
c10::optional<double> scale
) {
if (dropout_p > 0) {
auto maybe_layer = maybeCurrentDynamicLayer();
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
}
auto batch_size = get_bdim_size3(query, query_bdim, key, key_bdim, value, value_bdim);
auto query_ = moveBatchDimToFront(query, query_bdim);
auto key_ = moveBatchDimToFront(key, key_bdim);
@@ -577,6 +587,11 @@ _scaled_dot_product_cudnn_attention_batch_rule(
bool return_debug_mask,
c10::optional<double> scale
) {
if (dropout_p > 0) {
auto maybe_layer = maybeCurrentDynamicLayer();
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
}
auto batch_size = get_bdim_size3(query, query_bdim, key, key_bdim, value, value_bdim);
auto query_ = moveBatchDimToFront(query, query_bdim);
auto key_ = moveBatchDimToFront(key, key_bdim);
@@ -28,6 +28,7 @@ MPSStream::MPSStream(Stream stream) : _stream(stream) {
_executionDescriptor.enableCommitAndContinue = _enableCommitAndContinue;

// Choose level which optimizes for GPU
[_compilationDescriptor disableTypeInference];
_compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0;
_executionDescriptor.compilationDescriptor = _compilationDescriptor;
}
@ -41,6 +41,17 @@ extern "C" void zaxpy_(int *n, void *a, const void *x, int *incx, void *y, int *
|
||||
#include <fbgemm/FbgemmI64.h>
|
||||
#endif // USE_FBGEMM
|
||||
|
||||
#if AT_MKLDNN_ENABLED()
|
||||
#include <oneapi/dnnl/dnnl_version.h>
|
||||
#endif // oneDNN
|
||||
|
||||
#define ONEDNN_UKERNEL_ENABLED (DNNL_VERSION_MAJOR >=3 && DNNL_VERSION_MINOR >=5)
|
||||
|
||||
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
|
||||
#include <oneapi/dnnl/dnnl_ukernel.hpp>
|
||||
#include <oneapi/dnnl/dnnl.hpp>
|
||||
#endif // oneDNN BRGEMM
|
||||
|
||||
namespace at::native::cpublas {
|
||||
namespace internal {
|
||||
|
||||
@ -822,4 +833,350 @@ void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<fl
    n, x, incx, y, incy);
}

} // namespace at::native::cpublas
// oneDNN BRGEMM
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
struct BrgemmKey {
  int64_t M;
  int64_t N;
  int64_t K;
  int64_t batch_size;
  int64_t lda;
  int64_t ldb;
  int64_t ldc;
  ScalarType dt_a;
  ScalarType dt_b;
  ScalarType dt_c;
  float alpha;
  float beta;
  BrgemmKey(
      int64_t M, int64_t N, int64_t K, int64_t batch_size,
      int64_t lda, int64_t ldb, int64_t ldc,
      ScalarType dt_a, ScalarType dt_b, ScalarType dt_c,
      float alpha, float beta)
      : M(M), N(N), K(K), batch_size(batch_size),
        lda(lda), ldb(ldb), ldc(ldc),
        dt_a(dt_a), dt_b(dt_b), dt_c(dt_c),
        alpha(alpha), beta(beta) {}
  bool operator==(const BrgemmKey& other) const {
    return M == other.M && N == other.N && K == other.K &&
        batch_size == other.batch_size && lda == other.lda &&
        ldb == other.ldb && ldc == other.ldc && dt_a == other.dt_a &&
        dt_b == other.dt_b && dt_c == other.dt_c && alpha == other.alpha &&
        beta == other.beta;
  }
};

struct PackKey {
  int64_t K;
  int64_t N;
  int64_t ld_in;
  int64_t ld_out;
  ScalarType dt_in;
  ScalarType dt_out;
  PackKey(
      int64_t K, int64_t N, int64_t ld_in, int64_t ld_out,
      ScalarType dt_in, ScalarType dt_out)
      : K(K), N(N), ld_in(ld_in), ld_out(ld_out),
        dt_in(dt_in), dt_out(dt_out) {}
  bool operator==(const PackKey& other) const {
    return N == other.N && K == other.K && ld_in == other.ld_in &&
        ld_out == other.ld_out && dt_in == other.dt_in &&
        dt_out == other.dt_out;
  }
};

inline dnnl::memory::data_type get_dnnl_dtype(ScalarType dtype) {
  if (dtype == ScalarType::Float) {
    return dnnl::memory::data_type::f32;
  } else if (dtype == ScalarType::BFloat16) {
    return dnnl::memory::data_type::bf16;
  } else if (dtype == ScalarType::Half) {
    return dnnl::memory::data_type::f16;
  } else if (dtype == ScalarType::Byte) {
    return dnnl::memory::data_type::u8;
  } else if (dtype == ScalarType::Char) {
    return dnnl::memory::data_type::s8;
  } else {
    TORCH_CHECK(false, "get_dnnl_dtype expects float/bfloat16/half/int8 tensor input");
  }
}
template<typename key_t>
struct UnsafeUkernelKeyHasher {
  std::size_t operator()(const key_t& key) const;
};

template<>
std::size_t UnsafeUkernelKeyHasher<BrgemmKey>::operator()(const BrgemmKey& key) const {
  // Use beta, M, N, and K to compute hash to reduce the overhead as
  // batch size, alpha, and data types are unlikely to change within the same kernel and
  // leading dimensions are likely to be related to M, K, N or use fixed values.
  std::size_t h = std::hash<float>()(key.beta + 1);
  h = std::hash<int64_t>()(key.M) ^ (h << 1);
  h = std::hash<int64_t>()(key.N) ^ (h << 1);
  h = std::hash<int64_t>()(key.K) ^ (h << 1);
  h = std::hash<int64_t>()(key.ldc) ^ (h << 1);
  return h;
}

template<>
std::size_t UnsafeUkernelKeyHasher<PackKey>::operator()(const PackKey& key) const {
  // Use K and N to compute hash to reduce the overhead as
  // data types are unlikely to change and
  // ld_in/ld_out is likely to be related to K, N or use fixed values
  std::size_t h = std::hash<int64_t>()(key.K);
  h = std::hash<int64_t>()(key.N) ^ (h << 1);
  return h;
}
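The hashers above deliberately fold in only the fields that actually vary between calls, using a shift-xor combine. A minimal standalone sketch of the same combining pattern, on a hypothetical two-field key that is not part of the PyTorch sources:

#include <cstdint>
#include <cstdio>
#include <functional>

// Hypothetical key type, only to illustrate the `hash(field) ^ (h << 1)` combine
// used by UnsafeUkernelKeyHasher; fields and names are invented for the example.
struct ToyKey {
  int64_t m;
  int64_t n;
  bool operator==(const ToyKey& o) const { return m == o.m && n == o.n; }
};

struct ToyKeyHasher {
  std::size_t operator()(const ToyKey& k) const {
    // Start from one field, then fold each further field in with shift-xor.
    std::size_t h = std::hash<int64_t>()(k.m);
    h = std::hash<int64_t>()(k.n) ^ (h << 1);
    return h;
  }
};

int main() {
  ToyKeyHasher hasher;
  std::printf("%zu\n", hasher(ToyKey{64, 128}));
  return 0;
}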
template <typename key_t, typename value_t>
struct KernelCache {
  using kstore_t = std::unordered_map<key_t, std::shared_ptr<value_t>, UnsafeUkernelKeyHasher<key_t>>;
  static inline std::shared_ptr<value_t>&& fetch_or_create(
      const key_t& key,
      const std::function<std::shared_ptr<value_t>()>& callback) {
    auto&& search = get_store().find(key);
    if (search != get_store().end()) {
      return std::move(search->second);
    } else {
      get_store().insert({key, callback()});
      return std::move(get_store()[key]);
    }
  }

  static inline kstore_t& get_store() {
    static thread_local kstore_t cache_kernels;
    return cache_kernels;
  }
};
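The cache is a per-thread memo table: the first call with a given key pays for kernel generation, later calls just look it up. A minimal sketch of the same fetch-or-create pattern with an invented value type (names hypothetical, not from the sources):

#include <cstdint>
#include <iostream>
#include <memory>
#include <unordered_map>

// Stand-in for a generated ukernel object that is expensive to build.
struct ExpensiveKernel {
  explicit ExpensiveKernel(int64_t m) { std::cout << "generate for M=" << m << "\n"; }
};

std::shared_ptr<ExpensiveKernel>& fetch_or_create(int64_t key) {
  // One store per thread, mirroring KernelCache::get_store().
  static thread_local std::unordered_map<int64_t, std::shared_ptr<ExpensiveKernel>> store;
  auto it = store.find(key);
  if (it == store.end()) {
    it = store.emplace(key, std::make_shared<ExpensiveKernel>(key)).first;
  }
  return it->second;
}

int main() {
  fetch_or_create(64);   // prints "generate for M=64"
  fetch_or_create(64);   // cache hit, nothing printed
  fetch_or_create(128);  // prints "generate for M=128"
}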
// Helper struct for convenient brgemm configuration
struct GemmHelper {
  GemmHelper(
      int64_t M, int64_t N, int64_t K, int64_t bs,
      int64_t ld_a, int64_t ld_b, int64_t ld_c,
      ScalarType dt_a, ScalarType dt_b, ScalarType dt_c,
      const float alpha, const float beta) {
    // Create brgemm
    brg = dnnl::ukernel::brgemm(
        M, N, K, bs, ld_a, ld_b, ld_c,
        get_dnnl_dtype(dt_a), get_dnnl_dtype(dt_b), get_dnnl_dtype(dt_c),
        alpha, beta);
    // Create a scratchpad buffer for the brgemm execution
    scratchpad = std::vector<uint8_t>(brg.get_scratchpad_size());
    // Prepare default vector of pairs of tensors A and B offsets for each batch.
    A_B_offsets.reserve(1);
    A_B_offsets[0] = std::make_pair(0, 0);
  }
  dnnl::ukernel::brgemm brg;
  std::vector<uint8_t> scratchpad;
  std::vector<std::pair<int64_t, int64_t>> A_B_offsets;
};
struct Brgemm : public KernelCache<BrgemmKey, GemmHelper> {
  // Fetch/create GemmHelper object and execute brgemm with batch size = 1
  template <typename scalar_t_a, typename scalar_t_b, typename scalar_t_c>
  static inline void call(
      int64_t M, int64_t N, int64_t K,
      int64_t ld_a, int64_t ld_b, int64_t ld_c,
      const float alpha, const float beta,
      const scalar_t_a* A, const scalar_t_b* B, scalar_t_c* C) {
    auto&& key = BrgemmKey(
        M, N, K, int64_t(1), ld_a, ld_b, ld_c,
        c10::CppTypeToScalarType<scalar_t_a>::value,
        c10::CppTypeToScalarType<scalar_t_b>::value,
        c10::CppTypeToScalarType<scalar_t_c>::value,
        alpha, beta);
    // Fetch/create GemmHelper object
    auto&& value = fetch_or_create(key, [&]() {
      auto&& v = std::make_shared<GemmHelper>(
          M, N, K, 1, ld_a, ld_b, ld_c,
          c10::CppTypeToScalarType<scalar_t_a>::value,
          c10::CppTypeToScalarType<scalar_t_b>::value,
          c10::CppTypeToScalarType<scalar_t_c>::value,
          alpha, beta);
      (*v).brg.generate();
      return std::move(v);
    });
    if (get_current() != value) {
      dnnl::ukernel::brgemm::release_hw_context();
      ((*value).brg).set_hw_context();
      get_current() = value;
    }
    ((*value).brg)
        .execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data());
  }

  static inline std::shared_ptr<GemmHelper>& get_current() {
    static thread_local std::shared_ptr<GemmHelper> current;
    return current;
  }

  static inline bool device_check(ScalarType dtype) {
    if (!at::globalContext().userEnabledMkldnn()) {
      return false;
    }
    if (dtype == ScalarType::Half) {
      static bool fp16_support = dnnl::get_effective_cpu_isa() >= dnnl::cpu_isa::avx512_core_fp16;
      return fp16_support;
    }
    return false;
  }
};

using pack_t = dnnl::ukernel::brgemm_pack_B;
struct Pack : public KernelCache<PackKey, pack_t> {
  static inline void call(
      int64_t K, int64_t N, int64_t ld_in, int64_t ld_out,
      ScalarType dt_in, ScalarType dt_out,
      const void* in, void* out) {
    auto&& key = PackKey(K, N, ld_in, ld_out, dt_in, dt_out);
    auto&& pack = fetch_or_create(key, [&]() {
      auto&& p = std::make_shared<pack_t>(
          K, N, ld_in, ld_out, get_dnnl_dtype(dt_in), get_dnnl_dtype(dt_out));
      if (need_pack(dt_in)) {
        (*p).generate();
      }
      return std::move(p);
    });
    if (need_pack(dt_in)) {
      (*pack).execute(in, out);
    } else {
      TORCH_CHECK(false, "No need to pack");
    }
  }

  static inline bool need_pack(ScalarType dtype) {
    if (!at::globalContext().userEnabledMkldnn()) {
      return false;
    }
    if (dtype == ScalarType::Half) {
      static bool fp16_pack = dnnl::get_effective_cpu_isa() >= dnnl::cpu_isa::avx512_core_amx_fp16;
      return fp16_pack;
    }
    return false;
  }
};
#endif
void brgemm(
    int64_t M, int64_t N, int64_t K,
    int64_t ld_a, int64_t ld_b, int64_t ld_c,
    const float alpha, const float beta,
    const at::Half* A, const at::Half* B, float* C) {
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
  if (Brgemm::device_check(ScalarType::Half)) {
    Brgemm::call<at::Half, at::Half, float>(
        M, N, K, ld_a, ld_b, ld_c, alpha, beta, A, B, C);
    return;
  }
#endif
  TORCH_CHECK(false,
      "Half Brgemm is only supported on X64 when oneDNN ukernel is enabled and avx512_fp16 is supported");
}

void brgemm_release() {
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
  dnnl::ukernel::brgemm::release_hw_context();
#endif
}

void pack(
    int64_t K, int64_t N, int64_t ld_in, int64_t ld_out,
    ScalarType dt_in, ScalarType dt_out,
    const void* in, void* out) {
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
  Pack::call(K, N, ld_in, ld_out, dt_in, dt_out, in, out);
#else
  TORCH_CHECK(false, "pack is only supported on X64 with oneDNN ukernel enabled");
#endif
}

bool need_pack(ScalarType dt_in) {
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
  return Pack::need_pack(dt_in);
#else
  return false;
#endif
}

} // namespace at::native::cpublas
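A minimal usage sketch of these new entry points, assuming it is compiled inside ATen where this internal header is visible (the include path and the fixed tile sizes are illustrative, not part of the public API):

#include <ATen/native/CPUBlas.h>
#include <c10/core/ScalarType.h>
#include <vector>

void half_panel_gemm(const at::Half* A, const at::Half* B, float* C) {
  using namespace at::native;
  // N is kept at 64 because, as noted in the flash-attention kernel below,
  // the oneDNN pack path only accepts output leading dimensions of 16/32/48/64.
  constexpr int64_t M = 32, N = 64, K = 128;
  std::vector<at::Half> packed_b(K * N);
  const at::Half* b_ptr = B;
  if (cpublas::need_pack(at::kHalf)) {
    // Reorder B once per weight panel; ld_in == ld_out == N here.
    cpublas::pack(K, N, N, N, at::kHalf, at::kHalf, B, packed_b.data());
    b_ptr = packed_b.data();
  }
  // C = 1.0 * A(MxK) @ B(KxN) + 0.0 * C, accumulated in fp32.
  // brgemm throws unless MKLDNN is enabled and the CPU has avx512_core_fp16,
  // so real callers guard this path (see could_pack further down).
  cpublas::brgemm(M, N, K, /*ld_a=*/K, /*ld_b=*/N, /*ld_c=*/N, 1.f, 0.f, A, b_ptr, C);
  // Drop the AMX tile context once the caller is done issuing brgemm calls.
  cpublas::brgemm_release();
}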
@ -7,6 +7,7 @@
#include <c10/core/ScalarType.h>
#include <c10/core/Scalar.h>

namespace at::native::cpublas {

namespace internal {
@ -186,4 +187,40 @@ void copy(int64_t n, const float *x, int64_t incx, float *y, int64_t incy);
void copy(int64_t n, const c10::complex<double> *x, int64_t incx, c10::complex<double> *y, int64_t incy);
void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<float> *y, int64_t incy);

} // namespace at::native::cpublas
// Batch-reduce GEMM
// Operates by the following formula:
// C = alpha * SUM(A[i] x B[i]) + beta * C, i = 0 to batch size
// A Base pointer to a tensor A.
// B Base pointer to a tensor B.
// C Pointer to a tensor C (accumulation buffer).
TORCH_API void brgemm(
    int64_t M, int64_t N, int64_t K,
    int64_t ld_a, int64_t ld_b, int64_t ld_c,
    const float alpha, const float beta,
    const at::Half* A, const at::Half* B, float* C);

// Release brgemm hardware context
void brgemm_release();

// Pack B matrix to get better performance if needed
void pack(
    int64_t K, int64_t N, int64_t ld_in, int64_t ld_out,
    ScalarType dt_in, ScalarType dt_out,
    const void* in, void* out);

// Whether pack is needed in the platform.
bool need_pack(ScalarType dt_in);

} // namespace at::native::cpublas
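For the single-batch case exposed by the Half overload above, the batch-reduce formula collapses to an ordinary GEMM update, C = alpha * (A x B) + beta * C; the batch dimension only matters when several (A[i], B[i]) pairs are accumulated into one C.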
@ -144,7 +144,7 @@ static void col2im_out_cpu_template(

output.resize_({batch_size, n_output_plane, output_height, output_width});

AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kBFloat16, kHalf, kBool,
    input.scalar_type(), "col2im_out_cpu", [&] {
  Tensor input_n = Tensor();
  Tensor output_n = Tensor();
@ -421,12 +421,18 @@ struct ConvParams {
// cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
// that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
#if !defined(C10_MOBILE)
if (needs_64bit_indexing_no_split(input, weight)) {
  return false;
}
if (!detail::getCUDAHooks().compiledWithCuDNN()) {
  return false;
}
if (needs_64bit_indexing_no_split(input, weight)) {
  static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
  if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
    TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
                    " if the V8 API is not enabled or before cuDNN version 9.3+."
                    " Consider upgrading cuDNN and/or enabling the V8 API for better efficiency.");
    return false;
  }
}
if (!input.is_cuda() || !cudnn_enabled) {
  return false;
}
@ -94,7 +94,7 @@ static void im2col_out_cpu_template(

output.resize_({batch_size, n_output_plane, output_length});

AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kBFloat16, kHalf, kBool,
    input.scalar_type(), "im2col_out_cpu", [&] {
  Tensor input_n;
  Tensor output_n;
@ -19,6 +19,7 @@
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/mkldnn/Matmul.h>
#include <ATen/native/mkldnn/Utils.h>
#include <c10/core/GradMode.h>
#include <c10/util/accumulate.h>
#include <c10/util/irange.h>
@ -1358,13 +1359,8 @@ static inline int64_t get_mkldnn_matmul_min_dim() {
static auto value = [&] {
  const int64_t default_min_dim = [&] {
    // Minimum dimension requirement for MKLDNN; derived based on experiments.
    // By default, it's only enabled on Neoverse V1.
#if !defined(__s390x__) && !defined(__powerpc__)
    if (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 && cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1) {
      return 8;
    }
#endif
    return 0;
    // it's enabled on all Neoverse cpus.
    return is_arm_neoverse() ? 8 : 0;
  }();
  const char* ptr = std::getenv("TORCH_MKLDNN_MATMUL_MIN_DIM");
  return ptr != nullptr ? std::atoi(ptr) : default_min_dim;
@ -1377,13 +1373,8 @@ static inline int64_t get_mkldnn_matmul_min_size() {
static auto value = [&] {
  const int64_t default_min_size = [&] {
    // Minimum size requirement for MKLDNN; derived based on experiments.
    // By default, it's only enabled on Neoverse V1.
#if !defined(__s390x__) && !defined(__powerpc__)
    if (cpuinfo_initialize() && cpuinfo_get_uarchs_count() == 1 && cpuinfo_get_uarch(0)->uarch == cpuinfo_uarch_neoverse_v1) {
      return 8 * 1024;
    }
#endif
    return 0;
    // it's enabled on all Neoverse cpus.
    return is_arm_neoverse() ? 8 * 1024 : 0;
  }();
  const char* ptr = std::getenv("TORCH_MKLDNN_MATMUL_MIN_SIZE");
  return ptr != nullptr ? std::atoi(ptr) : default_min_size;
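Both thresholds can still be overridden at runtime: launching a process with, say, TORCH_MKLDNN_MATMUL_MIN_DIM=16 and TORCH_MKLDNN_MATMUL_MIN_SIZE=16384 in the environment raises the minimum problem size the MKLDNN matmul path requires, regardless of the Neoverse defaults above.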
@ -209,7 +209,13 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(

bool all_contiguous = is_contiguous(input);
constexpr bool mixed_type = !std::is_same_v<scalar_t, param_t>;
const auto dtype = mixed_type ? kFloat : input.scalar_type();
// Using float data type for Half _var_sum in batchnorm stats updating on CPU
// to avoid _var_sum overflow since the representation range of Half is small.
using opmath_t = std::conditional_t<std::is_same_v<param_t, at::Half>, at::opmath_type<param_t>, param_t>;
auto dtype = mixed_type ? kFloat : input.scalar_type();
if (dtype == kHalf) {
  dtype = kFloat;
}

auto save_mean_a = save_mean.accessor<param_t, 1>();
auto save_var_transform_a = save_var_transform.accessor<param_t, 1>();
@ -220,9 +226,9 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
if (all_contiguous) {
  auto _mean = at::empty({n_input}, input.options().dtype(dtype));
  auto _var_sum = at::empty({n_input}, input.options().dtype(dtype));
  auto _mean_a = _mean.accessor<param_t, 1>();
  auto _var_sum_a = _var_sum.accessor<param_t, 1>();
  auto momentum_ = static_cast<param_t>(momentum);
  auto _mean_a = _mean.accessor<opmath_t, 1>();
  auto _var_sum_a = _var_sum.accessor<opmath_t, 1>();
  auto momentum_ = static_cast<opmath_t>(momentum);

  batch_norm_cpu_collect_stats_stub(kCPU, _mean, _var_sum, input);
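The overflow concern is easy to quantify: float16 tops out at 65504, so a running sum of squared deviations overflows quickly once per-element magnitudes reach the tens; for instance, 100 values around 30 already give a sum of squares near 90000. That is why the variance accumulator is promoted to float here.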
@ -284,7 +284,7 @@ void resize_bytes_nocuda(const Storage& storage, const c10::SymInt& newsize) {
} else if (device_type == at::kPrivateUse1) {
  at::detail::getPrivateUse1Hooks().resizePrivateUse1Bytes(
      storage, newsize.expect_int());
} else if (device_type == at::kXPU || device_type == at::kHPU) {
} else if (device_type == at::kXPU || device_type == at::kHPU || device_type == at::kMTIA) {
  ptrdiff_t size_bytes_i = newsize.expect_int();
  TORCH_CHECK(
      !c10::overflows<int64_t>(size_bytes_i),
@ -11,18 +11,18 @@ namespace ao {
namespace sparse {

namespace {
constexpr int64_t serialization_version_index = 0;
constexpr int64_t bias_index = 1;
constexpr int64_t out_features_block_size_index = 2;
constexpr int64_t in_features_block_size_index = 3;
constexpr int64_t weight_scales_index = 4;
constexpr int64_t weight_zero_point_index = 5;
constexpr int64_t quantization_scheme_index = 6;
constexpr int64_t row_block_indices_index = 7;
constexpr int64_t col_block_indices_index = 8;
constexpr int64_t weight_values_index = 9;
constexpr int64_t num_output_channels_index = 10;
constexpr int64_t num_input_channels_index = 11;
constexpr int64_t serialization_version_index [[maybe_unused]] = 0;
constexpr int64_t bias_index [[maybe_unused]] = 1;
constexpr int64_t out_features_block_size_index [[maybe_unused]] = 2;
constexpr int64_t in_features_block_size_index [[maybe_unused]] = 3;
constexpr int64_t weight_scales_index [[maybe_unused]] = 4;
constexpr int64_t weight_zero_point_index [[maybe_unused]] = 5;
constexpr int64_t quantization_scheme_index [[maybe_unused]] = 6;
constexpr int64_t row_block_indices_index [[maybe_unused]] = 7;
constexpr int64_t col_block_indices_index [[maybe_unused]] = 8;
constexpr int64_t weight_values_index [[maybe_unused]] = 9;
constexpr int64_t num_output_channels_index [[maybe_unused]] = 10;
constexpr int64_t num_input_channels_index [[maybe_unused]] = 11;

template <typename TENSOR_DTYPE, typename VEC_DTYPE>
std::vector<VEC_DTYPE> unwrap_vector(at::Tensor tensor) {
@ -81,6 +81,12 @@ void atan2_kernel(TensorIteratorBase& iter) {
}

#if !defined(C10_MOBILE)
#define _AT_DISPATCH_INTEGRAL_TYPES_V2(TYPE, NAME, ...) \
  AT_DISPATCH_V2( \
      TYPE, \
      NAME, \
      AT_WRAP(__VA_ARGS__), \
      AT_EXPAND(AT_INTEGRAL_TYPES_V2))
#define _AT_DISPATCH_ALL_TYPES_AND_BOOL(TYPE, NAME, ...) \
  AT_DISPATCH_V2( \
      TYPE, \
@ -104,6 +110,8 @@ void atan2_kernel(TensorIteratorBase& iter) {
  AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \
      kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
#else
#define _AT_DISPATCH_INTEGRAL_TYPES_V2(TYPE, NAME, ...) \
  AT_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, __VA_ARGS__)
#define _AT_DISPATCH_ALL_TYPES_AND_BOOL(TYPE, NAME, ...) \
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \
      kComplexHalf, kHalf, kBool, kBFloat16, TYPE, NAME, __VA_ARGS__)
@ -382,7 +390,7 @@ void bitwise_and_kernel(TensorIteratorBase& iter) {
if (iter.dtype() == ScalarType::Bool) {
  cpu_kernel(iter, [](bool a, bool b) { return a && b; });
} else {
  AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_and_cpu", [&]() {
  _AT_DISPATCH_INTEGRAL_TYPES_V2(iter.dtype(), "bitwise_and_cpu", [&]() {
    cpu_kernel_vec(
        iter,
        [](scalar_t a, scalar_t b) -> scalar_t { return a & b; },
@ -395,7 +403,7 @@ void bitwise_or_kernel(TensorIteratorBase& iter) {
if (iter.dtype() == ScalarType::Bool) {
  cpu_kernel(iter, [](bool a, bool b) { return a || b; });
} else {
  AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_or_cpu", [&]() {
  _AT_DISPATCH_INTEGRAL_TYPES_V2(iter.dtype(), "bitwise_or_cpu", [&]() {
    cpu_kernel_vec(
        iter,
        [](scalar_t a, scalar_t b) -> scalar_t { return a | b; },
@ -410,7 +418,7 @@ void bitwise_xor_kernel(TensorIteratorBase& iter) {
  // this operation for both Boolean and integral types.
  cpu_kernel(iter, [](bool a, bool b) { return a != b; });
} else {
  AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_xor_cpu", [&]() {
  _AT_DISPATCH_INTEGRAL_TYPES_V2(iter.dtype(), "bitwise_xor_cpu", [&]() {
    cpu_kernel_vec(
        iter,
        [](scalar_t a, scalar_t b) -> scalar_t { return a ^ b; },
@ -16,7 +16,6 @@
#else
#include <ATen/ops/empty.h>
#endif

namespace at::native {

namespace {
@ -202,7 +201,97 @@ void reshape_attn_mask_to_4d(
    .expand({attn_mask_size_0, attn_mask_size_1, qSize, kvSize});
}

template <typename scalar_t, typename mask_t, int64_t q_split_size, int64_t kv_split_size>
template <typename scalar_t>
inline void copy_value_with_pad(
    const scalar_t* value_ptr,
    scalar_t* dst_ptr,
    int64_t rows,
    int64_t cols,
    int64_t prows,
    int64_t pcols,
    int64_t ldi) {
  auto vec_size = at::vec::Vectorized<scalar_t>::size();
  int64_t i = 0;
  for (; i < rows; i++) {
    int64_t j = 0;
    for (; j < cols - (cols % vec_size); j += vec_size) {
      auto vec_v =
          at::vec::Vectorized<scalar_t>::loadu(value_ptr + i * ldi + j);
      vec_v.store(dst_ptr + i * pcols + j);
    }

    if (j < cols) {
      auto vec_v = at::vec::Vectorized<scalar_t>::loadu(
          value_ptr + i * ldi + j, cols - j);
      vec_v.store(dst_ptr + i * pcols + j, cols - j);
    }

    // col padding
    auto psize = pcols - cols;
    if (psize > 0) {
      auto zero_vec = at::vec::Vectorized<scalar_t>(0);
      int64_t pj = 0;
      for (; pj < psize - (psize % vec_size); pj += vec_size) {
        zero_vec.store(dst_ptr + i * pcols + cols + pj);
      }
      if (pj < psize) {
        zero_vec.store(dst_ptr + i * pcols + cols + pj, psize - pj);
      }
    }
  }
  // row padding
  for (; i < prows; i++) {
    auto zero_vec = at::vec::Vectorized<scalar_t>(0);
    int64_t j = 0;
    for (; j < pcols - (pcols % vec_size); j += vec_size) {
      zero_vec.store(dst_ptr + i * pcols + j);
    }
    if (j < pcols) {
      zero_vec.store(dst_ptr + i * pcols + j, pcols - j);
    }
  }
}
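A scalar restatement of copy_value_with_pad's contract, only to make the shapes concrete; the real kernel above does the same thing with vectorized loads and stores, and the helper name below is invented for the example:

#include <cstdint>

template <typename T>
void copy_with_pad_reference(const T* src, T* dst,
                             int64_t rows, int64_t cols,
                             int64_t prows, int64_t pcols, int64_t ldi) {
  // Copy a rows x cols block (row stride ldi) into a prows x pcols buffer,
  // zero-filling the padded tail columns and rows.
  for (int64_t i = 0; i < prows; ++i) {
    for (int64_t j = 0; j < pcols; ++j) {
      dst[i * pcols + j] = (i < rows && j < cols) ? src[i * ldi + j] : T(0);
    }
  }
}

// e.g. a 3x5 tile copied into a 4x8 buffer leaves columns 5..7 and row 3 zeroed.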
template <typename scalar_t>
inline void pad_remain_row_col_zero(
    scalar_t* value_ptr,
    int rows,
    int cols,
    int prows,
    int pcols,
    int ldi) {
  auto psize = pcols - cols;
  if (psize == 0 && prows == rows) {
    return;
  }
  auto vec_size = at::vec::Vectorized<scalar_t>::size();
  auto zero = at::vec::Vectorized<scalar_t>(0);
  if (psize > 0) {
    for (int i = 0; i < rows; i++) {
      int j = 0;
      for (; j < psize - (psize % vec_size); j += vec_size) {
        zero.store(value_ptr + i * ldi + cols + j);
      }
      if (j < psize) {
        zero.store(value_ptr + i * ldi + cols + j, psize - j);
      }
    }
  }

  for (int i = rows; i < prows; i++) {
    int j = 0;
    for (; j < pcols - (pcols % vec_size); j += vec_size) {
      zero.store(value_ptr + i * ldi + j);
    }
    if (j < pcols) {
      zero.store(value_ptr + i * ldi + j, pcols - j);
    }
  }
}
template <typename scalar_t, typename mask_t, int64_t q_split_size, int64_t kv_split_size, bool with_pack=false>
void cpu_flash_attention(
    const Tensor& output,
    const Tensor& logsumexp,
@ -278,21 +367,70 @@ void cpu_flash_attention(

int64_t qSplitSize = q_split_size > qSize ? qSize : q_split_size;
int64_t kvSplitSize = kv_split_size > kvSize ? kvSize : kv_split_size;
int64_t qSlice = (qSize - 1) / qSplitSize + 1;
int64_t qSlice = (qSize + qSplitSize - 1) / qSplitSize;
int64_t kvSlice = (kvSize + kvSplitSize - 1) / kvSplitSize;
int64_t kvTail = (kvSize - 1) % kvSplitSize + 1;
int64_t num_thread = at::get_num_threads();

const auto dtype = query.scalar_type();
const auto accumulate_dtype = toOpMathType(dtype);

// Whether pack is needed
bool need_pack = false;
// Block size of packing B matrix
int64_t packb_size = 64;
// Use packb_size due to the limitation:
// oneDNN pack only supports an output leading dimension of 16, 32, 48 or 64.
// For instance,
// for q @ k.T [qSplitSize, headSize] * [headSize, kvSplitSize] = [qSplitSize, kvSplitSize],
// we need to split kvSplitSize with packb_size for packing k.T;
// for (q @ k.T) @ v [qSplitSize, kvSplitSize] x [kvSplitSize, headSize] -> [qSplitSize, headSize],
// we need to split headSize with packb_size for packing v.
// TODO Simplify the check when oneDNN supports fused pack with transpose and has better performance
if (with_pack) {
  need_pack = num_head >= 4 && headSize % packb_size == 0 && kvSize >= packb_size;
  if (need_pack) {
    float pack_size = batchSize * num_head * kvSize * headSize / 1024;
    float gemm_size_per_thread =
        (batchSize * num_head * qSlice + num_thread - 1) / num_thread *
        qSplitSize * (is_causal ? qSize : kvSize) * headSize / 1024;
    float gsize = gemm_size_per_thread / pack_size;
    // When the number of gemms is much greater than the number of packs,
    // the pack and padding overhead can be overlapped.
    if (pack_size < 2688) {
      need_pack = gsize >= 36 || (gsize >= 24 && headSize > packb_size);
    } else if (pack_size < 16384) {
      need_pack = gsize >= (is_causal ? 54 : 52);
    } else {
      need_pack = gsize >= (is_causal ? 54 : 40);
    }
  }
}
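To see how the heuristic behaves, assume for illustration batchSize = 1, num_head = 16, qSize = kvSize = 1024, headSize = 64, qSplitSize = 256, 16 threads and no causal mask: pack_size = 1*16*1024*64/1024 = 1024, qSlice = 4, gemm_size_per_thread = ((64 + 15)/16)*256*1024*64/1024 = 65536, so gsize = 64; since pack_size < 2688 the threshold is 36 and packing is enabled.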
int64_t rHeadSize = need_pack ? (headSize + packb_size - 1) / packb_size * packb_size : headSize;
int64_t rkvSplitSize = need_pack ? (kvSplitSize + packb_size - 1) / packb_size * packb_size : kvSplitSize;
int64_t rkvTail = need_pack ? (kvTail + packb_size - 1) / packb_size * packb_size : kvTail;
int64_t rkvSize = kv_split_size > kvSize ? rkvTail : rkvSplitSize * kvSlice + rkvTail;

// oneDNN pack does not support an odd K yet, so odd K also needs to be padded.
bool headSize_even = headSize % 2 == 0;
int64_t eheadSize = need_pack && !headSize_even ? headSize + 1 : headSize;
int64_t ekvSplitSize = need_pack && (kvSplitSize % 2 != 0) ? kvSplitSize + 1 : kvSplitSize;
int64_t ekvTail = need_pack && (kvTail % 2 != 0) ? kvTail + 1 : kvTail;
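For illustration of the rounding: with kvSize = 1000 and kvSplitSize = 512, kvTail = 488, which is rounded up to rkvTail = 512 for the packed layout; an odd tail such as 487 (kvSize = 999) would additionally be bumped to ekvTail = 488 to satisfy the even-K requirement.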
// allocate per thread temp buf (accumulate type)
int64_t size_per_thread =
    /* qk     */ qSplitSize * kvSplitSize +
    /* qk     */ qSplitSize * rkvSplitSize +
    /* qk_max */ qSplitSize +
    /* qk_sum */ qSplitSize +
    /* dst    */ qSplitSize * headSize;
    /* dst    */ qSplitSize * rHeadSize;

at::Tensor buf = at::empty({num_thread, size_per_thread}, query.options().dtype(accumulate_dtype));
at::Tensor buf_reduced = at::empty({num_thread, qSplitSize, is_reduced_type ? kvSplitSize : 0}, query.options());
at::Tensor buf_reduced = at::empty(
    {num_thread, qSplitSize, is_reduced_type ? ekvSplitSize : 0},
    query.options());

// Data ptrs
const scalar_t* q_data = query.const_data_ptr<scalar_t>();
@ -306,16 +444,128 @@ void cpu_flash_attention(
accum_t* buf_data = buf.data_ptr<accum_t>();
scalar_t* buf_reduced_data = is_reduced_type ? buf_reduced.data_ptr<scalar_t>() : nullptr;

// Buffer to store padding query
scalar_t* query_padding_ptr = nullptr;
std::unique_ptr<scalar_t[]> query_padding_data;
if (!headSize_even && need_pack) {
  query_padding_data = std::make_unique<scalar_t[]>(num_thread * qSplitSize * eheadSize);
  query_padding_ptr = query_padding_data.get();
}
// Buffer to store Key and Value after transforms
scalar_t* key_reorder_ptr = nullptr;
std::unique_ptr<scalar_t[]> key_reorder_data;
scalar_t* value_reorder_ptr = nullptr;
std::unique_ptr<scalar_t[]> value_reorder_data;
int kv_padding_size = (kvSize - 1) / kvSplitSize * ekvSplitSize + ekvTail;
if (need_pack) {
  key_reorder_data = std::make_unique<scalar_t[]>(batchSize * num_head * eheadSize * rkvSize);
  key_reorder_ptr = key_reorder_data.get();
  value_reorder_data = std::make_unique<scalar_t[]>(batchSize * num_head * kv_padding_size * rHeadSize);
  value_reorder_ptr = value_reorder_data.get();
}

// Reorder K, V
if (need_pack) {
  at::parallel_for(0, batchSize * num_head * kvSlice, 1, [&](int64_t begin, int64_t end) {
    int64_t i = 0, j = 0, l = 0, n = 0;
    at::native::data_index_init(begin, i, batchSize, j, num_head, l, kvSlice);
    std::unique_ptr<scalar_t[]> transpose_buffer = std::make_unique<scalar_t[]>(eheadSize * packb_size);
    scalar_t* transpose_buffer_ptr = transpose_buffer.get();
    std::unique_ptr<scalar_t[]> v_copy_buffer = std::make_unique<scalar_t[]>(ekvSplitSize * packb_size);
    scalar_t* v_copy_buffer_ptr = v_copy_buffer.get();
    for (const auto z : c10::irange(begin, end)) {
      (void)z; // Suppress unused variable
      n = l * kvSplitSize;
      int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n);
      int64_t ekvBlockSize = kvBlockSize % 2 == 0 ? kvBlockSize : kvBlockSize + 1;

      // Split kvSplitSize with packb_size
      // [kvSplitSize, headSize] -> [div_up(kvSplitSize, packb_size), packb_size, headSize]
      // Transpose [packb_size, headSize] -> [headSize, packb_size]
      // Pack transposed buffer
      for (int64_t b = 0; b < kvBlockSize; b += packb_size) {
        bool tail = kvBlockSize - b < packb_size;
        // TODO Use fused pack with transpose support when oneDNN supports such usage
        utils::transpose<uint16_t>(
            tail ? kvBlockSize - b : packb_size,
            headSize,
            /* src_ptr */
            reinterpret_cast<const uint16_t*>(
                k_data + i * kStrideB + j * kStrideH + n * kStrideN + b * kStrideN),
            /* ld_src */ kStrideN,
            /* dst */ reinterpret_cast<uint16_t*>(transpose_buffer_ptr),
            /* ld_dst */ packb_size);
        // Pad [headSize, x] -> [eheadSize, x]
        if (!headSize_even) {
          pad_remain_row_col_zero<scalar_t>(
              transpose_buffer_ptr, headSize, packb_size, eheadSize, packb_size, packb_size);
        }
        // Pack
        cpublas::pack(
            /* K */ eheadSize,
            /* N */ packb_size,
            /* ld_in */ packb_size,
            /* ld_out */ packb_size,
            /* dt_in */ dtype,
            /* dt_out */ dtype,
            transpose_buffer_ptr,
            key_reorder_ptr + i * num_head * eheadSize * rkvSize +
                j * eheadSize * rkvSize + n * eheadSize + b * eheadSize);
      }

      // Split headSize with packb_size
      // [kvSplitSize, headSize] -> [kvSplitSize, div_up(headSize, packb_size), packb_size]
      for (int64_t b = 0; b < headSize; b += packb_size) {
        // Do copy due to the limitation of input_ld of oneDNN pack:
        // Regarding packing [K, N], only input_ld == N is supported
        // TODO: remove the copy when pack supports input_ld >= N
        copy_value_with_pad<scalar_t>(
            v_data + i * vStrideB + j * vStrideH + n * vStrideN + b,
            v_copy_buffer_ptr,
            kvBlockSize,
            (headSize - b < packb_size) ? headSize - b : packb_size,
            ekvBlockSize,
            packb_size,
            vStrideN);
        cpublas::pack(
            ekvBlockSize, packb_size, packb_size, packb_size, dtype, dtype,
            v_copy_buffer_ptr,
            value_reorder_ptr +
                i * num_head * kv_padding_size * rHeadSize +
                j * kv_padding_size * rHeadSize + n * rHeadSize +
                ekvBlockSize * b);
      }
      // Move to the next query
      at::native::data_index_step(i, batchSize, j, num_head, l, kvSlice);
    }
  });
}
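After this reordering step each (batch, head) pair owns a packed copy of K and V in which every packb_size-wide panel is contiguous, which is what the brgemm calls in the main loop below consume directly.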
at::parallel_for(0, batchSize * num_head * qSlice, 1, [&](int64_t begin, int64_t end) {
  int64_t i = 0, j = 0, k = 0;
  data_index_init(begin, i, batchSize, j, num_head, k, qSlice);
  int ompIdx = at::get_thread_num();
  accum_t* buf_ptr = buf_data + ompIdx * size_per_thread;
  accum_t* qk_data = buf_ptr;
  accum_t* qk_max_data = qk_data + qSplitSize * kvSplitSize;
  accum_t* qk_max_data = qk_data + qSplitSize * rkvSplitSize;
  accum_t* qk_sum_data = qk_max_data + qSplitSize;
  accum_t* dst_data = qk_sum_data + qSplitSize;
  scalar_t* qk_reduced_data = is_reduced_type ? buf_reduced_data + ompIdx * qSplitSize * kvSplitSize : nullptr;
  scalar_t* qk_reduced_data = is_reduced_type ? buf_reduced_data + ompIdx * qSplitSize * ekvSplitSize : nullptr;
  scalar_t* query_t_padding_ptr = (!headSize_even && need_pack)
      ? query_padding_ptr + ompIdx * qSplitSize * eheadSize
      : nullptr;

  for (const auto z : c10::irange(begin, end)) {
    (void)z; // Suppress unused variable
@ -327,10 +577,46 @@ void cpu_flash_attention(
    fill_stub(qk_sum_data,
        static_cast<accum_t>(0), qBlockSize);
    int64_t num_keys = is_causal ? std::min(m + qBlockSize, kvSize) : kvSize;
    if (!headSize_even && need_pack) {
      // Pad query if headSize is not even
      // [qBlockSize, headSize] -> [qBlockSize, eheadSize]
      copy_value_with_pad<scalar_t>(
          q_data + i * qStrideB + j * qStrideH + m * qStrideM,
          query_t_padding_ptr,
          qBlockSize,
          headSize,
          qBlockSize,
          eheadSize,
          qStrideM
      );
    }
    for (int64_t n = 0; n < num_keys; n += kvSplitSize) {
      int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n);
      int64_t ekvBlockSize = (need_pack && kvBlockSize % 2 != 0) ? kvBlockSize + 1 : kvBlockSize;
      int64_t rkvBlockSize = kvBlockSize == kvSplitSize ? rkvSplitSize : rkvTail;
      // Calculate scale * q @ k.T
      cpublas::gemm(
      if (need_pack) {
        if constexpr (std::is_same_v<scalar_t, at::Half>) {
          for (int64_t b = 0; b < kvBlockSize; b += packb_size) {
            cpublas::brgemm(
                qBlockSize,
                packb_size,
                eheadSize,
                headSize_even ? qStrideM : eheadSize,
                packb_size,
                rkvBlockSize,
                1.f,
                0.f,
                !headSize_even
                    ? query_t_padding_ptr
                    : q_data + i * qStrideB + j * qStrideH + m * qStrideM,
                key_reorder_ptr + i * num_head * eheadSize * rkvSize +
                    j * eheadSize * rkvSize + n * eheadSize + b * eheadSize,
                qk_data + b);
          }
        }
      } else {
        cpublas::gemm(
            TransposeType::Transpose,
            TransposeType::NoTranspose,
            kvBlockSize,
@ -346,11 +632,12 @@ void cpu_flash_attention(
            static_cast<accum_t>(0),
            qk_data,
            kvBlockSize);
      }
      // Apply causal mask, fill unused with -inf
      if (is_causal && num_keys - n <= kvSplitSize) {
        for (const auto row : c10::irange(qBlockSize)) {
          int64_t last_col = m + row - n;
          accum_t* row_ptr = qk_data + row * kvBlockSize;
          accum_t* row_ptr = qk_data + row * rkvBlockSize;
          fill_stub(row_ptr + last_col + 1,
              -std::numeric_limits<accum_t>::infinity(),
              kvBlockSize - last_col - 1);
@ -363,29 +650,29 @@ void cpu_flash_attention(
      for (int64_t row = 0; row < qBlockSize; ++row) {
#if __GNUC__ == 11 && __GNUC_MINOR__ >= 4 && defined(__ARM_FEATURE_SVE)
        _scale_attn_mask_fusion_kernel(
            qk_data + row * kvBlockSize,
            qk_data + row * rkvBlockSize,
            mask_data + i * mStrideB + j * mStrideH +
                (m + row) * mStrideM + (mStrideN == 0 ? 0 : n),
            kvBlockSize,
            qk_data + row * kvBlockSize,
            qk_data + row * rkvBlockSize,
            scaling_factor,
            mStrideN == 0);
#else
        if (mStrideN == 0) {
          _scale_attn_mask_fusion_kernel</*is_stride_0*/ true>(
              qk_data + row * kvBlockSize,
              qk_data + row * rkvBlockSize,
              mask_data + i * mStrideB + j * mStrideH +
                  (m + row) * mStrideM,
              kvBlockSize,
              qk_data + row * kvBlockSize,
              qk_data + row * rkvBlockSize,
              scaling_factor);
        } else {
          _scale_attn_mask_fusion_kernel</*is_stride_0*/ false>(
              qk_data + row * kvBlockSize,
              qk_data + row * rkvBlockSize,
              mask_data + i * mStrideB + j * mStrideH +
                  (m + row) * mStrideM + n,
              kvBlockSize,
              qk_data + row * kvBlockSize,
              qk_data + row * rkvBlockSize,
              scaling_factor);
        }
#endif
@ -398,28 +685,28 @@ void cpu_flash_attention(
        // max per row
        tmp_max = at::vec::reduce_all<accum_t>(
            [](Vec& x, Vec& y) { return at::vec::maximum(x, y); },
            qk_data + row * kvBlockSize,
            qk_data + row * rkvBlockSize,
            kvBlockSize);
      } else {
        // apply scaling factor and max per row in fusion
        _mul_reduce_max_fusion_kernel(
            qk_data + row * kvBlockSize,
            qk_data + row * rkvBlockSize,
            scaling_factor,
            kvBlockSize,
            qk_data + row * kvBlockSize,
            qk_data + row * rkvBlockSize,
            tmp_max);
      }
      tmp_max = qk_max_data[row] > tmp_max ? qk_max_data[row] : tmp_max;
      if (tmp_max == -std::numeric_limits<accum_t>::infinity()) {
        // to avoid `nan = exp2f(-inf - (-inf))`
        fill_stub(conditional_data_ptr(qk_data, qk_reduced_data) + row * kvBlockSize,
        fill_stub(conditional_data_ptr(qk_data, qk_reduced_data) + row * ekvBlockSize,
            static_cast<scalar_t>(0), kvBlockSize);
      } else {
        tmp_sum = tmp_max;
        // qk <- exp(qk - max) and sum per row
        _exp_reduce_sum_fusion_kernel(
            qk_data + row * kvBlockSize, kvBlockSize,
            conditional_data_ptr(qk_data, qk_reduced_data) + row * kvBlockSize,
            qk_data + row * rkvBlockSize, kvBlockSize,
            conditional_data_ptr(qk_data, qk_reduced_data) + row * ekvBlockSize,
            tmp_sum);
        // exp_tmp <- exp(max[row] - max)
        exp_tmp = std::exp(qk_max_data[row] - tmp_max);
@ -431,12 +718,40 @@ void cpu_flash_attention(
        if (n > 0) {
          vec::map<accum_t>(
              [exp_tmp](Vec x) { return x * Vec(exp_tmp); },
              dst_data + row * headSize, dst_data + row * headSize, headSize);
              dst_data + row * rHeadSize,
              dst_data + row * rHeadSize,
              headSize);
        }
      }
      if (need_pack && kvBlockSize % 2 != 0) {
        // Pad: [qSplitSize,kvSplitSize] -> [qSplitSize,kvSplitSize + 1]
        *(qk_reduced_data + row * (1 + kvBlockSize) + kvBlockSize) = scalar_t(0);
      }
    }
    // Calculate Softmax(q @ k.T) @ v
    cpublas::gemm(
    if (need_pack) {
      int64_t psize = n / kvSplitSize * ekvSplitSize;
      if constexpr (std::is_same_v<scalar_t, at::Half>) {
        for (int64_t b = 0; b < headSize; b += packb_size) {
          cpublas::brgemm(
              qBlockSize,
              packb_size,
              ekvBlockSize,
              ekvBlockSize,
              packb_size,
              rHeadSize,
              1.0,
              n == 0 ? 0.f : 1.f,
              qk_reduced_data,
              value_reorder_ptr +
                  i * num_head * kv_padding_size * rHeadSize +
                  j * kv_padding_size * rHeadSize + psize * rHeadSize +
                  b * ekvBlockSize,
              dst_data + b);
        }
      }
    } else {
      cpublas::gemm(
          TransposeType::NoTranspose,
          TransposeType::NoTranspose,
          headSize,
@ -451,6 +766,7 @@ void cpu_flash_attention(
          n == 0 ? static_cast<accum_t>(0) : static_cast<accum_t>(1),
          dst_data,
          headSize);
    }
  }

  // dst <- dst / sum[row]
@ -465,7 +781,7 @@ void cpu_flash_attention(
  vec::map<scalar_t>(
      [sum_reciprocal](Vec x) { return x * Vec(sum_reciprocal); },
      out_data + i * oStrideB + j * oStrideH + m * oStrideM + row * oStrideM,
      dst_data + row * headSize,
      dst_data + row * rHeadSize,
      headSize);
  }
  // Store logsumexp for backward
@ -478,7 +794,9 @@ void cpu_flash_attention(
  data_index_step(i, batchSize, j, num_head, k, qSlice);
  }
});

if (need_pack) {
  cpublas::brgemm_release();
}
}
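For reference, the rescaling this loop performs is the usual streaming-softmax update (notation ours, not from the source): with running per-row max m_r, running denominator l_r and running output o_r, one KV block with scores s_rj applies

\[
\begin{aligned}
m_r^{\text{new}} &= \max\bigl(m_r,\ \max_j s_{rj}\bigr), \qquad
\tilde p_{rj} = \exp(s_{rj} - m_r^{\text{new}}),\\
\ell_r^{\text{new}} &= e^{\,m_r - m_r^{\text{new}}}\,\ell_r + \sum_j \tilde p_{rj}, \qquad
o_r^{\text{new}} = e^{\,m_r - m_r^{\text{new}}}\,o_r + \sum_j \tilde p_{rj}\, v_j,
\end{aligned}
\]

and the final row output is o_r / l_r. In the code, exp(m_r - m_r^new) is exp_tmp, l_r lives in qk_sum_data and o_r in dst_data.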
template <typename scalar_t, typename mask_t, int64_t q_split_size, int64_t kv_split_size>
@ -826,6 +1144,13 @@ void cpu_flash_attention_backward(
  AT_PRIVATE_CASE_TYPE_USING_HINT( \
      at::ScalarType::Half, mask_t, __VA_ARGS__))

#define FLASH_ATTENTION_KERNEL(FNAME, PACK, TYPE1, TYPE2, SEQ1, SEQ2, ...) \
  if (PACK) { \
    FNAME<TYPE1, TYPE2, SEQ1, SEQ2, true>(__VA_ARGS__); \
  } else { \
    FNAME<TYPE1, TYPE2, SEQ1, SEQ2>(__VA_ARGS__); \
  }

void flash_attention_kernel_impl(
    const Tensor& output,
    const Tensor& logsumexp,
@ -838,33 +1163,37 @@ void flash_attention_kernel_impl(
    std::optional<double> scale) {
  auto q_seq_len = query.size(2);

  // When q_seq_len and k_seq_len are long enough,
  // cpu_flash_attention with pack has better performance.
  bool could_pack = (query.scalar_type() == kHalf && cpublas::need_pack(kHalf));

  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, query.scalar_type(), "flash_attention", [&] {
    if (!attn_mask.has_value()) {
      if (q_seq_len >= 768) {
        cpu_flash_attention<scalar_t, scalar_t, 256, 512>(
        FLASH_ATTENTION_KERNEL(cpu_flash_attention, could_pack, scalar_t, scalar_t, 256, 512,
            output, logsumexp, query, key, value,
            dropout_p, is_causal, attn_mask, scale);
      } else if (q_seq_len >= 192) {
        cpu_flash_attention<scalar_t, scalar_t, 64, 512>(
        FLASH_ATTENTION_KERNEL(cpu_flash_attention, could_pack, scalar_t, scalar_t, 64, 512,
            output, logsumexp, query, key, value,
            dropout_p, is_causal, attn_mask, scale);
      } else {
        cpu_flash_attention<scalar_t, scalar_t, 32, 512>(
        FLASH_ATTENTION_KERNEL(cpu_flash_attention, could_pack, scalar_t, scalar_t, 32, 512,
            output, logsumexp, query, key, value,
            dropout_p, is_causal, attn_mask, scale);
      }
    } else {
      AT_DISPATCH_MASK_TYPES(attn_mask.value().scalar_type(), "flash_attention_mask", [&]() {
        if (q_seq_len >= 768) {
          cpu_flash_attention<scalar_t, mask_t, 256, 512>(
          FLASH_ATTENTION_KERNEL(cpu_flash_attention, could_pack, scalar_t, mask_t, 256, 512,
              output, logsumexp, query, key, value,
              dropout_p, is_causal, attn_mask, scale);
        } else if (q_seq_len >= 192) {
          cpu_flash_attention<scalar_t, mask_t, 64, 512>(
          FLASH_ATTENTION_KERNEL(cpu_flash_attention, could_pack, scalar_t, mask_t, 64, 512,
              output, logsumexp, query, key, value,
              dropout_p, is_causal, attn_mask, scale);
        } else {
          cpu_flash_attention<scalar_t, mask_t, 32, 512>(
          FLASH_ATTENTION_KERNEL(cpu_flash_attention, could_pack, scalar_t, mask_t, 32, 512,
              output, logsumexp, query, key, value,
              dropout_p, is_causal, attn_mask, scale);
        }
@ -873,6 +1202,8 @@ void flash_attention_kernel_impl(
      });
    }

#undef FLASH_ATTENTION_KERNEL

void flash_attention_backward_kernel_impl(
    const at::Tensor& grad_q,
    const at::Tensor& grad_k,
@ -159,6 +159,12 @@ inline void transpose<float>(int64_t M, int64_t N, const float* src, int64_t ld_
  TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
  fbgemm::transpose_simd<float>(M, N, src, ld_src, dst, ld_dst);
}

template <>
inline void transpose<uint16_t>(int64_t M, int64_t N, const uint16_t* src, int64_t ld_src, uint16_t* dst, int64_t ld_dst) {
  TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
  fbgemm::transpose_simd<uint16_t>(M, N, src, ld_src, dst, ld_dst);
}
#endif

template <typename index_t, typename F>
@ -964,9 +964,9 @@ ScalingType get_scaling_type(

} // namespace

// Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax
// Computes matrix multiply + bias while applying scaling to input and output matrices
// Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
// If output matrix type is 16 or 32-bit type, neither scale_result is applied nor amax is computed.
// If output matrix type is 16 or 32-bit type, scale_result is not applied.
// Known limitations:
//  - Only works if mat1 is row-major and mat2 is column-major
//  - Only works if matrices sizes are divisible by 32
@ -1068,9 +1068,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
  const auto out_dtype_ = args.result->scalar_type();
  TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");

  // Some scaled_gemms require an amax to populate, so create one here
  Tensor amax = at::empty({0}, mat1.options().dtype(ScalarType::Float));

#ifdef USE_ROCM
  auto tuning_ctx = at::cuda::tunable::getTuningContext();
  if (tuning_ctx->IsTunableOpEnabled()) {
@ -1126,7 +1123,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
    params.c_scale_ptr = scale_result ? scale_result->data_ptr() : nullptr;
    params.ldc = args.result_ld;
    params.c_dtype = out_dtype_;
    params.amax_ptr = amax.data_ptr();
    params.use_fast_accum = use_fast_accum;
    if (transa_ && transb_) {
      TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T)
@ -1150,11 +1146,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
  else
#endif
  {
#if defined(USE_ROCM) && ROCM_VERSION >= 60200
    // hipBlasLT requires scaleD to be set to something in order to use AMAX
    auto dummy_options = TensorOptions().dtype(kFloat).device(kCUDA);
    auto dummy_scale = at::ones(1, dummy_options);
#endif
    at::cuda::blas::scaled_gemm(
        args.transa,
        args.transb,
@ -1172,14 +1163,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
        bias ? bias->data_ptr(): nullptr,
        bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_,
        args.result->data_ptr(),
#if defined(USE_ROCM) && ROCM_VERSION >= 60200
        scale_result ? scale_result->data_ptr() : dummy_scale.data_ptr(),
#else
        scale_result ? scale_result->data_ptr() : nullptr,
#endif
        args.result_ld,
        out_dtype_,
        amax.data_ptr(),
        use_fast_accum);
  }
@ -102,7 +102,7 @@ void col2im_out_cuda_template(
  output.resize_({batch_size, n_output_plane, output_height, output_width});
  int64_t output_batch_stride = output.stride(0);

  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16,
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kBool,
      input.scalar_type(), "col2im_out_cuda", [&] {
    int64_t height_col = (output_height + 2 * pad_height -
        (dilation_height * (kernel_height - 1) + 1)) /
@ -103,7 +103,7 @@ static void im2col_out_cuda_template(
  output.resize_({batch_size, n_output_plane, output_length});

  // Launch kernel
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16,
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kBool,
      input.scalar_type(), "im2col_out_cuda", [&] {
    Tensor input_n;
    Tensor output_n;
@ -1092,7 +1092,11 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
}

constexpr int min_values_per_thread = 16;
#ifndef USE_ROCM
constexpr int max_values_per_thread = 256;
#else
constexpr int max_values_per_thread = 1024;
#endif

if (config.values_per_thread() >= block_height * 16 || config.values_per_thread() >= max_values_per_thread) {
  // Divide the input across warps in a thread-block, if that leaves at least
@ -22,6 +22,7 @@ void run_cudnn_SDP_fprop(
    const Tensor& q,
    const Tensor& k,
    const Tensor& v,
    const std::optional<Tensor>& attn_bias,
    Tensor& softmaxstats,
    Tensor& o,
    Tensor& dropoutseed,
@ -43,6 +44,7 @@ void run_cudnn_SDP_bprop(
    const Tensor& q,
    const Tensor& k,
    const Tensor& v,
    const std::optional<Tensor>& attn_bias,
    const Tensor& o,
    const Tensor& dO,
    const Tensor& softmaxstats,
@ -86,9 +88,9 @@ using graph_and_tensors = std::tuple<
    std::shared_ptr<fe::graph::Tensor_attributes>, // Q,
    std::shared_ptr<fe::graph::Tensor_attributes>, // K,
    std::shared_ptr<fe::graph::Tensor_attributes>, // V,
    std::optional<std::shared_ptr<fe::graph::Tensor_attributes>>, // Bias
    std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale,
    // TODO(eqy): additional options
    // std::shared_ptr<fe::graph::Tensor_attributes>, // Bias,
    // std::shared_ptr<fe::graph::Tensor_attributes>, // SEQ_LEN_Q,
    // std::shared_ptr<fe::graph::Tensor_attributes>, // SEQ_LEN_KV,
    std::shared_ptr<fe::graph::Tensor_attributes>, // Seed,
@ -104,7 +106,8 @@ using graph_and_tensors_backward = std::tuple<
    std::shared_ptr<fe::graph::Tensor_attributes>, // Q,
    std::shared_ptr<fe::graph::Tensor_attributes>, // K,
    std::shared_ptr<fe::graph::Tensor_attributes>, // V,
    std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale
    std::optional<std::shared_ptr<fe::graph::Tensor_attributes>>, // Bias,
    std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale,
    std::shared_ptr<fe::graph::Tensor_attributes>, // Seed,
    std::shared_ptr<fe::graph::Tensor_attributes>, // Offset,
    std::shared_ptr<fe::graph::Tensor_attributes>, // O,
@ -126,6 +129,8 @@ struct MHAParams {
    std::array<int, MAX_MHA_DIM> q_stride;
    std::array<int, MAX_MHA_DIM> k_stride;
    std::array<int, MAX_MHA_DIM> v_stride;
    std::array<int, MAX_MHA_DIM> bias_dim;
    std::array<int, MAX_MHA_DIM> bias_stride;
    int64_t b;
    int64_t h;
    int64_t s_q;
@ -135,6 +140,9 @@ struct MHAParams {
    double dropout_probability;
    bool is_causal;
    bool return_softmaxstats;
    // might be redundant if we take 0 dim/stride
    // as signaling no-bias
    bool has_attn_bias;
};

void setMHAParams(
@ -148,6 +156,7 @@ void setMHAParams(
    const Tensor& q,
    const Tensor& k,
    const Tensor& v,
    const std::optional<Tensor>& attn_bias,
    double dropout_probability,
    bool is_causal,
    bool return_softmaxstats) {
@ -166,6 +175,7 @@ void setMHAParams(
  params.dropout_probability = dropout_probability;
  params.is_causal = is_causal;
  params.return_softmaxstats = return_softmaxstats;
  params.has_attn_bias = attn_bias.has_value();
  TORCH_INTERNAL_ASSERT(
      q.sizes().size() == MAX_MHA_DIM,
      "Q tensor has unexpected number of dims, please report a bug to PyTorch.");
@ -190,6 +200,17 @@ void setMHAParams(
  std::copy(k.strides().begin(), k.strides().end(), params.k_stride.begin());
  std::copy(v.sizes().begin(), v.sizes().end(), params.v_dim.begin());
  std::copy(v.strides().begin(), v.strides().end(), params.v_stride.begin());
  // uninit is OK as the struct is memset 0'd
  if (params.has_attn_bias) {
    std::copy(
        attn_bias.value().sizes().begin(),
        attn_bias.value().sizes().end(),
        params.bias_dim.begin());
    std::copy(
        attn_bias.value().strides().begin(),
        attn_bias.value().strides().end(),
        params.bias_stride.begin());
  }
}
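Folding the bias shape and strides (plus has_attn_bias) into MHAParams means the cuDNN graph cache key below changes whenever the attention-bias layout changes, so a cached execution graph built without a bias is never reused for a call that supplies one.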
struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
|
||||
@ -203,6 +224,7 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
double dropout_probability,
|
||||
bool is_causal,
|
||||
bool return_softmaxstats) {
|
||||
@ -217,6 +239,7 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
attn_bias,
|
||||
dropout_probability,
|
||||
is_causal,
|
||||
return_softmaxstats);
|
||||
@ -285,6 +308,7 @@ auto build_graph_and_tensors(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
Tensor& softmaxstats,
|
||||
Tensor& o,
|
||||
Tensor& dropoutseed,
|
||||
@ -301,36 +325,6 @@ auto build_graph_and_tensors(
|
||||
mha_graph->set_io_data_type(dtype)
|
||||
.set_intermediate_data_type(fe::DataType_t::FLOAT)
|
||||
.set_compute_data_type(fe::DataType_t::FLOAT);
|
||||
auto Q = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("Q")
|
||||
.set_dim(std::vector<int64_t>(
|
||||
q.sizes().data(), q.sizes().data() + q.sizes().size()))
|
||||
.set_stride(fixSizeOneDimStrideSDPA(
|
||||
q.sizes(),
|
||||
std::vector<int64_t>(
|
||||
q.strides().data(),
|
||||
q.strides().data() + q.strides().size()))));
|
||||
auto K = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("K")
|
||||
.set_dim(std::vector<int64_t>(
|
||||
k.sizes().data(), k.sizes().data() + k.sizes().size()))
|
||||
.set_stride(fixSizeOneDimStrideSDPA(
|
||||
k.sizes(),
|
||||
std::vector<int64_t>(
|
||||
k.strides().data(),
|
||||
k.strides().data() + k.strides().size()))));
|
||||
auto V = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("V")
|
||||
.set_dim(std::vector<int64_t>(
|
||||
v.sizes().data(), v.sizes().data() + v.sizes().size()))
|
||||
.set_stride(fixSizeOneDimStrideSDPA(
|
||||
v.sizes(),
|
||||
std::vector<int64_t>(
|
||||
v.strides().data(),
|
||||
v.strides().data() + v.strides().size()))));
|
||||
auto attn_scale =
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Attn_scale")
|
||||
@@ -338,11 +332,6 @@ auto build_graph_and_tensors(
.set_stride({1, 1, 1, 1})
.set_is_pass_by_value(true)
.set_data_type(fe::DataType_t::FLOAT));
// TODO(eqy): support bias in the future in a follow-up PR
// auto bias = mha_graph->tensor(fe::graph::Tensor_attributes()
// .set_name("bias")
// .set_dim({b, 1, s_q, s_kv})
// .set_stride({s_q * s_kv, s_q * s_kv, s_kv, 1}));
auto seed = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seed")
.set_dim({1, 1, 1, 1})
@@ -360,11 +349,30 @@ auto build_graph_and_tensors(
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_dropout(dropout_probability, seed, offset);
// Optional bias in flash attention is only supported 8.9.3 onwards
if (cudnnGetVersion() >= 8904) {
// scaled_dot_product_flash_attention_options.set_alibi_mask(true);
auto Q = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("Q")
.set_dim(q.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec())));
auto K = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("K")
.set_dim(k.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec())));
auto V = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("V")
.set_dim(v.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec())));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("bias")
.set_dim(attn_bias.value().sizes().vec())
.set_stride(attn_bias.value().strides().vec()));
scaled_dot_product_flash_attention_options.set_bias(bias.value());
}
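The new forward path only materializes a bias node when the caller actually supplied one, holding it as std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> so the same handle can later be bound in the variant pack (see the run_cudnn_SDP_fprop hunks further down). A minimal standalone sketch of that optional-node pattern, with NodeAttr standing in for the cudnn_frontend tensor attribute type:

#include <iostream>
#include <memory>
#include <optional>
#include <unordered_map>

// Stand-in for fe::graph::Tensor_attributes: just a named node handle.
struct NodeAttr { const char* name; };

int main() {
  // The graph owns nodes through shared_ptr, as cudnn_frontend does.
  auto q = std::make_shared<NodeAttr>(NodeAttr{"Q"});

  // Optional node: only materialized when the caller actually passed a bias.
  std::optional<std::shared_ptr<NodeAttr>> bias;
  bool has_attn_bias = true;  // stands in for attn_bias.has_value()
  if (has_attn_bias) {
    bias = std::make_shared<NodeAttr>(NodeAttr{"bias"});
  }

  // Variant pack maps graph nodes to buffer pointers at execution time.
  float q_buf[1] = {0.f}, bias_buf[1] = {0.f};
  std::unordered_map<std::shared_ptr<NodeAttr>, void*> variant_pack = {
      {q, q_buf}};
  if (bias.has_value()) {
    variant_pack[bias.value()] = bias_buf;  // bound only when present
  }

  std::cout << "variant pack entries: " << variant_pack.size() << "\n";
  return 0;
}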
auto seq_q = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seq_q")
.set_dim({b, 1, 1, 1})
@@ -376,20 +384,9 @@ auto build_graph_and_tensors(
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));

// if (cudnnGetVersion() >= 8903) {
// scaled_dot_product_flash_attention_options.set_bias(bias)
// .set_padding_mask(true)
// .set_seq_len_q(seq_q)
// .set_seq_len_kv(seq_kv);
// }

auto [O, Stats] =
mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options);
O->set_output(true)
.set_dim(std::vector<int64_t>(
o.sizes().data(), o.sizes().data() + o.sizes().size()))
.set_stride(std::vector<int64_t>(
o.strides().data(), o.strides().data() + o.strides().size()));
O->set_output(true).set_dim(o.sizes().vec()).set_stride(o.strides().vec());

if (Stats) {
Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT);
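Several hunks in this file, including the O->set_output(...) lines just above, replace manually built std::vector<int64_t>(ptr, ptr + n) copies of sizes and strides with sizes().vec() / strides().vec(). The two spellings produce the same vector; a minimal sketch, with IntArrayRefLike standing in for at::IntArrayRef:

#include <cassert>
#include <cstdint>
#include <vector>

// Minimal stand-in for at::IntArrayRef: a non-owning view over int64_t.
struct IntArrayRefLike {
  const int64_t* ptr;
  size_t len;
  const int64_t* data() const { return ptr; }
  size_t size() const { return len; }
  // vec() copies the view into an owning vector, matching ArrayRef::vec().
  std::vector<int64_t> vec() const { return {ptr, ptr + len}; }
};

int main() {
  int64_t dims[] = {2, 8, 128, 64};
  IntArrayRefLike sizes{dims, 4};

  // Old form in the diff: construct the vector from raw pointers.
  std::vector<int64_t> verbose(sizes.data(), sizes.data() + sizes.size());
  // New form: let the view do the copy.
  std::vector<int64_t> concise = sizes.vec();

  assert(verbose == concise);
  return 0;
}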
@@ -407,6 +404,7 @@ auto build_graph_and_tensors(
std::move(Q),
std::move(K),
std::move(V),
std::move(bias),
std::move(attn_scale),
std::move(seed),
std::move(offset),
@@ -427,6 +425,7 @@ auto build_graph_and_tensors_backward(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
const Tensor& o,
const Tensor& dO,
const Tensor& softmaxstats,
@@ -447,24 +446,6 @@ auto build_graph_and_tensors_backward(
mha_graph->set_io_data_type(dtype)
.set_intermediate_data_type(fe::DataType_t::FLOAT)
.set_compute_data_type(fe::DataType_t::FLOAT);
auto Q = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("Q")
.set_dim(std::vector<int64_t>(q.sizes().begin(), q.sizes().end()))
.set_stride(
std::vector<int64_t>(q.strides().begin(), q.strides().end())));
auto K = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("K")
.set_dim(std::vector<int64_t>(k.sizes().begin(), k.sizes().end()))
.set_stride(
std::vector<int64_t>(k.strides().begin(), k.strides().end())));
auto V = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("V")
.set_dim(std::vector<int64_t>(v.sizes().begin(), v.sizes().end()))
.set_stride(
std::vector<int64_t>(v.strides().begin(), v.strides().end())));
auto attn_scale =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Attn_scale")
@@ -472,6 +453,31 @@ auto build_graph_and_tensors_backward(
.set_stride({1, 1, 1, 1})
.set_is_pass_by_value(true)
.set_data_type(fe::DataType_t::FLOAT));
auto sdpa_backward_options = fe::graph::SDPA_backward_attributes()
.set_name("CUDNN_SDPA_BACKWARD")
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
auto Q = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Q")
.set_dim(q.sizes().vec())
.set_stride(q.strides().vec()));
auto K = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("K")
.set_dim(k.sizes().vec())
.set_stride(k.strides().vec()));
auto V = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("V")
.set_dim(v.sizes().vec())
.set_stride(v.strides().vec()));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("bias")
.set_dim(attn_bias.value().sizes().vec())
.set_stride(attn_bias.value().strides().vec()));
sdpa_backward_options.set_bias(bias.value());
}
auto Seed = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Seed")
.set_dim({1, 1, 1, 1})
@@ -482,47 +488,27 @@ auto build_graph_and_tensors_backward(
.set_dim({1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto O = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("O")
.set_dim(std::vector<int64_t>(o.sizes().begin(), o.sizes().end()))
.set_stride(
std::vector<int64_t>(o.strides().begin(), o.strides().end())));
auto STATS = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("Stats")
.set_dim(std::vector<int64_t>(
softmaxstats.sizes().begin(), softmaxstats.sizes().end()))
.set_stride(std::vector<int64_t>(
softmaxstats.strides().begin(), softmaxstats.strides().end()))
.set_data_type(fe::DataType_t::FLOAT));
auto DO = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_name("DO")
.set_dim(std::vector<int64_t>(dO.sizes().begin(), dO.sizes().end()))
.set_stride(
std::vector<int64_t>(dO.strides().begin(), dO.strides().end())));
auto sdpa_backward_options = fe::graph::SDPA_backward_attributes()
.set_name("CUDNN_SDPA_BACKWARD")
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
auto O = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("O")
.set_dim(o.sizes().vec())
.set_stride(o.strides().vec()));
auto STATS = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("Stats")
.set_dim(softmaxstats.sizes().vec())
.set_stride(softmaxstats.strides().vec())
.set_data_type(fe::DataType_t::FLOAT));
auto DO = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_name("DO")
.set_dim(dO.sizes().vec())
.set_stride(dO.strides().vec()));
if (dropout_probability != 0.0f) {
sdpa_backward_options.set_dropout(dropout_probability, Seed, Offset);
}
auto [DQ, DK, DV] =
mha_graph->sdpa_backward(Q, K, V, O, DO, STATS, sdpa_backward_options);
DQ->set_output(true)
.set_dim(std::vector<int64_t>(dQ.sizes().begin(), dQ.sizes().end()))
.set_stride(
std::vector<int64_t>(dQ.strides().begin(), dQ.strides().end()));
DK->set_output(true)
.set_dim(std::vector<int64_t>(dK.sizes().begin(), dK.sizes().end()))
.set_stride(
std::vector<int64_t>(dK.strides().begin(), dK.strides().end()));
DV->set_output(true)
.set_dim(std::vector<int64_t>(dV.sizes().begin(), dV.sizes().end()))
.set_stride(
std::vector<int64_t>(dV.strides().begin(), dV.strides().end()));
DQ->set_output(true).set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec());
DK->set_output(true).set_dim(dK.sizes().vec()).set_stride(dK.strides().vec());
DV->set_output(true).set_dim(dV.sizes().vec()).set_stride(dV.strides().vec());
AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle));
AT_CUDNN_FRONTEND_CHECK(
@@ -534,6 +520,7 @@ auto build_graph_and_tensors_backward(
std::move(Q),
std::move(K),
std::move(V),
std::move(bias),
std::move(attn_scale),
std::move(Seed),
std::move(Offset),
@@ -559,6 +546,7 @@ void run_cudnn_SDP_fprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
@@ -573,6 +561,11 @@ void run_cudnn_SDP_fprop(
softmaxstats = at::empty({b, h, s_q}, q.options().dtype(kFloat));
}

// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel()) {
return;
}
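The new early return above skips graph construction entirely when any of q, k, or v has zero elements: there is nothing to compute, and building a cuDNN graph for empty shapes would only add failure modes. A tiny standalone sketch of the same guard-clause shape (numel computed as the product of the dimension sizes; the types are stand-ins):

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Stand-in for Tensor::numel(): product of the dimension extents.
int64_t numel(const std::vector<int64_t>& sizes) {
  return std::accumulate(
      sizes.begin(), sizes.end(), int64_t{1}, std::multiplies<int64_t>());
}

void run_sdp_fprop(const std::vector<int64_t>& q_sizes,
                   const std::vector<int64_t>& k_sizes,
                   const std::vector<int64_t>& v_sizes) {
  // Do nothing if we got 0-element tensors, mirroring the guard in the diff.
  if (!numel(q_sizes) || !numel(k_sizes) || !numel(v_sizes)) {
    std::cout << "empty input, skipping graph build\n";
    return;
  }
  std::cout << "building and executing the attention graph\n";
}

int main() {
  run_sdp_fprop({2, 8, 0, 64}, {2, 8, 128, 64}, {2, 8, 128, 64});    // skipped
  run_sdp_fprop({2, 8, 128, 64}, {2, 8, 128, 64}, {2, 8, 128, 64});  // runs
  return 0;
}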
auto key = MHACacheKeyWrapper(
b,
h,
@@ -583,6 +576,7 @@ void run_cudnn_SDP_fprop(
q,
k,
v,
attn_bias,
dropout_probability,
is_causal,
return_softmaxstats);
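MHACacheKeyWrapper packs the attention configuration (shapes, dtype, dropout, causal flag, and now the bias) into a key so that an already-built graph can be reused instead of rebuilt on every call; the cache lookup itself falls outside this excerpt. A simplified, hypothetical sketch of that memoization shape (the key fields and BuiltGraph are illustrative, not the real MHACacheKeyWrapper layout):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <tuple>

// Illustrative key: a few of the fields that determine the built graph.
using MhaKey = std::tuple<int64_t /*b*/, int64_t /*h*/, int64_t /*s_q*/,
                          int64_t /*s_kv*/, bool /*is_causal*/, bool /*has_bias*/>;

struct BuiltGraph {
  std::string description;  // stands in for the compiled cudnn_frontend graph
};

BuiltGraph build_graph(const MhaKey& key) {
  std::cout << "building graph\n";
  return BuiltGraph{"graph for b=" + std::to_string(std::get<0>(key))};
}

int main() {
  std::map<MhaKey, BuiltGraph> cache;  // per-process graph cache

  MhaKey key{2, 8, 128, 128, /*is_causal=*/true, /*has_bias=*/false};
  for (int step = 0; step < 3; ++step) {
    auto it = cache.find(key);
    if (it == cache.end()) {
      it = cache.emplace(key, build_graph(key)).first;  // built exactly once
    }
    std::cout << "using " << it->second.description << "\n";
  }
  return 0;
}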
@@ -605,13 +599,14 @@ void run_cudnn_SDP_fprop(
q,
k,
v,
attn_bias,
softmaxstats,
o,
dropoutseed,
dropoutoffset,
handle);
}
auto [mha_graph, Q, K, V, attn_scale, seed, offset, O, Stats] =
auto [mha_graph, Q, K, V, bias, attn_scale, seed, offset, O, Stats] =
graph_and_tensors_values;
std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
variant_pack = {
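Because the cached graph and its tensor handles travel together as one tuple, adding the bias handle means every structured-binding unpack of that tuple has to grow a name as well, which is what the paired old/new auto [mha_graph, ...] lines above and their backward counterpart below show. A small sketch of that mechanic with a plain std::tuple (names are illustrative):

#include <iostream>
#include <string>
#include <tuple>

// Before this change the cached bundle carried fewer handles; after it also
// carries the optional bias handle, so every unpack site must name it,
// even callers that never touch the bias.
std::tuple<std::string, std::string, std::string, std::string> make_bundle() {
  return {"graph", "Q", "K", "bias"};
}

int main() {
  auto [graph, Q, K, bias] = make_bundle();  // one name per tuple element
  std::cout << graph << " " << Q << " " << K << " " << bias << "\n";
  return 0;
}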
@@ -619,13 +614,15 @@ void run_cudnn_SDP_fprop(
{K, k.data_ptr()},
{V, v.data_ptr()},
{attn_scale, &scaling_factor},
//{bias, bias.data_ptr()},
{seed, dropoutseed.data_ptr()},
{offset, dropoutoffset.data_ptr()},
{O, o.data_ptr()}};
if (return_softmaxstats) {
variant_pack[Stats] = softmaxstats.data_ptr();
}
if (attn_bias.has_value()) {
variant_pack[bias.value()] = attn_bias.value().data_ptr();
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
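Both launchers finish identically: query the built graph for its scratch-space requirement, grab that many bytes from the CUDA caching allocator, and execute with the variant pack plus the workspace pointer. A schematic of that final step with mock host-side types (MockGraph and the byte buffer stand in for fe::graph::Graph and c10::cuda::CUDACachingAllocator; nothing here talks to a GPU):

#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Mock of the built graph: knows its workspace requirement and can "execute".
struct MockGraph {
  std::size_t get_workspace_size() const { return 1 << 20; }  // 1 MiB scratch
  void execute(const std::unordered_map<std::string, void*>& variant_pack,
               void* workspace) const {
    std::cout << "executing with " << variant_pack.size()
              << " bound tensors and workspace at " << workspace << "\n";
  }
};

int main() {
  MockGraph graph;

  // Bind node names to (here, host) buffers; the real code maps
  // Tensor_attributes handles to device pointers instead.
  float q_buf[4] = {}, o_buf[4] = {};
  std::unordered_map<std::string, void*> variant_pack = {
      {"Q", q_buf}, {"O", o_buf}};

  // Ask for the scratch size, allocate it, and run.
  std::vector<std::byte> workspace(graph.get_workspace_size());
  graph.execute(variant_pack, workspace.data());
  return 0;
}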
@@ -647,6 +644,7 @@ void run_cudnn_SDP_bprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
const Tensor& o,
const Tensor& dO,
const Tensor& softmaxstats,
@@ -655,6 +653,12 @@ void run_cudnn_SDP_bprop(
Tensor& dV,
const Tensor& dropoutseed,
const Tensor& dropoutoffset) {
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel() || !o.numel() || !dO.numel() ||
!softmaxstats.numel()) {
return;
}

Tensor dO_ = dO;
if (!dO.strides()[dO.strides().size() - 1]) {
TORCH_WARN(
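The backward entry point now inspects dO's innermost stride before building the graph: a stride of 0 in the last dimension (as produced by expand-style broadcasting) is not something the kernel can consume directly, so the code warns and presumably continues with a materialized copy dO_ (the body after TORCH_WARN is cut off in this excerpt). A standalone sketch of that detection step only (contiguous_strides is a hypothetical stand-in for whatever "make it dense" step the real code performs on at::Tensor):

#include <cassert>
#include <cstdint>
#include <vector>

// True when the innermost stride is 0, e.g. after expand(), meaning
// neighbouring elements in the last dimension alias the same memory.
bool last_dim_stride_is_zero(const std::vector<int64_t>& strides) {
  return !strides.empty() && strides.back() == 0;
}

// Hypothetical helper: strides for a dense, row-major layout of the sizes.
std::vector<int64_t> contiguous_strides(const std::vector<int64_t>& sizes) {
  std::vector<int64_t> strides(sizes.size(), 1);
  for (int64_t i = static_cast<int64_t>(sizes.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * sizes[i + 1];
  }
  return strides;
}

int main() {
  std::vector<int64_t> sizes{2, 8, 128, 64};
  std::vector<int64_t> expanded{0, 0, 64, 0};  // broadcast-style strides

  if (last_dim_stride_is_zero(expanded)) {
    // The real code would warn here and fall back to a dense copy of dO.
    expanded = contiguous_strides(sizes);
  }
  assert(expanded == (std::vector<int64_t>{65536, 8192, 64, 1}));
  return 0;
}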
@@ -694,6 +698,7 @@ void run_cudnn_SDP_bprop(
q,
k,
v,
attn_bias,
dropout_probability,
is_causal,
true);
@@ -715,6 +720,7 @@ void run_cudnn_SDP_bprop(
q,
k,
v,
attn_bias,
o,
dO_,
softmaxstats,
@@ -726,8 +732,20 @@ void run_cudnn_SDP_bprop(
handle);
}
auto
[mha_graph, Q, K, V, attn_scale, Seed, Offset, O, Do, Stats, Dq, Dk, Dv] =
graph_and_tensors_backward_values;
[mha_graph,
Q,
K,
V,
bias,
attn_scale,
Seed,
Offset,
O,
Do,
Stats,
Dq,
Dk,
Dv] = graph_and_tensors_backward_values;
std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
variant_pack = {// inputs
{Q, q.data_ptr()},
@@ -746,6 +764,9 @@ void run_cudnn_SDP_bprop(
variant_pack[Seed] = dropoutseed.data_ptr();
variant_pack[Offset] = dropoutoffset.data_ptr();
}
if (attn_bias.has_value()) {
variant_pack[bias.value()] = attn_bias.value().data_ptr();
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);

@@ -18,6 +18,7 @@ void run_cudnn_SDP_fprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
Tensor& softmaxstats,
Tensor& o,
Tensor& dropoutseed,
@@ -36,6 +37,7 @@ void run_cudnn_SDP_bprop(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const std::optional<Tensor>& attn_bias,
const Tensor& o,
const Tensor& dO,
const Tensor& softmaxstats,
Some files were not shown because too many files have changed in this diff.