Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-21 21:49:24 +08:00
Compare commits: 233 commits, v2.5.0-rc5...invoke-sub
SHA1 | Author | Date | |
---|---|---|---|
08d8d957e7 | |||
a30d5ba16c | |||
46935c8241 | |||
4f407c1884 | |||
2e461e54e8 | |||
a3d827a28c | |||
4312794b92 | |||
b856f3539b | |||
835e7bb077 | |||
b6d6aa49b8 | |||
deee21cb78 | |||
3f69410976 | |||
18f9331e5d | |||
bc0f330169 | |||
7834c0bb2c | |||
6ef49fe8f1 | |||
a15774563b | |||
564d00f364 | |||
ae02d663cd | |||
ad2f0e9f81 | |||
21ffa18ad1 | |||
2519e5a8de | |||
ba6e0f31ab | |||
7ed0563cad | |||
eb7dd91dd1 | |||
3f30360d05 | |||
4734e356d6 | |||
ac169795a9 | |||
fca58bfda1 | |||
dc71e7a7d4 | |||
1cdf658f4a | |||
b5c52e96e8 | |||
ea2ecab15b | |||
2f53d570fe | |||
31007cf200 | |||
c56728b643 | |||
7d5e0dd4b1 | |||
2af3b8ffd8 | |||
0c080cb2c7 | |||
30b007bea3 | |||
fafdd588f2 | |||
e504fb7069 | |||
b346e99376 | |||
7dc1788396 | |||
9fd54d787d | |||
b38be727eb | |||
e54b559e88 | |||
eea5e6ff0f | |||
6df91b5917 | |||
0cdc6a8dcd | |||
6cdc70bccd | |||
e6b68359d7 | |||
1c04cbfba6 | |||
062681a0ed | |||
8c356ce3da | |||
bf68e16e94 | |||
d732df7e56 | |||
c9de2efde6 | |||
1f15c0c7a5 | |||
a72124add9 | |||
10ca4c0564 | |||
d3aab9642b | |||
67a929eea8 | |||
f576960bbc | |||
1aba224cfd | |||
d383325392 | |||
00dc7d4356 | |||
1760bbc259 | |||
fb9d8e3248 | |||
aaabfc8930 | |||
63d6cd351a | |||
3de9e474df | |||
3e1a4ea132 | |||
e157ce3ebb | |||
b897ab0540 | |||
3d24313809 | |||
cd472bb1e3 | |||
f032135bbf | |||
525bec804c | |||
83c594ebd6 | |||
c1277945d3 | |||
dab7d646d5 | |||
7647c398ff | |||
d67cc58181 | |||
dddaadac6c | |||
02169364e1 | |||
c30042fbeb | |||
6700175531 | |||
de8a8653c0 | |||
86335e9135 | |||
14e3f3c062 | |||
9852c6d236 | |||
6354271178 | |||
12902f6ecf | |||
3decb676aa | |||
8d68a02905 | |||
28330a8a39 | |||
eaba287adb | |||
f5f1d0a753 | |||
5bc238c73e | |||
79223114db | |||
7cfd23636c | |||
0d1d69fd25 | |||
21a64d57b1 | |||
1a74952925 | |||
a130ed828a | |||
eb0fe02933 | |||
d270e2d240 | |||
16b37b309f | |||
13ee85ca5e | |||
94d2471d1f | |||
5ca46be15e | |||
9a04cfbeff | |||
66db61f0d1 | |||
c025f7becc | |||
8c4e1148b8 | |||
e20ee39558 | |||
74fd1bf965 | |||
5d964a5eb7 | |||
118d7e1480 | |||
dd47f6f623 | |||
e05ea2b179 | |||
ad75b09d89 | |||
a2cb9b7331 | |||
451eaf0ff2 | |||
09519eb195 | |||
5314ae2660 | |||
da587de9cb | |||
82a4df2d5f | |||
18a9030952 | |||
03f23d07b4 | |||
8c738c9270 | |||
7ddacaf40a | |||
183c32fd3b | |||
3ab12e2596 | |||
596e93b506 | |||
f96e8041b1 | |||
7cf9c81918 | |||
49e0b88aab | |||
ee8c5cc1cc | |||
ce4d146f56 | |||
0226fcaacf | |||
4cde5096c4 | |||
443c015393 | |||
4ae6d7c18f | |||
3084b7b5c0 | |||
5c3d0a2ded | |||
c608b17f60 | |||
444b52ff40 | |||
160c228a4b | |||
0d15122092 | |||
6a3edfcc1e | |||
356f14e7b7 | |||
34dc8f69a1 | |||
cd9ee49a69 | |||
26e5572dd2 | |||
693897df42 | |||
3bf6be457d | |||
492f064f15 | |||
29408ea81a | |||
02dcb07765 | |||
5c38aa72c0 | |||
5134ba7458 | |||
e48ee2cf50 | |||
eb38ee21ba | |||
8057b72763 | |||
7b17918dc9 | |||
66c45f3ed9 | |||
0a9d55d2ee | |||
4ca65d3323 | |||
c932b39739 | |||
1f15973657 | |||
fc88ba260f | |||
bf8d0e3107 | |||
3a1239a248 | |||
4f9f1775d8 | |||
5e0788befb | |||
440f8f57af | |||
e004d539da | |||
c4b84a46a9 | |||
bc1b8f094d | |||
f65a564fa2 | |||
386b313028 | |||
6d7cbc20d2 | |||
ca16956b20 | |||
67735d1ee8 | |||
6e13f5eb38 | |||
23b1486185 | |||
9902b349cb | |||
5a9ac83e94 | |||
1adf28a5c0 | |||
c18052da0e | |||
c0d2f991b1 | |||
e889252493 | |||
6546c6186d | |||
1d9fefff19 | |||
7ec17b49cf | |||
146921007a | |||
a71e5509bc | |||
136e28f616 | |||
39a61795e3 | |||
b4feec9782 | |||
d81731615f | |||
e2f9a83b85 | |||
70a65a8bd5 | |||
689d278543 | |||
9b764491e3 | |||
cbc6b30a24 | |||
5b368de7f7 | |||
09a5e88bef | |||
a4e6a0b240 | |||
4ab232d0c4 | |||
2032f107d7 | |||
5f7d956362 | |||
a13c118994 | |||
21241bfeee | |||
73a6fc6e30 | |||
09287e3af4 | |||
16c3b8f87c | |||
9c6dff4941 | |||
0eb425a563 | |||
011cae9570 | |||
dfb2b661f7 | |||
5a69e0ebbe | |||
5e145861f2 | |||
c35b953531 | |||
dced0d6d9f | |||
c0436c5701 | |||
60e8dc4374 | |||
e6c3f58584 | |||
90e12cf63d | |||
44c08f4984 | |||
b6186353c6 |
@@ -1,5 +1,5 @@
-0.6b
+0.7b
 manylinux_2_17
 rocm6.2
-7f07e8a1cb1f99627eb6d77f5c0e9295c775f3c7
-e4ab195d2bd19e939c675a13280c29714c6ef9f2cf420690da150fa0cac043b1
+9be04068c3c0857a4cfd17d7e39e71d0423ebac2
+3e9e1959d23b93d78a08fcc5f868125dc3854dece32fd9458be9ef4467982291
@@ -1 +1 @@
-cc981feba10a3f4c2e46f3fe368e8fcf5f5643df
+91b14bf5593cf58a8541f3e6b9125600a867d4ef
@@ -1 +1 @@
-757b6a61e7df814ba806f498f8bb3160f84b120c
+5fe38ffd73c2ac6ed6323b554205186696631c6f
@@ -4,12 +4,12 @@ set -ex
 
 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
 
-TARBALL='aotriton.tar.bz2'
+TARBALL='aotriton.tar.gz'
 # This read command alwasy returns with exit code 1
 read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
 ARCH=$(uname -m)
 AOTRITON_INSTALL_PREFIX="$1"
-AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.bz2"
+AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.gz"
 
 cd "${AOTRITON_INSTALL_PREFIX}"
 # Must use -L to follow redirects
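For orientation only (not part of the diff): combining the new pins above with the updated download URL template, the fetched artifact resolves to something like the sketch below. The arch value is an assumption about the CI builder; everything else comes from the two hunks above.

# Sketch of how install_aotriton.sh assembles the release URL from aotriton_version.txt.
ver, manylinux, rocmbase = "0.7b", "manylinux_2_17", "rocm6.2"  # new pinned values
arch = "x86_64"  # assumption: output of $(uname -m) on the x86_64 CI builder
url = (
    "https://github.com/ROCm/aotriton/releases/download/"
    f"{ver}/aotriton-{ver}-{manylinux}_{arch}-{rocmbase}-shared.tar.gz"
)
print(url)
# https://github.com/ROCm/aotriton/releases/download/0.7b/aotriton-0.7b-manylinux_2_17_x86_64-rocm6.2-shared.tar.gz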
@@ -337,3 +337,8 @@ onnxscript==0.1.0.dev20240817
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
+
+parameterized==0.8.1
+#Description: Parameterizes unittests, both the tests themselves and the entire testing class
+#Pinned versions:
+#test that import:
@@ -1 +1 @@
-3.0.0
+3.1.0
@@ -43,6 +43,9 @@ python -m pip install z3-solver==4.12.2.0
 # Install tlparse for test\dynamo\test_structured_trace.py UTs.
 python -m pip install tlparse==0.3.25
 
+# Install parameterized
+python -m pip install parameterized==0.8.1
+
 run_tests() {
   # Run nvidia-smi if available
   for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
.github/merge_rules.yaml (vendored, 12 lines changed)
@@ -86,6 +86,18 @@
   - pull
   - inductor
 
+- name: OSS CI / pytorchbot / slow tests
+  patterns:
+  - test/slow_tests.json
+  approved_by:
+  - pytorchbot
+  ignore_flaky_failures: false
+  mandatory_checks_name:
+  - EasyCLA
+  - Lint
+  - pull
+  - slow
+
 - name: OSS CI /pytorchbot / Executorch
   patterns:
   - .ci/docker/ci_commit_pins/executorch.txt
@@ -31,3 +31,4 @@ optree==0.12.1
 # NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in
 # which the stringify metadata is wrong when escaping double quote
 protobuf==3.20.2
+parameterized==0.8.1
@@ -412,8 +412,8 @@ def generate_wheels_matrix(
                     ),
                 }
             )
-            # Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
-            if python_version == "3.10" and arch_version == "12.1":
+            # Special build building to use on Colab. Python 3.11 for 12.1 CUDA
+            if python_version == "3.11" and arch_version == "12.1":
                 ret.append(
                     {
                         "python_version": python_version,
.github/scripts/generate_ci_workflows.py (vendored, 8 lines changed)
@@ -70,17 +70,15 @@ class BinaryBuildWorkflow:
             )
         else:
             self.build_environment = f"{self.os}-binary-{self.package_type}"
+            if self.use_split_build:
+                # added to distinguish concurrency groups
+                self.build_environment += "-split"
 
     def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
         output_file_path = (
             GITHUB_DIR
             / f"workflows/generated-{self.build_environment}-{self.branches}.yml"
         )
-        if self.use_split_build:
-            output_file_path = (
-                GITHUB_DIR
-                / f"workflows/generated-{self.build_environment}-{self.branches}-split.yml"
-            )
         with open(output_file_path, "w") as output_file:
             GENERATED = "generated"  # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file
             output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"])
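A rough illustration of what relocating the "-split" suffix means for the generated file names; the os, package type, and branch values below are placeholders, not taken from this diff.

# With the suffix folded into build_environment, the split variant now gets a
# "-split" build environment rather than a "-split" suffix on the file name itself.
os_name, package_type, branches = "linux", "manywheel", "nightly"  # placeholder values
build_environment = f"{os_name}-binary-{package_type}"
use_split_build = True
if use_split_build:
    build_environment += "-split"
print(f"workflows/generated-{build_environment}-{branches}.yml")
# workflows/generated-linux-binary-manywheel-split-nightly.yml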
.github/scripts/github_utils.py (vendored, 8 lines changed)
@@ -168,6 +168,14 @@ def gh_post_commit_comment(
     )
 
 
+def gh_close_pr(org: str, repo: str, pr_num: int, dry_run: bool = False) -> None:
+    url = f"{GITHUB_API_URL}/repos/{org}/{repo}/pulls/{pr_num}"
+    if dry_run:
+        print(f"Dry run closing PR {pr_num}")
+    else:
+        gh_fetch_url(url, method="PATCH", data={"state": "closed"})
+
+
 def gh_delete_comment(org: str, repo: str, comment_id: int) -> None:
     url = f"{GITHUB_API_URL}/repos/{org}/{repo}/issues/comments/{comment_id}"
     gh_fetch_url(url, method="DELETE")
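A minimal usage sketch for the new helper, assuming it is run from .github/scripts so that github_utils is importable; the org, repo, and PR number below are placeholders.

from github_utils import gh_close_pr

# dry_run=True only prints what would happen; nothing is sent to the GitHub API,
# which makes it safe for exercising mergebot changes locally.
gh_close_pr("pytorch", "pytorch", pr_num=12345, dry_run=True)

# With dry_run=False the helper PATCHes the pull request to state "closed", so a
# token with permission to close PRs must be available to gh_fetch_url.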
.github/scripts/runner_determinator.py (vendored, 350 lines changed)
@@ -3,49 +3,94 @@
 """
 This runner determinator is used to determine which set of runners to run a
 GitHub job on. It uses the first comment of a GitHub issue (by default
-https://github.com/pytorch/test-infra/issues/5132) as a user list to determine
-which users will get their jobs to run on experimental runners. This user list
-is also a comma separated list of additional features or experiments which the
-user could be opted in to.
+https://github.com/pytorch/test-infra/issues/5132) to define the configuration
+of which runners should be used to run which job.
+
+The configuration has two parts, the settings and a list of opted-in users,
+separated by a line containing "---". If the line is not present, the
+settings are considered to be empty with only the second part, the user
+list, defined.
+
+The first part is a YAML block that defines the rollout settings. This can be
+used to define any settings that are needed to determine which runners to use.
+It's fields are defined by the RolloutSettings class below.
+
+The second part is a list of users who are explicitly opted in to the LF fleet.
+The user list is also a comma separated list of additional features or
+experiments which the user could be opted in to.
 
 The user list has the following rules:
 
-- Users are GitHub usernames with the @ prefix
-- If the first line is a "*" then all users will use the new runners
-- If the first line is a "!" then all users will use the old runners
+- Users are GitHub usernames, which must start with the @ prefix
 - Each user is also a comma-separated list of features/experiments to enable
-- A "#" prefix indicates the user is opted out of the new runners but is opting
-  into features/experiments.
+- A "#" prefix opts the user out of all experiments
 
-Example user list:
+Example config:
+# A list of experiments that can be opted into.
+# This defines the behavior they'll induce when opted into.
+# Expected syntax is:
+# [experiment_name]: # Name of the experiment. Also used for the label prefix.
+# rollout_perc: [int] # % of workflows to run with this experiment when users are not opted in.
 
-@User1
-@User2,amz2023
-#@UserOptOutOfNewRunner,amz2023
+experiments:
+  lf:
+    rollout_percent: 25
+
+---
+
+# Opt-ins:
+# Users can opt into the LF fleet by adding their GitHub username to this list
+# and specifying experiments to enable in a comma-separated list.
+# Experiments should be from the above list.
+
+@User1,lf,split_build
+@User2,lf
+@User3,split_build
 """
 
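As a rough illustration of the format described above (made-up issue body and usernames; only the helper names come from the functions added later in this file), the new parsing helpers behave like this, mirroring the accompanying test file:

import runner_determinator as rd

# Hypothetical issue body: a YAML settings block, a "---" separator, then opted-in users.
rollout_state = """
experiments:
  lf:
    rollout_perc: 0
  otherExp:
    rollout_perc: 0
---
@User1,lf
@User2,lf,otherExp
"""

settings = rd.parse_settings(rollout_state)  # Settings holding the two experiments above
users = rd.parse_users(rollout_state)        # {"User1": ["lf"], "User2": ["lf", "otherExp"]}

# The "lf" fleet experiment is always listed first in the resulting runner label prefix.
print(rd.get_runner_prefix(rollout_state, ["User2"]))  # prints "lf.otherExp."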
import logging
|
||||
import os
|
||||
import random
|
||||
from argparse import ArgumentParser
|
||||
from logging import LogRecord
|
||||
from typing import Any, Iterable
|
||||
from typing import Any, Dict, Iterable, List, NamedTuple, Tuple
|
||||
|
||||
import yaml
|
||||
from github import Auth, Github
|
||||
from github.Issue import Issue
|
||||
|
||||
|
||||
WORKFLOW_LABEL_META = "" # use meta runners
|
||||
DEFAULT_LABEL_PREFIX = "" # use meta runners
|
||||
WORKFLOW_LABEL_LF = "lf." # use runners from the linux foundation
|
||||
WORKFLOW_LABEL_LF_CANARY = "lf.c." # use canary runners from the linux foundation
|
||||
|
||||
RUNNER_AMI_LEGACY = ""
|
||||
RUNNER_AMI_AMZ2023 = "amz2023"
|
||||
|
||||
GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
|
||||
GH_OUTPUT_KEY_AMI = "runner-ami"
|
||||
GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
|
||||
|
||||
|
||||
SETTING_EXPERIMENTS = "experiments"
|
||||
|
||||
LF_FLEET_EXPERIMENT = "lf"
|
||||
CANARY_FLEET_SUFFIX = ".c"
|
||||
|
||||
|
||||
class Experiment(NamedTuple):
|
||||
rollout_perc: float = (
|
||||
0 # Percentage of workflows to experiment on when user is not opted-in.
|
||||
)
|
||||
|
||||
# Add more fields as needed
|
||||
|
||||
|
||||
class Settings(NamedTuple):
|
||||
"""
|
||||
Settings for the experiments that can be opted into.
|
||||
"""
|
||||
|
||||
experiments: Dict[str, Experiment] = {}
|
||||
|
||||
|
||||
class ColorFormatter(logging.Formatter):
|
||||
"""Color codes the log messages based on the log level"""
|
||||
|
||||
@ -172,85 +217,180 @@ def is_exception_branch(branch: str) -> bool:
|
||||
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
|
||||
|
||||
|
||||
def get_fleet(rollout_state: str, workflow_requestors: Iterable[str]) -> str:
|
||||
"""
|
||||
Determines if the job should run on the LF fleet or the Meta fleet
|
||||
|
||||
Returns:
|
||||
The appropriate label prefix for the runner, corresponding to the fleet to use.
|
||||
This gets prefixed to the very start of the runner label.
|
||||
"""
|
||||
|
||||
def load_yaml(yaml_text: str) -> Any:
|
||||
try:
|
||||
if rollout_state[0] == "!":
|
||||
log.info("LF Workflows are disabled for everyone. Using meta runners.")
|
||||
return WORKFLOW_LABEL_META
|
||||
elif rollout_state[0] == "*":
|
||||
log.info("LF Workflows are enabled for everyone. Using LF runners.")
|
||||
return WORKFLOW_LABEL_LF
|
||||
else:
|
||||
all_opted_in_users = {
|
||||
usr_raw.strip("\n\t@ ").split(",")[0]
|
||||
for usr_raw in rollout_state.split()
|
||||
}
|
||||
opted_in_requestors = {
|
||||
usr for usr in workflow_requestors if usr in all_opted_in_users
|
||||
}
|
||||
if opted_in_requestors:
|
||||
log.info(
|
||||
f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
|
||||
)
|
||||
return WORKFLOW_LABEL_LF
|
||||
else:
|
||||
log.info(
|
||||
f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners."
|
||||
)
|
||||
return WORKFLOW_LABEL_META
|
||||
|
||||
except Exception as e:
|
||||
log.error(
|
||||
f"Failed to get determine workflow type. Falling back to meta runners. Exception: {e}"
|
||||
)
|
||||
return WORKFLOW_LABEL_META
|
||||
data = yaml.safe_load(yaml_text)
|
||||
return data
|
||||
except yaml.YAMLError as exc:
|
||||
log.exception("Error loading YAML")
|
||||
raise
|
||||
|
||||
|
||||
def get_optin_feature(
|
||||
rollout_state: str, workflow_requestors: Iterable[str], feature: str, fallback: str
|
||||
def extract_settings_user_opt_in_from_text(rollout_state: str) -> Tuple[str, str]:
|
||||
"""
|
||||
Extracts the text with settings, if any, and the opted in users from the rollout state.
|
||||
|
||||
If the issue body contains "---" then the text above that is the settings
|
||||
and the text below is the list of opted in users.
|
||||
|
||||
If it doesn't contain "---" then the settings are empty and the rest is the users.
|
||||
"""
|
||||
rollout_state_parts = rollout_state.split("---")
|
||||
if len(rollout_state_parts) >= 2:
|
||||
return rollout_state_parts[0], rollout_state_parts[1]
|
||||
else:
|
||||
return "", rollout_state
|
||||
|
||||
|
||||
class UserOptins(Dict[str, List[str]]):
|
||||
"""
|
||||
Dictionary of users with a list of features they have opted into
|
||||
"""
|
||||
|
||||
|
||||
def parse_user_opt_in_from_text(user_optin_text: str) -> UserOptins:
|
||||
"""
|
||||
Parse the user opt-in text into a key value pair of username and the list of features they have opted into
|
||||
|
||||
Users are GitHub usernames with the @ prefix. Each user is also a comma-separated list of features/experiments to enable.
|
||||
- Example line: "@User1,lf,split_build"
|
||||
- A "#" prefix indicates the user is opted out of all experiments
|
||||
|
||||
|
||||
"""
|
||||
optins = UserOptins()
|
||||
for user in user_optin_text.split("\n"):
|
||||
user = user.strip("\r\n\t -")
|
||||
if not user or not user.startswith("@"):
|
||||
# Not a valid user. Skip
|
||||
continue
|
||||
|
||||
if user:
|
||||
usr_name = user.split(",")[0].strip("@")
|
||||
optins[usr_name] = [exp.strip(" ") for exp in user.split(",")[1:]]
|
||||
|
||||
return optins
|
||||
|
||||
|
||||
def parse_settings_from_text(settings_text: str) -> Settings:
|
||||
"""
|
||||
Parse the experiments from the issue body into a list of ExperimentSettings
|
||||
"""
|
||||
try:
|
||||
if settings_text:
|
||||
# Escape the backtick as well so that we can have the settings in a code block on the GH issue
|
||||
# for easy reading
|
||||
# Note: Using ascii for the backtick so that the cat step in _runner-determinator.yml doesn't choke on
|
||||
# the backtick character in shell commands.
|
||||
backtick = chr(96) # backtick character
|
||||
settings_text = settings_text.strip(f"\r\n\t{backtick} ")
|
||||
settings = load_yaml(settings_text)
|
||||
|
||||
# For now we just load experiments. We can expand this if/when we add more settings
|
||||
experiments = {}
|
||||
|
||||
for exp_name, exp_settings in settings.get(SETTING_EXPERIMENTS).items():
|
||||
valid_settings = {}
|
||||
for setting in exp_settings:
|
||||
if setting not in Experiment._fields:
|
||||
log.warning(
|
||||
f"Unexpected setting in experiment: {setting} = {exp_settings[setting]}"
|
||||
)
|
||||
else:
|
||||
valid_settings[setting] = exp_settings[setting]
|
||||
|
||||
experiments[exp_name] = Experiment(**valid_settings)
|
||||
return Settings(experiments)
|
||||
|
||||
except Exception:
|
||||
log.exception("Failed to parse settings")
|
||||
|
||||
return Settings()
|
||||
|
||||
|
||||
def parse_settings(rollout_state: str) -> Settings:
|
||||
"""
|
||||
Parse settings, if any, from the rollout state.
|
||||
|
||||
If the issue body contains "---" then the text above that is the settings
|
||||
and the text below is the list of opted in users.
|
||||
|
||||
If it doesn't contain "---" then the settings are empty and the default values are used.
|
||||
"""
|
||||
settings_text, _ = extract_settings_user_opt_in_from_text(rollout_state)
|
||||
return parse_settings_from_text(settings_text)
|
||||
|
||||
|
||||
def parse_users(rollout_state: str) -> UserOptins:
|
||||
"""
|
||||
Parse users from the rollout state.
|
||||
|
||||
"""
|
||||
_, users_text = extract_settings_user_opt_in_from_text(rollout_state)
|
||||
return parse_user_opt_in_from_text(users_text)
|
||||
|
||||
|
||||
def is_user_opted_in(user: str, user_optins: UserOptins, experiment_name: str) -> bool:
|
||||
"""
|
||||
Check if a user is opted into an experiment
|
||||
"""
|
||||
return experiment_name in user_optins.get(user, [])
|
||||
|
||||
|
||||
def get_runner_prefix(
|
||||
rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False
|
||||
) -> str:
|
||||
"""
|
||||
Used to dynamically opt in jobs to specific runner-type variants.
|
||||
settings = parse_settings(rollout_state)
|
||||
user_optins = parse_users(rollout_state)
|
||||
|
||||
Returns:
|
||||
The runner-type's variant name if the user has opted in to the feature, otherwise returns an empty string.
|
||||
This variant name is prefixed to the runner-type in the label.
|
||||
"""
|
||||
try:
|
||||
userlist = {u.lstrip("#").strip("\n\t@ ") for u in rollout_state.split()}
|
||||
all_opted_in_users = set()
|
||||
for user in userlist:
|
||||
for i in user.split(","):
|
||||
if i == feature:
|
||||
all_opted_in_users.add(user.split(",")[0])
|
||||
opted_in_requestors = {
|
||||
usr for usr in workflow_requestors if usr in all_opted_in_users
|
||||
}
|
||||
fleet_prefix = ""
|
||||
prefixes = []
|
||||
for experiment_name, experiment_settings in settings.experiments.items():
|
||||
enabled = False
|
||||
|
||||
if opted_in_requestors:
|
||||
# Is any workflow_requestor opted in to this experiment?
|
||||
opted_in_users = [
|
||||
requestor
|
||||
for requestor in workflow_requestors
|
||||
if is_user_opted_in(requestor, user_optins, experiment_name)
|
||||
]
|
||||
|
||||
if opted_in_users:
|
||||
log.info(
|
||||
f"Feature {feature} is enabled for {', '.join(opted_in_requestors)}. Using feature {feature}."
|
||||
f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
|
||||
)
|
||||
return feature
|
||||
else:
|
||||
log.info(
|
||||
f"Feature {feature} is disabled for {', '.join(workflow_requestors)}. Using fallback \"{fallback}\"."
|
||||
)
|
||||
return fallback
|
||||
enabled = True
|
||||
elif experiment_settings.rollout_perc:
|
||||
# If no user is opted in, then we randomly enable the experiment based on the rollout percentage
|
||||
if random.uniform(0, 100) <= experiment_settings.rollout_perc:
|
||||
log.info(
|
||||
f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}."
|
||||
)
|
||||
enabled = True
|
||||
|
||||
except Exception as e:
|
||||
if enabled:
|
||||
label = experiment_name
|
||||
if experiment_name == LF_FLEET_EXPERIMENT:
|
||||
# We give some special treatment to the "lf" experiment since determines the fleet we use
|
||||
# - If it's enabled, then we always list it's prefix first
|
||||
# - If we're in the canary branch, then we append ".c" to the lf prefix
|
||||
if is_canary:
|
||||
label += CANARY_FLEET_SUFFIX
|
||||
fleet_prefix = label
|
||||
else:
|
||||
prefixes.append(label)
|
||||
|
||||
if len(prefixes) > 1:
|
||||
log.error(
|
||||
f'Failed to determine if user has opted-in to feature {feature}. Using fallback "{fallback}". Exception: {e}'
|
||||
f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}"
|
||||
)
|
||||
return fallback
|
||||
prefixes = prefixes[:1]
|
||||
|
||||
# Fleet always comes first
|
||||
if fleet_prefix:
|
||||
prefixes.insert(0, fleet_prefix)
|
||||
|
||||
return ".".join(prefixes) + "." if prefixes else ""
|
||||
|
||||
|
||||
def get_rollout_state_from_issue(github_token: str, repo: str, issue_num: int) -> str:
|
||||
@ -268,9 +408,10 @@ def main() -> None:
|
||||
args = parse_args()
|
||||
|
||||
if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
|
||||
log.info(f"Exception branch: '{args.github_branch}', using meta runners")
|
||||
label_type = WORKFLOW_LABEL_META
|
||||
runner_ami = RUNNER_AMI_LEGACY
|
||||
log.info(
|
||||
f"Exception branch: '{args.github_branch}', using Meta runners and no experiments."
|
||||
)
|
||||
runner_label_prefix = DEFAULT_LABEL_PREFIX
|
||||
else:
|
||||
try:
|
||||
rollout_state = get_rollout_state_from_issue(
|
||||
@ -285,35 +426,18 @@ def main() -> None:
|
||||
args.github_branch,
|
||||
)
|
||||
|
||||
label_type = get_fleet(
|
||||
rollout_state,
|
||||
(
|
||||
args.github_issue_owner,
|
||||
username,
|
||||
),
|
||||
)
|
||||
runner_ami = get_optin_feature(
|
||||
rollout_state=rollout_state,
|
||||
workflow_requestors=(
|
||||
args.github_issue_owner,
|
||||
username,
|
||||
),
|
||||
feature=RUNNER_AMI_AMZ2023,
|
||||
fallback=RUNNER_AMI_LEGACY,
|
||||
is_canary = args.github_repo == "pytorch/pytorch-canary"
|
||||
|
||||
runner_label_prefix = get_runner_prefix(
|
||||
rollout_state, (args.github_issue_owner, username), is_canary
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
log.error(
|
||||
f"Failed to get issue. Falling back to meta runners. Exception: {e}"
|
||||
f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
|
||||
)
|
||||
label_type = WORKFLOW_LABEL_META
|
||||
runner_ami = RUNNER_AMI_LEGACY
|
||||
|
||||
# For Canary builds use canary runners
|
||||
if args.github_repo == "pytorch/pytorch-canary" and label_type == WORKFLOW_LABEL_LF:
|
||||
label_type = WORKFLOW_LABEL_LF_CANARY
|
||||
|
||||
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
|
||||
set_github_output(GH_OUTPUT_KEY_AMI, runner_ami)
|
||||
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@@ -51,6 +51,8 @@ def main() -> None:
 
     for platform_image in platform_images:  # type: ignore[attr-defined]
         for arch in platform_image.keys():  # type: ignore[attr-defined]
+            if arch == "cpu-s390x":
+                continue
             tag_image(
                 platform_image[arch],  # type: ignore[index]
                 default_tag,
.github/scripts/test_runner_determinator.py (vendored, new file, 237 lines)
@ -0,0 +1,237 @@
|
||||
from unittest import main, TestCase
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import runner_determinator as rd
|
||||
|
||||
|
||||
class TestRunnerDeterminatorIssueParser(TestCase):
|
||||
def test_parse_settings(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 25
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
|
||||
settings = rd.parse_settings(settings_text)
|
||||
|
||||
self.assertTupleEqual(
|
||||
rd.Experiment(rollout_perc=25),
|
||||
settings.experiments["lf"],
|
||||
"lf settings not parsed correctly",
|
||||
)
|
||||
self.assertTupleEqual(
|
||||
rd.Experiment(rollout_perc=0),
|
||||
settings.experiments["otherExp"],
|
||||
"otherExp settings not parsed correctly",
|
||||
)
|
||||
|
||||
def test_parse_settings_in_code_block(self) -> None:
|
||||
settings_text = """
|
||||
|
||||
```
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 25
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
|
||||
settings = rd.parse_settings(settings_text)
|
||||
|
||||
self.assertTupleEqual(
|
||||
rd.Experiment(rollout_perc=25),
|
||||
settings.experiments["lf"],
|
||||
"lf settings not parsed correctly",
|
||||
)
|
||||
self.assertTupleEqual(
|
||||
rd.Experiment(rollout_perc=0),
|
||||
settings.experiments["otherExp"],
|
||||
"otherExp settings not parsed correctly",
|
||||
)
|
||||
|
||||
def test_parse_users(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 0
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
|
||||
users = rd.parse_users(settings_text)
|
||||
self.assertDictEqual(
|
||||
{"User1": ["lf"], "User2": ["lf", "otherExp"]},
|
||||
users,
|
||||
"Users not parsed correctly",
|
||||
)
|
||||
|
||||
def test_parse_users_without_settings(self) -> None:
|
||||
settings_text = """
|
||||
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
|
||||
users = rd.parse_users(settings_text)
|
||||
self.assertDictEqual(
|
||||
{"User1": ["lf"], "User2": ["lf", "otherExp"]},
|
||||
users,
|
||||
"Users not parsed correctly",
|
||||
)
|
||||
|
||||
|
||||
class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
|
||||
def test_opted_in_user(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 0
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User1"])
|
||||
self.assertEqual("lf.", prefix, "Runner prefix not correct for User1")
|
||||
|
||||
def test_opted_in_user_two_experiments(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 0
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User2"])
|
||||
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for User2")
|
||||
|
||||
@patch("random.uniform", return_value=50)
|
||||
def test_opted_out_user(self, mock_uniform: Mock) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 25
|
||||
otherExp:
|
||||
rollout_perc: 25
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User3"])
|
||||
self.assertEqual("", prefix, "Runner prefix not correct for user")
|
||||
|
||||
@patch("random.uniform", return_value=10)
|
||||
def test_opted_out_user_was_pulled_in_by_rollout(self, mock_uniform: Mock) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 25
|
||||
otherExp:
|
||||
rollout_perc: 25
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
|
||||
# User3 is opted out, but is pulled into both experiments by the 10% rollout
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User3"])
|
||||
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
|
||||
|
||||
def test_lf_prefix_always_comes_first(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
lf:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf
|
||||
@User2,otherExp,lf
|
||||
|
||||
"""
|
||||
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User2"])
|
||||
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
|
||||
|
||||
def test_ignores_commented_users(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 0
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
#@User1,lf
|
||||
@User2,lf,otherExp
|
||||
|
||||
"""
|
||||
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User1"])
|
||||
self.assertEqual("", prefix, "Runner prefix not correct for user")
|
||||
|
||||
def test_ignores_extra_experiments(self) -> None:
|
||||
settings_text = """
|
||||
experiments:
|
||||
lf:
|
||||
rollout_perc: 0
|
||||
otherExp:
|
||||
rollout_perc: 0
|
||||
foo:
|
||||
rollout_perc: 0
|
||||
---
|
||||
|
||||
Users:
|
||||
@User1,lf,otherExp,foo
|
||||
|
||||
"""
|
||||
|
||||
prefix = rd.get_runner_prefix(settings_text, ["User1"])
|
||||
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
.github/scripts/trymerge.py (vendored, 48 lines changed)
@@ -36,6 +36,7 @@ from warnings import warn
 
 import yaml
 from github_utils import (
+    gh_close_pr,
     gh_fetch_json_list,
     gh_fetch_merge_base,
     gh_fetch_url,
@ -1174,11 +1175,11 @@ class GitHubPR:
|
||||
for pr in additional_merged_prs:
|
||||
pr.add_numbered_label(MERGE_COMPLETE_LABEL, dry_run)
|
||||
|
||||
if comment_id and self.pr_num:
|
||||
# When the merge process reaches this part, we can assume that the commit
|
||||
# has been successfully pushed to trunk
|
||||
merge_commit_sha = repo.rev_parse(name=REMOTE_MAIN_BRANCH)
|
||||
# When the merge process reaches this part, we can assume that the commit
|
||||
# has been successfully pushed to trunk
|
||||
merge_commit_sha = repo.rev_parse(name=self.default_branch())
|
||||
|
||||
if comment_id and self.pr_num:
|
||||
# Finally, upload the record to Rockset. The list of pending and failed
|
||||
# checks are at the time of the merge
|
||||
save_merge_record(
|
||||
@ -1203,6 +1204,17 @@ class GitHubPR:
|
||||
else:
|
||||
print("Missing comment ID or PR number, couldn't upload to Rockset")
|
||||
|
||||
# Usually Github will see that the commit has "resolves <pr_num>" in the
|
||||
# commit message and close the PR, but sometimes it doesn't, leading to
|
||||
# confusion. When it doesn't, we close it manually.
|
||||
time.sleep(60) # Give Github some time to close the PR
|
||||
manually_close_merged_pr(
|
||||
pr=self,
|
||||
additional_merged_prs=additional_merged_prs,
|
||||
merge_commit_sha=merge_commit_sha,
|
||||
dry_run=dry_run,
|
||||
)
|
||||
|
||||
def merge_changes(
|
||||
self,
|
||||
repo: GitRepo,
|
||||
@@ -1503,6 +1515,34 @@ def checks_to_markdown_bullets(
     ]
 
 
+def manually_close_merged_pr(
+    pr: GitHubPR,
+    additional_merged_prs: List[GitHubPR],
+    merge_commit_sha: str,
+    dry_run: bool,
+) -> None:
+    def _comment_and_close(pr: GitHubPR, comment: str) -> None:
+        pr = GitHubPR(pr.org, pr.project, pr.pr_num)  # Refresh the PR
+        if not pr.is_closed():
+            gh_post_pr_comment(pr.org, pr.project, pr.pr_num, comment, dry_run)
+            gh_close_pr(pr.org, pr.project, pr.pr_num, dry_run)
+
+    message = (
+        f"This PR (#{pr.pr_num}) was merged in {merge_commit_sha} but it is still open, likely due to a Github bug, "
+        "so mergebot is closing it manually. If you think this is a mistake, please feel free to reopen and contact Dev Infra."
+    )
+    _comment_and_close(pr, message)
+    for additional_pr in additional_merged_prs:
+        message = (
+            f"This PR (#{additional_pr.pr_num}) was merged as part of PR #{pr.pr_num} in the stack under {merge_commit_sha} "
+            "but it is still open, likely due to a Github bug, so mergebot is closing it manually. "
+            "If you think this is a mistake, please feel free to reopen and contact Dev Infra."
+        )
+        _comment_and_close(additional_pr, message)
+
+    print(f"PR {pr.pr_num} and all additional PRs in the stack have been closed.")
+
+
 @retries_decorator()
 def save_merge_record(
     comment_id: int,
.github/workflows/_runner-determinator.yml (vendored, 350 lines changed)
@ -62,49 +62,94 @@ jobs:
|
||||
"""
|
||||
This runner determinator is used to determine which set of runners to run a
|
||||
GitHub job on. It uses the first comment of a GitHub issue (by default
|
||||
https://github.com/pytorch/test-infra/issues/5132) as a user list to determine
|
||||
which users will get their jobs to run on experimental runners. This user list
|
||||
is also a comma separated list of additional features or experiments which the
|
||||
user could be opted in to.
|
||||
https://github.com/pytorch/test-infra/issues/5132) to define the configuration
|
||||
of which runners should be used to run which job.
|
||||
|
||||
The configuration has two parts, the settings and a list of opted-in users,
|
||||
separated by a line containing "---". If the line is not present, the
|
||||
settings are considered to be empty with only the second part, the user
|
||||
list, defined.
|
||||
|
||||
The first part is a YAML block that defines the rollout settings. This can be
|
||||
used to define any settings that are needed to determine which runners to use.
|
||||
It's fields are defined by the RolloutSettings class below.
|
||||
|
||||
The second part is a list of users who are explicitly opted in to the LF fleet.
|
||||
The user list is also a comma separated list of additional features or
|
||||
experiments which the user could be opted in to.
|
||||
|
||||
The user list has the following rules:
|
||||
|
||||
- Users are GitHub usernames with the @ prefix
|
||||
- If the first line is a "*" then all users will use the new runners
|
||||
- If the first line is a "!" then all users will use the old runners
|
||||
- Users are GitHub usernames, which must start with the @ prefix
|
||||
- Each user is also a comma-separated list of features/experiments to enable
|
||||
- A "#" prefix indicates the user is opted out of the new runners but is opting
|
||||
into features/experiments.
|
||||
- A "#" prefix opts the user out of all experiments
|
||||
|
||||
Example user list:
|
||||
Example config:
|
||||
# A list of experiments that can be opted into.
|
||||
# This defines the behavior they'll induce when opted into.
|
||||
# Expected syntax is:
|
||||
# [experiment_name]: # Name of the experiment. Also used for the label prefix.
|
||||
# rollout_perc: [int] # % of workflows to run with this experiment when users are not opted in.
|
||||
|
||||
@User1
|
||||
@User2,amz2023
|
||||
#@UserOptOutOfNewRunner,amz2023
|
||||
experiments:
|
||||
lf:
|
||||
rollout_percent: 25
|
||||
|
||||
---
|
||||
|
||||
# Opt-ins:
|
||||
# Users can opt into the LF fleet by adding their GitHub username to this list
|
||||
# and specifying experiments to enable in a comma-separated list.
|
||||
# Experiments should be from the above list.
|
||||
|
||||
@User1,lf,split_build
|
||||
@User2,lf
|
||||
@User3,split_build
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from argparse import ArgumentParser
|
||||
from logging import LogRecord
|
||||
from typing import Any, Iterable
|
||||
from typing import Any, Dict, Iterable, List, NamedTuple, Tuple
|
||||
|
||||
import yaml
|
||||
from github import Auth, Github
|
||||
from github.Issue import Issue
|
||||
|
||||
|
||||
WORKFLOW_LABEL_META = "" # use meta runners
|
||||
DEFAULT_LABEL_PREFIX = "" # use meta runners
|
||||
WORKFLOW_LABEL_LF = "lf." # use runners from the linux foundation
|
||||
WORKFLOW_LABEL_LF_CANARY = "lf.c." # use canary runners from the linux foundation
|
||||
|
||||
RUNNER_AMI_LEGACY = ""
|
||||
RUNNER_AMI_AMZ2023 = "amz2023"
|
||||
|
||||
GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
|
||||
GH_OUTPUT_KEY_AMI = "runner-ami"
|
||||
GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
|
||||
|
||||
|
||||
SETTING_EXPERIMENTS = "experiments"
|
||||
|
||||
LF_FLEET_EXPERIMENT = "lf"
|
||||
CANARY_FLEET_SUFFIX = ".c"
|
||||
|
||||
|
||||
class Experiment(NamedTuple):
|
||||
rollout_perc: float = (
|
||||
0 # Percentage of workflows to experiment on when user is not opted-in.
|
||||
)
|
||||
|
||||
# Add more fields as needed
|
||||
|
||||
|
||||
class Settings(NamedTuple):
|
||||
"""
|
||||
Settings for the experiments that can be opted into.
|
||||
"""
|
||||
|
||||
experiments: Dict[str, Experiment] = {}
|
||||
|
||||
|
||||
class ColorFormatter(logging.Formatter):
|
||||
"""Color codes the log messages based on the log level"""
|
||||
|
||||
@ -231,85 +276,180 @@ jobs:
|
||||
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
|
||||
|
||||
|
||||
def get_fleet(rollout_state: str, workflow_requestors: Iterable[str]) -> str:
|
||||
"""
|
||||
Determines if the job should run on the LF fleet or the Meta fleet
|
||||
|
||||
Returns:
|
||||
The appropriate label prefix for the runner, corresponding to the fleet to use.
|
||||
This gets prefixed to the very start of the runner label.
|
||||
"""
|
||||
|
||||
def load_yaml(yaml_text: str) -> Any:
|
||||
try:
|
||||
if rollout_state[0] == "!":
|
||||
log.info("LF Workflows are disabled for everyone. Using meta runners.")
|
||||
return WORKFLOW_LABEL_META
|
||||
elif rollout_state[0] == "*":
|
||||
log.info("LF Workflows are enabled for everyone. Using LF runners.")
|
||||
return WORKFLOW_LABEL_LF
|
||||
else:
|
||||
all_opted_in_users = {
|
||||
usr_raw.strip("\n\t@ ").split(",")[0]
|
||||
for usr_raw in rollout_state.split()
|
||||
}
|
||||
opted_in_requestors = {
|
||||
usr for usr in workflow_requestors if usr in all_opted_in_users
|
||||
}
|
||||
if opted_in_requestors:
|
||||
log.info(
|
||||
f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
|
||||
)
|
||||
return WORKFLOW_LABEL_LF
|
||||
else:
|
||||
log.info(
|
||||
f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners."
|
||||
)
|
||||
return WORKFLOW_LABEL_META
|
||||
|
||||
except Exception as e:
|
||||
log.error(
|
||||
f"Failed to get determine workflow type. Falling back to meta runners. Exception: {e}"
|
||||
)
|
||||
return WORKFLOW_LABEL_META
|
||||
data = yaml.safe_load(yaml_text)
|
||||
return data
|
||||
except yaml.YAMLError as exc:
|
||||
log.exception("Error loading YAML")
|
||||
raise
|
||||
|
||||
|
||||
def get_optin_feature(
|
||||
rollout_state: str, workflow_requestors: Iterable[str], feature: str, fallback: str
|
||||
def extract_settings_user_opt_in_from_text(rollout_state: str) -> Tuple[str, str]:
|
||||
"""
|
||||
Extracts the text with settings, if any, and the opted in users from the rollout state.
|
||||
|
||||
If the issue body contains "---" then the text above that is the settings
|
||||
and the text below is the list of opted in users.
|
||||
|
||||
If it doesn't contain "---" then the settings are empty and the rest is the users.
|
||||
"""
|
||||
rollout_state_parts = rollout_state.split("---")
|
||||
if len(rollout_state_parts) >= 2:
|
||||
return rollout_state_parts[0], rollout_state_parts[1]
|
||||
else:
|
||||
return "", rollout_state
|
||||
|
||||
|
||||
class UserOptins(Dict[str, List[str]]):
|
||||
"""
|
||||
Dictionary of users with a list of features they have opted into
|
||||
"""
|
||||
|
||||
|
||||
def parse_user_opt_in_from_text(user_optin_text: str) -> UserOptins:
|
||||
"""
|
||||
Parse the user opt-in text into a key value pair of username and the list of features they have opted into
|
||||
|
||||
Users are GitHub usernames with the @ prefix. Each user is also a comma-separated list of features/experiments to enable.
|
||||
- Example line: "@User1,lf,split_build"
|
||||
- A "#" prefix indicates the user is opted out of all experiments
|
||||
|
||||
|
||||
"""
|
||||
optins = UserOptins()
|
||||
for user in user_optin_text.split("\n"):
|
||||
user = user.strip("\r\n\t -")
|
||||
if not user or not user.startswith("@"):
|
||||
# Not a valid user. Skip
|
||||
continue
|
||||
|
||||
if user:
|
||||
usr_name = user.split(",")[0].strip("@")
|
||||
optins[usr_name] = [exp.strip(" ") for exp in user.split(",")[1:]]
|
||||
|
||||
return optins
|
||||
|
||||
|
||||
def parse_settings_from_text(settings_text: str) -> Settings:
|
||||
"""
|
||||
Parse the experiments from the issue body into a list of ExperimentSettings
|
||||
"""
|
||||
try:
|
||||
if settings_text:
|
||||
# Escape the backtick as well so that we can have the settings in a code block on the GH issue
|
||||
# for easy reading
|
||||
# Note: Using ascii for the backtick so that the cat step in _runner-determinator.yml doesn't choke on
|
||||
# the backtick character in shell commands.
|
||||
backtick = chr(96) # backtick character
|
||||
settings_text = settings_text.strip(f"\r\n\t{backtick} ")
|
||||
settings = load_yaml(settings_text)
|
||||
|
||||
# For now we just load experiments. We can expand this if/when we add more settings
|
||||
experiments = {}
|
||||
|
||||
for exp_name, exp_settings in settings.get(SETTING_EXPERIMENTS).items():
|
||||
valid_settings = {}
|
||||
for setting in exp_settings:
|
||||
if setting not in Experiment._fields:
|
||||
log.warning(
|
||||
f"Unexpected setting in experiment: {setting} = {exp_settings[setting]}"
|
||||
)
|
||||
else:
|
||||
valid_settings[setting] = exp_settings[setting]
|
||||
|
||||
experiments[exp_name] = Experiment(**valid_settings)
|
||||
return Settings(experiments)
|
||||
|
||||
except Exception:
|
||||
log.exception("Failed to parse settings")
|
||||
|
||||
return Settings()
|
||||
|
||||
|
||||
def parse_settings(rollout_state: str) -> Settings:
|
||||
"""
|
||||
Parse settings, if any, from the rollout state.
|
||||
|
||||
If the issue body contains "---" then the text above that is the settings
|
||||
and the text below is the list of opted in users.
|
||||
|
||||
If it doesn't contain "---" then the settings are empty and the default values are used.
|
||||
"""
|
||||
settings_text, _ = extract_settings_user_opt_in_from_text(rollout_state)
|
||||
return parse_settings_from_text(settings_text)
|
||||
|
||||
|
||||
def parse_users(rollout_state: str) -> UserOptins:
|
||||
"""
|
||||
Parse users from the rollout state.
|
||||
|
||||
"""
|
||||
_, users_text = extract_settings_user_opt_in_from_text(rollout_state)
|
||||
return parse_user_opt_in_from_text(users_text)
|
||||
|
||||
|
||||
def is_user_opted_in(user: str, user_optins: UserOptins, experiment_name: str) -> bool:
|
||||
"""
|
||||
Check if a user is opted into an experiment
|
||||
"""
|
||||
return experiment_name in user_optins.get(user, [])
|
||||
|
||||
|
||||
def get_runner_prefix(
|
||||
rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False
|
||||
) -> str:
|
||||
"""
|
||||
Used to dynamically opt in jobs to specific runner-type variants.
|
||||
settings = parse_settings(rollout_state)
|
||||
user_optins = parse_users(rollout_state)
|
||||
|
||||
Returns:
|
||||
The runner-type's variant name if the user has opted in to the feature, otherwise returns an empty string.
|
||||
This variant name is prefixed to the runner-type in the label.
|
||||
"""
|
||||
try:
|
||||
userlist = {u.lstrip("#").strip("\n\t@ ") for u in rollout_state.split()}
|
||||
all_opted_in_users = set()
|
||||
for user in userlist:
|
||||
for i in user.split(","):
|
||||
if i == feature:
|
||||
all_opted_in_users.add(user.split(",")[0])
|
||||
opted_in_requestors = {
|
||||
usr for usr in workflow_requestors if usr in all_opted_in_users
|
||||
}
|
||||
fleet_prefix = ""
|
||||
prefixes = []
|
||||
for experiment_name, experiment_settings in settings.experiments.items():
|
||||
enabled = False
|
||||
|
||||
if opted_in_requestors:
|
||||
# Is any workflow_requestor opted in to this experiment?
|
||||
opted_in_users = [
|
||||
requestor
|
||||
for requestor in workflow_requestors
|
||||
if is_user_opted_in(requestor, user_optins, experiment_name)
|
||||
]
|
||||
|
||||
if opted_in_users:
|
||||
log.info(
|
||||
f"Feature {feature} is enabled for {', '.join(opted_in_requestors)}. Using feature {feature}."
|
||||
f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
|
||||
)
|
||||
return feature
|
||||
else:
|
||||
log.info(
|
||||
f"Feature {feature} is disabled for {', '.join(workflow_requestors)}. Using fallback \"{fallback}\"."
|
||||
)
|
||||
return fallback
|
||||
enabled = True
|
||||
elif experiment_settings.rollout_perc:
|
||||
# If no user is opted in, then we randomly enable the experiment based on the rollout percentage
|
||||
if random.uniform(0, 100) <= experiment_settings.rollout_perc:
|
||||
log.info(
|
||||
f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}."
|
||||
)
|
||||
enabled = True
|
||||
|
||||
except Exception as e:
|
||||
if enabled:
|
||||
label = experiment_name
|
||||
if experiment_name == LF_FLEET_EXPERIMENT:
|
||||
# We give some special treatment to the "lf" experiment since determines the fleet we use
|
||||
# - If it's enabled, then we always list it's prefix first
|
||||
# - If we're in the canary branch, then we append ".c" to the lf prefix
|
||||
if is_canary:
|
||||
label += CANARY_FLEET_SUFFIX
|
||||
fleet_prefix = label
|
||||
else:
|
||||
prefixes.append(label)
|
||||
|
||||
if len(prefixes) > 1:
|
||||
log.error(
|
||||
f'Failed to determine if user has opted-in to feature {feature}. Using fallback "{fallback}". Exception: {e}'
|
||||
f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}"
|
||||
)
|
||||
return fallback
|
||||
prefixes = prefixes[:1]
|
||||
|
||||
# Fleet always comes first
|
||||
if fleet_prefix:
|
||||
prefixes.insert(0, fleet_prefix)
|
||||
|
||||
return ".".join(prefixes) + "." if prefixes else ""
|
||||
|
||||
|
||||
def get_rollout_state_from_issue(github_token: str, repo: str, issue_num: int) -> str:
|
||||
@ -327,9 +467,10 @@ jobs:
|
||||
args = parse_args()
|
||||
|
||||
if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
|
||||
log.info(f"Exception branch: '{args.github_branch}', using meta runners")
|
||||
label_type = WORKFLOW_LABEL_META
|
||||
runner_ami = RUNNER_AMI_LEGACY
|
||||
log.info(
|
||||
f"Exception branch: '{args.github_branch}', using Meta runners and no experiments."
|
||||
)
|
||||
runner_label_prefix = DEFAULT_LABEL_PREFIX
|
||||
else:
|
||||
try:
|
||||
rollout_state = get_rollout_state_from_issue(
|
||||
@ -344,35 +485,18 @@ jobs:
|
||||
args.github_branch,
|
||||
)
|
||||
|
||||
label_type = get_fleet(
|
||||
rollout_state,
|
||||
(
|
||||
args.github_issue_owner,
|
||||
username,
|
||||
),
|
||||
)
|
||||
runner_ami = get_optin_feature(
|
||||
rollout_state=rollout_state,
|
||||
workflow_requestors=(
|
||||
args.github_issue_owner,
|
||||
username,
|
||||
),
|
||||
feature=RUNNER_AMI_AMZ2023,
|
||||
fallback=RUNNER_AMI_LEGACY,
|
||||
is_canary = args.github_repo == "pytorch/pytorch-canary"
|
||||
|
||||
runner_label_prefix = get_runner_prefix(
|
||||
rollout_state, (args.github_issue_owner, username), is_canary
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
log.error(
|
||||
f"Failed to get issue. Falling back to meta runners. Exception: {e}"
|
||||
f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
|
||||
)
|
||||
label_type = WORKFLOW_LABEL_META
|
||||
runner_ami = RUNNER_AMI_LEGACY
|
||||
|
||||
# For Canary builds use canary runners
|
||||
if args.github_repo == "pytorch/pytorch-canary" and label_type == WORKFLOW_LABEL_LF:
|
||||
label_type = WORKFLOW_LABEL_LF_CANARY
|
||||
|
||||
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
|
||||
set_github_output(GH_OUTPUT_KEY_AMI, runner_ami)
|
||||
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
.github/workflows/build-libtorch-images.yml (vendored, 18 lines changed)
@@ -29,9 +29,19 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  get-label-type:
+    name: get-label-type
+    uses: ./.github/workflows/_runner-determinator.yml
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
   build-docker-cuda:
     environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    runs-on: linux.9xlarge.ephemeral
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
     strategy:
       matrix:
         cuda_version: ["12.4", "12.1", "11.8"]
@@ -66,7 +76,8 @@ jobs:
           .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cuda${{matrix.cuda_version}}
   build-docker-rocm:
     environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    runs-on: linux.9xlarge.ephemeral
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
     strategy:
       matrix:
         rocm_version: ["6.1", "6.2"]
@@ -101,7 +112,8 @@ jobs:
           .ci/docker/libtorch/build.sh libtorch-cxx11-builder:rocm${{matrix.rocm_version}}
   build-docker-cpu:
     environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    runs-on: linux.9xlarge.ephemeral
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
     steps:
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
.github/workflows/build-manywheel-images.yml (vendored, 39 lines changed)
@ -33,9 +33,19 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
build-docker-cuda:
|
||||
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
|
||||
runs-on: am2.linux.9xlarge.ephemeral
|
||||
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.4", "12.1", "11.8"]
@@ -73,7 +83,8 @@ jobs:
# NOTE: manylinux_2_28 are still experimental, see https://github.com/pytorch/pytorch/issues/123649
build-docker-cuda-manylinux_2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.4", "12.1", "11.8"]
@@ -110,7 +121,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux2_28-builder:cuda${{matrix.cuda_version}}
build-docker-cuda-aarch64:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.arm64.2xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.4"]
@@ -143,7 +155,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}}
build-docker-rocm:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: am2.linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
strategy:
matrix:
rocm_version: ["6.1", "6.2"]
@@ -178,7 +191,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux-builder:rocm${{matrix.rocm_version}}
build-docker-cpu:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: am2.linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
@@ -207,7 +221,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux-builder:cpu
build-docker-cpu-manylinux_2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-manylinux_2_28
steps:
@@ -238,7 +253,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux2_28-builder:cpu
build-docker-cpu-aarch64:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.arm64.2xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-aarch64
steps:
@@ -269,7 +285,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinuxaarch64-builder:cpu-aarch64
build-docker-cpu-aarch64-2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.arm64.2xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-aarch64-2_28
steps:
@@ -303,7 +320,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinux2_28_aarch64-builder:cpu-aarch64
build-docker-cpu-cxx11-abi:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-cxx11-abi
steps:
@@ -334,7 +352,8 @@ jobs:
.ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi
build-docker-xpu:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.9xlarge.ephemeral
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
env:
GPU_ARCH_TYPE: xpu
steps:

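Every docker image build job above picks up the same two edits: a needs: get-label-type dependency whose output is prepended to runs-on, and the conditional environment expression that enables the docker-build environment only on main or on v* tags. A minimal stand-alone sketch of that expression idiom, with an illustrative job name, might look like this:

on: workflow_dispatch   # minimal trigger for the sketch
jobs:
  build-docker-example:   # illustrative job name, not part of the actual workflow
    # resolves to 'docker-build' on refs/heads/main or v* tags, and to '' (no environment) otherwise
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
    steps:
      - name: Build image
        run: .ci/docker/manywheel/build.sh manylinux-builder:cpu
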
15
.github/workflows/build-triton-wheel.yml
vendored
@@ -27,9 +27,19 @@ concurrency:
cancel-in-progress: true

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

build-wheel:
name: "Build Triton Wheel"
runs-on: [self-hosted, linux.4xlarge]
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
strategy:
fail-fast: false
matrix:
@@ -199,7 +209,8 @@ jobs:

build-conda:
name: "Build Triton Conda"
runs-on: [self-hosted, linux.2xlarge]
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
strategy:
fail-fast: false
matrix:

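The change in build-triton-wheel.yml is the pattern that repeats across this whole diff: a get-label-type job calls the reusable _runner-determinator.yml workflow, and each downstream job swaps its hard-coded [self-hosted, ...] runner for the determinator's label-type output used as a prefix. A minimal sketch of a caller and one consumer, assuming a hypothetical consumer job name and treating the prefix value (possibly empty) as opaque:

on: workflow_dispatch   # minimal trigger for the sketch
jobs:
  get-label-type:
    uses: ./.github/workflows/_runner-determinator.yml
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
  build-example:   # hypothetical consumer job
    needs: get-label-type
    # the prefix chooses which runner fleet the label resolves to; an empty prefix keeps the plain label
    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
    steps:
      - name: Show chosen runner label prefix
        run: echo "prefix was '${{ needs.get-label-type.outputs.label-type }}'"
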
17
.github/workflows/create_release.yml
vendored
@@ -16,6 +16,15 @@ on:
paths: [.github/workflows/create_release.yml]

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

release:
if: ${{ github.repository == 'pytorch/pytorch' }}
name: Create Release
@@ -63,7 +72,7 @@ jobs:
files: ${{env.PT_RELEASE_FILE}}
- name: Upload source distribution to GHA artifacts for release tags
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v4.4.0
with:
name: ${{ env.PT_RELEASE_FILE }}
path: ${{ env.PT_RELEASE_FILE }}
@@ -73,12 +82,14 @@ jobs:

upload_source_code_to_s3:
if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
runs-on: linux.2xlarge
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
environment: sourcecode-upload
name: Upload source code to S3 for release tags
permissions:
id-token: write
needs: release
needs:
- get-label-type
- release
steps:
- uses: actions/download-artifact@v4.1.7
with:

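Note that upload_source_code_to_s3 must list get-label-type in needs: once its runs-on references needs.get-label-type.outputs.label-type; a job can only read outputs of jobs it explicitly depends on, so the scalar needs: release becomes a two-entry list. A condensed sketch of just that dependency shape (steps trimmed):

  upload_source_code_to_s3:
    name: Upload source code to S3 for release tags
    needs:
      - get-label-type
      - release
    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
    steps:
      - uses: actions/download-artifact@v4.1.7
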
12
.github/workflows/docker-builds.yml
vendored
@@ -30,8 +30,18 @@ env:
permissions: read-all

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

docker-build:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
timeout-minutes: 240
strategy:
fail-fast: false
@@ -68,7 +78,7 @@ jobs:
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
runner: linux.arm64.m7g.4xlarge
timeout-minutes: 600
runs-on: [self-hosted, "${{ matrix.runner }}"]
runs-on: "${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}"
env:
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }}
steps:

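In docker-builds.yml the runner comes from the build matrix, so the determinator prefix is concatenated with ${{ matrix.runner }} rather than a fixed label. A sketch of that combination under an assumed two-entry matrix (the first image name is illustrative; the aarch64 entry mirrors the one in the diff):

on: workflow_dispatch   # minimal trigger for the sketch
jobs:
  docker-build:
    needs: get-label-type
    strategy:
      fail-fast: false
      matrix:
        include:
          - docker-image-name: example-x86-image   # illustrative entry
            runner: linux.9xlarge.ephemeral        # assumed label for the sketch
          - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
            runner: linux.arm64.m7g.4xlarge
    runs-on: "${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}"
    env:
      DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }}
    steps:
      - name: Show selected image
        run: echo "building ${{ matrix.docker-image-name }}"
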
18
.github/workflows/docker-release.yml
vendored
@@ -34,9 +34,19 @@ env:
permissions: read-all

jobs:
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

generate-matrix:
if: github.repository_owner == 'pytorch'
runs-on: [self-hosted, linux.large]
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.large"
outputs:
matrix: ${{ steps.generate-matrix.outputs.matrix }}
steps:
@@ -54,10 +64,12 @@ jobs:

build:
if: ${{ github.repository == 'pytorch/pytorch' }}
runs-on: [self-hosted, linux.2xlarge]
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
environment: ${{ (github.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
timeout-minutes: 240
needs: generate-matrix
needs:
- generate-matrix
- get-label-type
strategy:
matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
fail-fast: false

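docker-release.yml layers the same prefix on top of a generated matrix: generate-matrix publishes a JSON matrix as a job output, and build consumes it through fromJson while also depending on get-label-type so the prefix stays in scope. A compressed sketch with a placeholder matrix payload (the real generation step is elided):

on: workflow_dispatch   # minimal trigger for the sketch
jobs:
  generate-matrix:
    needs: get-label-type
    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.large"
    outputs:
      matrix: ${{ steps.generate-matrix.outputs.matrix }}
    steps:
      - id: generate-matrix
        # placeholder payload; the real step generates the image/platform combinations
        run: echo 'matrix={"include":[{"platform":"example"}]}' >> "$GITHUB_OUTPUT"
  build:
    needs:
      - generate-matrix
      - get-label-type
    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
    strategy:
      matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
      fail-fast: false
    steps:
      - run: echo "building for ${{ matrix.platform }}"
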
140
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
@@ -1010,76 +1010,6 @@ jobs:
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda12_1-full-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_1-full-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_10-cuda12_1-full-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_1-full-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_10-cuda12_1-full-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
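Each wheel flavour in this generated file is the same three-job chain: a -build job calling _binary-build-linux.yml, a -test job that needs both the build and get-label-type and forwards runner_prefix plus an explicit runs_on, and an -upload job gated on the test. A compressed sketch for a hypothetical wheel name (standing in for e.g. manywheel-py3_10-cuda12_4), with the with: inputs trimmed to the runner-selection ones:

jobs:
  example-wheel-build:   # hypothetical name
    uses: ./.github/workflows/_binary-build-linux.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: example-wheel
      build_environment: linux-binary-manywheel
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  example-wheel-test:
    uses: ./.github/workflows/_binary-test-linux.yml
    needs:
      - example-wheel-build
      - get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.4xlarge.nvidia.gpu
      build_name: example-wheel
      build_environment: linux-binary-manywheel
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  example-wheel-upload:
    uses: ./.github/workflows/_binary-upload.yml
    needs: example-wheel-test
    with:
      build_name: example-wheel
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
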
manywheel-py3_10-cuda12_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -1766,6 +1696,76 @@ jobs:
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda12_1-full-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_1-full-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_11-cuda12_1-full-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_1-full-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_11-cuda12_1-full-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: False
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_1-full
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda12_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
|
@@ -2,7 +2,7 @@
|
||||
|
||||
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
|
||||
# Generation script: .github/scripts/generate_ci_workflows.py
|
||||
name: linux-binary-manywheel
|
||||
name: linux-binary-manywheel-split
|
||||
|
||||
|
||||
on:
|
||||
@ -19,7 +19,7 @@ env:
|
||||
ANACONDA_USER: pytorch
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
BINARY_ENV_FILE: /tmp/env
|
||||
BUILD_ENVIRONMENT: linux-binary-manywheel
|
||||
BUILD_ENVIRONMENT: linux-binary-manywheel-split
|
||||
BUILDER_ROOT: /builder
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
@ -28,7 +28,7 @@ env:
|
||||
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
SKIP_ALL_TESTS: 0
|
||||
concurrency:
|
||||
group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
group: linux-binary-manywheel-split-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
@ -58,7 +58,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -81,7 +81,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -105,7 +105,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -128,7 +128,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -152,7 +152,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -175,7 +175,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
@@ -2,7 +2,7 @@
|
||||
|
||||
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
|
||||
# Generation script: .github/scripts/generate_ci_workflows.py
|
||||
name: linux-binary-manywheel
|
||||
name: linux-binary-manywheel-split
|
||||
|
||||
|
||||
on:
|
||||
@ -24,7 +24,7 @@ env:
|
||||
ANACONDA_USER: pytorch
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
BINARY_ENV_FILE: /tmp/env
|
||||
BUILD_ENVIRONMENT: linux-binary-manywheel
|
||||
BUILD_ENVIRONMENT: linux-binary-manywheel-split
|
||||
BUILDER_ROOT: /builder
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
@ -33,7 +33,7 @@ env:
|
||||
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
SKIP_ALL_TESTS: 0
|
||||
concurrency:
|
||||
group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
group: linux-binary-manywheel-split-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
@ -63,7 +63,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -86,7 +86,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -134,7 +134,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -157,7 +157,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -205,7 +205,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -228,7 +228,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -275,7 +275,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.9"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_9-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cpu-test: # Testing
|
||||
@ -296,7 +296,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.9"
|
||||
build_name: manywheel-py3_9-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
@ -343,7 +343,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -366,7 +366,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -414,7 +414,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -437,7 +437,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -467,76 +467,6 @@ jobs:
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda12_1-full-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_1-full-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_10-cuda12_1-full-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_1-full-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_10-cuda12_1-full-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_1-full
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda12_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -555,7 +485,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -578,7 +508,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -625,7 +555,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cpu-test: # Testing
|
||||
@ -646,7 +576,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
@ -693,7 +623,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -716,7 +646,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -764,7 +694,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -787,7 +717,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -817,6 +747,76 @@ jobs:
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda12_1-full-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_1-full-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_11-cuda12_1-full-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_1-full
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_1-full-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_11-cuda12_1-full-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
BUILDER_ROOT: /builder
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu121
|
||||
GPU_ARCH_VERSION: 12.1
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_1-full
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
|
||||
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda12_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -835,7 +835,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -858,7 +858,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -905,7 +905,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cpu-test: # Testing
|
||||
@ -926,7 +926,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
@ -973,7 +973,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -996,7 +996,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -1044,7 +1044,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1067,7 +1067,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -1115,7 +1115,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1138,7 +1138,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -1185,7 +1185,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cpu-test: # Testing
|
||||
@ -1206,7 +1206,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cpu
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
@ -1253,7 +1253,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1276,7 +1276,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda11_8
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -1324,7 +1324,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1347,7 +1347,7 @@ jobs:
|
||||
use_split_build: True
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_1
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge.nvidia.gpu
|
||||
secrets:
|
||||
@ -1395,7 +1395,7 @@ jobs:
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_4
|
||||
build_environment: linux-binary-manywheel
|
||||
build_environment: linux-binary-manywheel-split
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1418,7 +1418,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda12_4
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
@ -1465,7 +1465,7 @@ jobs:
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cpu
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cpu-test: # Testing
@ -1486,7 +1486,7 @@ jobs:
use_split_build: True
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu
build_environment: linux-binary-manywheel
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge
secrets:
.github/workflows/inductor-cu124.yml
@ -18,11 +18,22 @@ concurrency:
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
|
||||
# Should be synced with the one in inductor.yml, but this doesn't run inductor_timm
|
||||
name: cuda12.4-py3.10-gcc9-sm86
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
|
||||
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
|
||||
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
|
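The `get-label-type` job added across these workflows calls the reusable runner-determinator and exposes a label prefix that downstream jobs prepend to their runner labels. A minimal sketch of the end-to-end pattern is below; the `with:` inputs are taken from the diff, while the consumer job name and its step are illustrative assumptions, not part of this PR:

```yaml
jobs:
  get-label-type:
    # Reusable workflow that decides which runner fleet (label prefix) to use.
    uses: ./.github/workflows/_runner-determinator.yml
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}

  example-consumer:   # hypothetical job name, for illustration only
    needs: get-label-type
    # The prefix from the determinator is prepended to the runner label at runtime.
    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
    steps:
      - run: echo "runner label was resolved by the runner determinator"
```

The same prefix appears later in this diff both as a `runs-on` value and inside `test-matrix` runner entries such as `"${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"`.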
.github/workflows/inductor-micro-benchmark.yml
@ -16,10 +16,21 @@ concurrency:
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build:
|
||||
name: cuda12.1-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
|
||||
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0'
|
||||
|
.github/workflows/inductor-perf-compare.yml
@ -13,10 +13,21 @@ concurrency:
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
|
||||
name: cuda12.1-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
|
||||
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0'
|
||||
|
@ -68,10 +68,21 @@ concurrency:
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
|
||||
name: cuda12.1-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
|
||||
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0'
|
||||
|
@ -50,10 +50,21 @@ concurrency:
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-jammy-aarch64-py3_10-inductor-build:
|
||||
name: linux-jammy-aarch64-py3.10-inductor
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: linux.arm64.m7g.4xlarge
|
||||
build-environment: linux-jammy-aarch64-py3.10
|
||||
docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
|
||||
|
@ -48,10 +48,21 @@ concurrency:
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-jammy-cpu-py3_9-gcc11-inductor-build:
|
||||
name: linux-jammy-cpu-py3.9-gcc11-inductor
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.9-gcc11-build
|
||||
docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
|
||||
test-matrix: |
|
||||
|
.github/workflows/inductor-perf-test-nightly.yml
@ -66,10 +66,21 @@ concurrency:
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
|
||||
name: cuda12.1-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
|
||||
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0'
|
||||
|
.github/workflows/inductor-periodic.yml
@ -18,10 +18,21 @@ concurrency:
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build:
|
||||
name: cuda12.1-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
|
||||
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.6'
|
||||
@ -60,7 +71,9 @@ jobs:
|
||||
linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp:
|
||||
name: cuda12.1-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
|
||||
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0'
|
||||
|
.github/workflows/inductor-rocm.yml
@ -22,10 +22,21 @@ concurrency:
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-focal-rocm6_1-py3_8-inductor-build:
|
||||
name: rocm6.1-py3.8-inductor
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-rocm6.1-py3.8
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
test-matrix: |
|
||||
|
.github/workflows/periodic.yml
@ -57,8 +57,10 @@ jobs:
|
||||
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
linux-focal-cuda12_1-py3_10-gcc9-test:
|
||||
@ -87,8 +89,10 @@ jobs:
|
||||
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
|
||||
@ -333,8 +337,10 @@ jobs:
|
||||
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
|
||||
|
.github/workflows/rocm.yml
@ -3,18 +3,12 @@ name: rocm
on:
push:
branches:
# - main
- main
- release/*
tags:
- ciflow/rocm/*
workflow_dispatch:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 0,8,16 * * 1-5
- cron: 45 4 * * 0,6
- cron: 45 4,12,20 * * 1-5
- cron: 45 12 * * 0,6
- cron: 29 8 * * * # about 1:29am PDT
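The comment above notes that jobs can inspect `github.event.schedule` to activate for only a fraction of the scheduled runs. A minimal sketch of that pattern, assuming a hypothetical job name and runner that are not part of this PR; the cron string matches one of the schedules listed above:

```yaml
jobs:
  rocm-weekend-smoke:            # hypothetical job, for illustration only
    # Run only for the weekend cron, but still allow non-schedule triggers
    # (push, workflow_dispatch) to proceed.
    if: github.event_name != 'schedule' || github.event.schedule == '45 12 * * 0,6'
    runs-on: ubuntu-latest
    steps:
      - run: echo "running on the weekend schedule (or a non-schedule trigger)"
```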

concurrency:
.github/workflows/slow.yml
@ -56,12 +56,14 @@ jobs:
|
||||
cuda-arch-list: 8.6
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 1, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 2, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 3, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 4, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 5, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 6, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 7, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 8, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
|
||||
linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-test:
|
||||
@ -87,8 +89,9 @@ jobs:
|
||||
cuda-arch-list: 8.6
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "slow", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "slow", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "slow", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "slow", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-sm86-test:
|
||||
|
@ -10,8 +10,18 @@ permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
index:
|
||||
runs-on: linux.g5.4xlarge.nvidia.gpu # 1 GPU A10G 24GB each
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" # 1 GPU A10G 24GB each
|
||||
environment: target-determinator-env
|
||||
steps:
|
||||
- name: Clone PyTorch
|
||||
|
.github/workflows/torchbench.yml
@ -11,10 +11,21 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: ./.github/workflows/_runner-determinator.yml
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-torchbench-build-gcp:
|
||||
name: cuda12.1-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
|
||||
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0'
|
||||
|
.github/workflows/trunk.yml
@ -266,8 +266,10 @@ jobs:
|
||||
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
|
.github/workflows/upload-test-stats.yml
@ -96,7 +96,7 @@ jobs:
python3 -m tools.stats.check_disabled_tests --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}"

- name: Upload gpt-fast benchmark results to Rockset
if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success' && github.event.workflow_run.name == 'inductor-micro-benchmark'
if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success' && contains('inductor-micro-benchmark', github.event.workflow_run.name)
env:
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
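In GitHub Actions expressions, `contains(search, item)` is true when `item` occurs within `search`, so the updated condition above still fires for a run named exactly `inductor-micro-benchmark`. A minimal, illustrative step (not part of the diff) using the same check:

```yaml
# Illustrative step only; the step name and echo are assumptions.
- name: Check workflow_run name with contains()
  if: contains('inductor-micro-benchmark', github.event.workflow_run.name)
  run: echo "matched workflow_run name '${{ github.event.workflow_run.name }}'"
```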
README.md
@ -27,8 +27,8 @@ Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.o
- [NVIDIA CUDA Support](#nvidia-cuda-support)
- [AMD ROCm Support](#amd-rocm-support)
- [Intel GPU Support](#intel-gpu-support)
- [Install Dependencies](#install-dependencies)
- [Get the PyTorch Source](#get-the-pytorch-source)
- [Install Dependencies](#install-dependencies)
- [Install PyTorch](#install-pytorch)
- [Adjust Build Options (Optional)](#adjust-build-options-optional)
- [Docker Image](#docker-image)
@ -161,9 +161,34 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)
#### Prerequisites
If you are installing from source, you will need:
- Python 3.8 or later (for Linux, Python 3.8.1+ is needed)
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required)
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
- Visual Studio or Visual Studio Build Tool on Windows

We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
\* PyTorch CI uses Visual C++ BuildTools, which come with Visual Studio Enterprise,
Professional, or Community Editions. You can also install the build tools from
https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not*
come with Visual Studio Code by default.

\* We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.

An example of environment setup is shown below:

* Linux:

```bash
$ source <CONDA_INSTALL_DIR>/bin/activate
$ conda create -y -n <CONDA_NAME>
$ conda activate <CONDA_NAME>
```

* Windows:

```bash
$ source <CONDA_INSTALL_DIR>\Scripts\activate.bat
$ conda create -y -n <CONDA_NAME>
$ conda activate <CONDA_NAME>
$ call "C:\Program Files\Microsoft Visual Studio\<VERSION>\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
```

##### NVIDIA CUDA Support
If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following:
@ -194,12 +219,23 @@ If you want to compile with Intel GPU support, follow these
If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
Other potentially useful environment variables may be found in `setup.py`.

#### Get the PyTorch Source
```bash
git clone --recursive https://github.com/pytorch/pytorch
cd pytorch
# if you are updating an existing checkout
git submodule sync
git submodule update --init --recursive
```

#### Install Dependencies

**Common**

```bash
conda install cmake ninja
# Run this command on native Windows
conda install rust
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source” section below
pip install -r requirements.txt
```
@ -235,15 +271,6 @@ pip install mkl-static mkl-include
conda install -c conda-forge libuv=1.39
```

#### Get the PyTorch Source
```bash
git clone --recursive https://github.com/pytorch/pytorch
cd pytorch
# if you are updating an existing checkout
git submodule sync
git submodule update --init --recursive
```

#### Install PyTorch
**On Linux**

@ -284,13 +311,6 @@ python3 setup.py develop

**On Windows**

Choose Correct Visual Studio Version.

PyTorch CI uses Visual C++ BuildTools, which come with Visual Studio Enterprise,
Professional, or Community Editions. You can also install the build tools from
https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not*
come with Visual Studio Code by default.

If you want to build legacy python code, please refer to [Building on legacy code and CUDA](https://github.com/pytorch/pytorch/blob/main/CONTRIBUTING.md#building-on-legacy-code-and-cuda)

**CPU-only builds**
@ -298,7 +318,6 @@ If you want to build legacy python code, please refer to [Building on legacy cod
In this mode PyTorch computations will run on your CPU, not your GPU

```cmd
conda activate
python setup.py develop
```

@ -299,6 +299,15 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
|
||||
AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)
|
||||
|
||||
#define AT_DISPATCH_CASE_FLOATING_TYPES_AND5( \
|
||||
SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \
|
||||
AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)
|
||||
|
||||
#define AT_DISPATCH_FLOATING_TYPES_AND4( \
|
||||
SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
|
||||
AT_DISPATCH_SWITCH( \
|
||||
@ -307,6 +316,26 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
|
||||
AT_DISPATCH_CASE_FLOATING_TYPES_AND4( \
|
||||
SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))
|
||||
|
||||
#define AT_DISPATCH_FLOATING_TYPES_AND5( \
|
||||
SCALARTYPE1, \
|
||||
SCALARTYPE2, \
|
||||
SCALARTYPE3, \
|
||||
SCALARTYPE4, \
|
||||
SCALARTYPE5, \
|
||||
TYPE, \
|
||||
NAME, \
|
||||
...) \
|
||||
AT_DISPATCH_SWITCH( \
|
||||
TYPE, \
|
||||
NAME, \
|
||||
AT_DISPATCH_CASE_FLOATING_TYPES_AND5( \
|
||||
SCALARTYPE1, \
|
||||
SCALARTYPE2, \
|
||||
SCALARTYPE3, \
|
||||
SCALARTYPE4, \
|
||||
SCALARTYPE5, \
|
||||
__VA_ARGS__))
|
||||
|
||||
#define AT_DISPATCH_CASE_COMPLEX_TYPES(...) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::ComplexDouble, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::ComplexFloat, __VA_ARGS__)
|
||||
|
@ -1408,7 +1408,6 @@ void scaled_gemm(
|
||||
const void *result_scale_ptr,
|
||||
int64_t result_ld,
|
||||
ScalarType result_dtype,
|
||||
void* amax_ptr,
|
||||
bool use_fast_accum) {
|
||||
#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
|
||||
const auto computeType = CUBLAS_COMPUTE_32F;
|
||||
@ -1421,13 +1420,9 @@ void scaled_gemm(
|
||||
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb));
|
||||
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
|
||||
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
|
||||
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
|
||||
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60200)
|
||||
// Amax support in ROCm as of 6.2
|
||||
if (isFloat8Type(result_dtype)) {
|
||||
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, amax_ptr);
|
||||
if (result_scale_ptr != nullptr) {
|
||||
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
|
||||
}
|
||||
#endif
|
||||
#ifndef USE_ROCM
|
||||
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode);
|
||||
#endif
|
||||
|
@ -140,7 +140,6 @@ void scaled_gemm(
|
||||
const void* result_scale_ptr,
|
||||
int64_t result_ld,
|
||||
ScalarType result_dtype,
|
||||
void* amax_ptr,
|
||||
bool use_fast_accum);
|
||||
|
||||
#define CUDABLAS_BGEMM_ARGTYPES(Dtype) \
|
||||
|
@ -188,7 +188,10 @@ TuningResultsValidator::TuningResultsValidator() {
|
||||
RegisterValidator(
|
||||
"ROCM_VERSION",
|
||||
[rocm_version]() { return rocm_version; },
|
||||
[rocm_version](auto&& k) { return rocm_version == k ? OK : FAIL; });
|
||||
[rocm_version](auto&& k) {
|
||||
TUNABLE_LOG1("ROCM_VERSION validation: expect ", k, " to match ", rocm_version);
|
||||
return rocm_version == k ? OK : FAIL;
|
||||
});
|
||||
}
|
||||
// gfx arch
|
||||
{
|
||||
@ -196,7 +199,10 @@ TuningResultsValidator::TuningResultsValidator() {
|
||||
RegisterValidator(
|
||||
"GCN_ARCH_NAME",
|
||||
[gcn_arch_name]() { return gcn_arch_name; },
|
||||
[gcn_arch_name](auto&& k) { return gcn_arch_name == k ? OK : FAIL; });
|
||||
[gcn_arch_name](auto&& k) {
|
||||
TUNABLE_LOG1("GCN_ARCH_NAME validation: expect ", k, " to match ", gcn_arch_name);
|
||||
return gcn_arch_name == k ? OK : FAIL;
|
||||
});
|
||||
}
|
||||
// rocblas
|
||||
{
|
||||
@ -212,7 +218,10 @@ TuningResultsValidator::TuningResultsValidator() {
|
||||
RegisterValidator(
|
||||
"ROCBLAS_VERSION",
|
||||
[rocblas_version]() { return rocblas_version; },
|
||||
[rocblas_version](auto&& k) { return rocblas_version == k ? OK : FAIL; });
|
||||
[rocblas_version](auto&& k) {
|
||||
TUNABLE_LOG1("ROCBLAS_VERSION validation: expect ", k, " to match ", rocblas_version);
|
||||
return rocblas_version == k ? OK : FAIL;
|
||||
});
|
||||
}
|
||||
// hipblaslt
|
||||
{
|
||||
@ -226,7 +235,10 @@ TuningResultsValidator::TuningResultsValidator() {
|
||||
RegisterValidator(
|
||||
"HIPBLASLT_VERSION",
|
||||
[hipblaslt_version]() { return hipblaslt_version; },
|
||||
[hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; });
|
||||
[hipblaslt_version](auto&& k) {
|
||||
TUNABLE_LOG1("HIPBLASLT_VERSION validation: expect ", k, " to match ", hipblaslt_version);
|
||||
return hipblaslt_version == k ? OK : FAIL;
|
||||
});
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -104,7 +104,6 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
|
||||
params->c_scale_ptr,
|
||||
params->ldc,
|
||||
params->c_dtype,
|
||||
params->amax_ptr,
|
||||
params->use_fast_accum);
|
||||
return OK;
|
||||
}
|
||||
|
@ -779,6 +779,28 @@ std::tuple<Tensor, std::optional<int64_t>> scatter_reduce_batch_rule(
|
||||
self, self_bdim, dim, index, index_bdim, src, src_bdim, reduce);
|
||||
}
|
||||
|
||||
std::tuple<Tensor, std::optional<int64_t>> scatter_reduce_two_batch_rule(
|
||||
const Tensor& self, std::optional<int64_t> self_bdim,
|
||||
int64_t dim,
|
||||
const Tensor& index, std::optional<int64_t> index_bdim,
|
||||
const Tensor& src, std::optional<int64_t> src_bdim,
|
||||
const c10::string_view reduce,
|
||||
bool include_self) {
|
||||
return scatter_batch_rule(ATEN_FN2(scatter_reduce, two),
|
||||
self, self_bdim, dim, index, index_bdim, src, src_bdim, reduce, include_self);
|
||||
}
|
||||
|
||||
std::tuple<Tensor, std::optional<int64_t>> scatter_reduce__two_batch_rule(
|
||||
const Tensor& self, std::optional<int64_t> self_bdim,
|
||||
int64_t dim,
|
||||
const Tensor& index, std::optional<int64_t> index_bdim,
|
||||
const Tensor& src, std::optional<int64_t> src_bdim,
|
||||
const c10::string_view reduce,
|
||||
bool include_self) {
|
||||
return scatter_batch_rule(ATEN_FN2(scatter_reduce_, two),
|
||||
self, self_bdim, dim, index, index_bdim, src, src_bdim, reduce, include_self);
|
||||
}
|
||||
|
||||
std::tuple<Tensor, std::optional<int64_t>> scatter_value_reduce_batch_rule(
|
||||
const Tensor& self, std::optional<int64_t> self_bdim,
|
||||
int64_t dim,
|
||||
@ -1250,6 +1272,8 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
|
||||
VMAP_SUPPORT(scatter_add, scatter_add_batch_rule);
|
||||
VMAP_SUPPORT2(scatter, reduce, scatter_reduce_batch_rule);
|
||||
VMAP_SUPPORT2(scatter, value_reduce, scatter_value_reduce_batch_rule);
|
||||
VMAP_SUPPORT2(scatter_reduce, two, scatter_reduce_two_batch_rule);
|
||||
VMAP_SUPPORT2(scatter_reduce_, two, scatter_reduce__two_batch_rule);
|
||||
// as_strided_scatter does not work with the for-loop fallback today,
|
||||
// because as_strided_scatter will return an output that matches
|
||||
// the strides/storage_offset of its input.
|
||||
|
@ -209,7 +209,13 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
|
||||
|
||||
bool all_contiguous = is_contiguous(input);
|
||||
constexpr bool mixed_type = !std::is_same_v<scalar_t, param_t>;
|
||||
const auto dtype = mixed_type ? kFloat : input.scalar_type();
|
||||
// Using float data type for Half _var_sum in batchnorm stats updating on CPU
|
||||
// to avoid _var_sum overflow since the representation range of Half is small.
|
||||
using opmath_t = std::conditional_t<std::is_same_v<param_t, at::Half>, at::opmath_type<param_t>, param_t>;
|
||||
auto dtype = mixed_type ? kFloat : input.scalar_type();
|
||||
if (dtype == kHalf) {
|
||||
dtype = kFloat;
|
||||
}
|
||||
|
||||
auto save_mean_a = save_mean.accessor<param_t, 1>();
|
||||
auto save_var_transform_a = save_var_transform.accessor<param_t, 1>();
|
||||
@ -220,9 +226,9 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
|
||||
if (all_contiguous) {
|
||||
auto _mean = at::empty({n_input}, input.options().dtype(dtype));
|
||||
auto _var_sum = at::empty({n_input}, input.options().dtype(dtype));
|
||||
auto _mean_a = _mean.accessor<param_t, 1>();
|
||||
auto _var_sum_a = _var_sum.accessor<param_t, 1>();
|
||||
auto momentum_ = static_cast<param_t>(momentum);
|
||||
auto _mean_a = _mean.accessor<opmath_t, 1>();
|
||||
auto _var_sum_a = _var_sum.accessor<opmath_t, 1>();
|
||||
auto momentum_ = static_cast<opmath_t>(momentum);
|
||||
|
||||
batch_norm_cpu_collect_stats_stub(kCPU, _mean, _var_sum, input);
|
||||
|
||||
|
@ -11,18 +11,18 @@ namespace ao {
|
||||
namespace sparse {
|
||||
|
||||
namespace {
|
||||
constexpr int64_t serialization_version_index = 0;
|
||||
constexpr int64_t bias_index = 1;
|
||||
constexpr int64_t out_features_block_size_index = 2;
|
||||
constexpr int64_t in_features_block_size_index = 3;
|
||||
constexpr int64_t weight_scales_index = 4;
|
||||
constexpr int64_t weight_zero_point_index = 5;
|
||||
constexpr int64_t quantization_scheme_index = 6;
|
||||
constexpr int64_t row_block_indices_index = 7;
|
||||
constexpr int64_t col_block_indices_index = 8;
|
||||
constexpr int64_t weight_values_index = 9;
|
||||
constexpr int64_t num_output_channels_index = 10;
|
||||
constexpr int64_t num_input_channels_index = 11;
|
||||
constexpr int64_t serialization_version_index [[maybe_unused]] = 0;
|
||||
constexpr int64_t bias_index [[maybe_unused]] = 1;
|
||||
constexpr int64_t out_features_block_size_index [[maybe_unused]] = 2;
|
||||
constexpr int64_t in_features_block_size_index [[maybe_unused]] = 3;
|
||||
constexpr int64_t weight_scales_index [[maybe_unused]] = 4;
|
||||
constexpr int64_t weight_zero_point_index [[maybe_unused]] = 5;
|
||||
constexpr int64_t quantization_scheme_index [[maybe_unused]] = 6;
|
||||
constexpr int64_t row_block_indices_index [[maybe_unused]] = 7;
|
||||
constexpr int64_t col_block_indices_index [[maybe_unused]] = 8;
|
||||
constexpr int64_t weight_values_index [[maybe_unused]] = 9;
|
||||
constexpr int64_t num_output_channels_index [[maybe_unused]] = 10;
|
||||
constexpr int64_t num_input_channels_index [[maybe_unused]] = 11;
|
||||
|
||||
template <typename TENSOR_DTYPE, typename VEC_DTYPE>
|
||||
std::vector<VEC_DTYPE> unwrap_vector(at::Tensor tensor) {
|
||||
|
@ -81,6 +81,12 @@ void atan2_kernel(TensorIteratorBase& iter) {
|
||||
}
|
||||
|
||||
#if !defined(C10_MOBILE)
|
||||
#define _AT_DISPATCH_INTEGRAL_TYPES_V2(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_V2( \
|
||||
TYPE, \
|
||||
NAME, \
|
||||
AT_WRAP(__VA_ARGS__), \
|
||||
AT_EXPAND(AT_INTEGRAL_TYPES_V2))
|
||||
#define _AT_DISPATCH_ALL_TYPES_AND_BOOL(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_V2( \
|
||||
TYPE, \
|
||||
@ -104,6 +110,8 @@ void atan2_kernel(TensorIteratorBase& iter) {
|
||||
AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \
|
||||
kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
|
||||
#else
|
||||
#define _AT_DISPATCH_INTEGRAL_TYPES_V2(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, __VA_ARGS__)
|
||||
#define _AT_DISPATCH_ALL_TYPES_AND_BOOL(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \
|
||||
kComplexHalf, kHalf, kBool, kBFloat16, TYPE, NAME, __VA_ARGS__)
|
||||
@ -382,7 +390,7 @@ void bitwise_and_kernel(TensorIteratorBase& iter) {
|
||||
if (iter.dtype() == ScalarType::Bool) {
|
||||
cpu_kernel(iter, [](bool a, bool b) { return a && b; });
|
||||
} else {
|
||||
AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_and_cpu", [&]() {
|
||||
_AT_DISPATCH_INTEGRAL_TYPES_V2(iter.dtype(), "bitwise_and_cpu", [&]() {
|
||||
cpu_kernel_vec(
|
||||
iter,
|
||||
[](scalar_t a, scalar_t b) -> scalar_t { return a & b; },
|
||||
@ -395,7 +403,7 @@ void bitwise_or_kernel(TensorIteratorBase& iter) {
|
||||
if (iter.dtype() == ScalarType::Bool) {
|
||||
cpu_kernel(iter, [](bool a, bool b) { return a || b; });
|
||||
} else {
|
||||
AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_or_cpu", [&]() {
|
||||
_AT_DISPATCH_INTEGRAL_TYPES_V2(iter.dtype(), "bitwise_or_cpu", [&]() {
|
||||
cpu_kernel_vec(
|
||||
iter,
|
||||
[](scalar_t a, scalar_t b) -> scalar_t { return a | b; },
|
||||
@ -410,7 +418,7 @@ void bitwise_xor_kernel(TensorIteratorBase& iter) {
|
||||
// this operation for both Boolean and integral types.
|
||||
cpu_kernel(iter, [](bool a, bool b) { return a != b; });
|
||||
} else {
|
||||
AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_xor_cpu", [&]() {
|
||||
_AT_DISPATCH_INTEGRAL_TYPES_V2(iter.dtype(), "bitwise_xor_cpu", [&]() {
|
||||
cpu_kernel_vec(
|
||||
iter,
|
||||
[](scalar_t a, scalar_t b) -> scalar_t { return a ^ b; },
|
||||
|
@ -473,8 +473,7 @@ void cpu_flash_attention(
|
||||
scalar_t* transpose_buffer_ptr = transpose_buffer.get();
|
||||
std::unique_ptr<scalar_t[]> v_copy_buffer = std::make_unique<scalar_t[]>(ekvSplitSize * packb_size);
|
||||
scalar_t* v_copy_buffer_ptr = v_copy_buffer.get();
|
||||
for (const auto z : c10::irange(begin, end)) {
|
||||
(void)z; // Suppress unused variable
|
||||
for (C10_UNUSED auto z : c10::irange(begin, end)) {
|
||||
n = l * kvSplitSize;
|
||||
int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n);
|
||||
int64_t ekvBlockSize = kvBlockSize % 2 == 0 ? kvBlockSize : kvBlockSize + 1;
|
||||
@ -567,8 +566,7 @@ void cpu_flash_attention(
|
||||
? query_padding_ptr + ompIdx * qSplitSize * eheadSize
|
||||
: nullptr;
|
||||
|
||||
for (const auto z : c10::irange(begin, end)) {
|
||||
(void)z; // Suppress unused variable
|
||||
for (C10_UNUSED auto z : c10::irange(begin, end)) {
|
||||
int64_t m = k * qSplitSize;
|
||||
int64_t qBlockSize = std::min(qSplitSize, qSize - m);
|
||||
// Initialize max and sum
|
||||
@ -933,8 +931,7 @@ void cpu_flash_attention_backward(
|
||||
|
||||
at::Tensor dsum = at::empty({qSplitSize}, query.options().dtype(accumulate_dtype));
|
||||
accum_t* dsum_data = dsum.data_ptr<accum_t>();
|
||||
for (const auto z : c10::irange(begin, end)) {
|
||||
(void)z; // Suppress unused variable
|
||||
for (C10_UNUSED auto z : c10::irange(begin, end)) {
|
||||
// rowsum of grad_out * out
|
||||
for (int64_t m = 0; m < qSize; m += qSplitSize) {
|
||||
int64_t qBlockSize = std::min(qSplitSize, qSize - m);
|
||||
|
@ -964,9 +964,9 @@ ScalingType get_scaling_type(

} // namespace

// Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax
// Computes matrix multiply + bias while applying scaling to input and output matrices
// Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
// If output matrix type is 16 or 32-bit type, neither scale_result is applied nor amax is computed.
// If output matrix type is 16 or 32-bit type, scale_result is not applied.
// Known limitations:
// - Only works if mat1 is row-major and mat2 is column-major
// - Only works if matrices sizes are divisible by 32
@ -1068,9 +1068,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
|
||||
const auto out_dtype_ = args.result->scalar_type();
|
||||
TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
|
||||
|
||||
// Some scaled_gemms require an amax to populate lets create one here
|
||||
Tensor amax = at::empty({0}, mat1.options().dtype(ScalarType::Float));
|
||||
|
||||
#ifdef USE_ROCM
|
||||
auto tuning_ctx = at::cuda::tunable::getTuningContext();
|
||||
if (tuning_ctx->IsTunableOpEnabled()) {
|
||||
@ -1126,7 +1123,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
|
||||
params.c_scale_ptr = scale_result ? scale_result->data_ptr() : nullptr;
|
||||
params.ldc = args.result_ld;
|
||||
params.c_dtype = out_dtype_;
|
||||
params.amax_ptr = amax.data_ptr();
|
||||
params.use_fast_accum = use_fast_accum;
|
||||
if (transa_ && transb_) {
|
||||
TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T)
|
||||
@ -1150,11 +1146,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
|
||||
else
|
||||
#endif
|
||||
{
|
||||
#if defined(USE_ROCM) && ROCM_VERSION >= 60200
|
||||
// hipBlasLT requires scaleD to be set to something in order to use AMAX
|
||||
auto dummy_options = TensorOptions().dtype(kFloat).device(kCUDA);
|
||||
auto dummy_scale = at::ones(1, dummy_options);
|
||||
#endif
|
||||
at::cuda::blas::scaled_gemm(
|
||||
args.transa,
|
||||
args.transb,
|
||||
@ -1172,14 +1163,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
|
||||
bias ? bias->data_ptr(): nullptr,
|
||||
bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_,
|
||||
args.result->data_ptr(),
|
||||
#if defined(USE_ROCM) && ROCM_VERSION >= 60200
|
||||
scale_result ? scale_result->data_ptr() : dummy_scale.data_ptr(),
|
||||
#else
|
||||
scale_result ? scale_result->data_ptr() : nullptr,
|
||||
#endif
|
||||
args.result_ld,
|
||||
out_dtype_,
|
||||
amax.data_ptr(),
|
||||
use_fast_accum);
|
||||
}
|
||||
|
||||
|
@ -1092,7 +1092,11 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
|
||||
}
|
||||
|
||||
constexpr int min_values_per_thread = 16;
|
||||
#ifndef USE_ROCM
|
||||
constexpr int max_values_per_thread = 256;
|
||||
#else
|
||||
constexpr int max_values_per_thread = 1024;
|
||||
#endif
|
||||
|
||||
if (config.values_per_thread() >= block_height * 16 || config.values_per_thread() >= max_values_per_thread) {
|
||||
// Divide the input across warps in a thread-block, if that leaves at least
|
||||
|
@ -177,12 +177,11 @@ struct KthValueLauncher {
|
||||
cuda::detail::TensorInfo<scalar_t, index_t> values_info,
|
||||
int collapse_values_dim,
|
||||
cuda::detail::TensorInfo<int64_t, index_t> indices_info,
|
||||
int collapse_indices_dim,
|
||||
C10_UNUSED int collapse_indices_dim,
|
||||
cuda::detail::TensorInfo<const scalar_t, index_t> self_info,
|
||||
int collapse_self_dim,
|
||||
int64_t num_slices,
|
||||
int64_t slice_size) {
|
||||
(void)collapse_indices_dim; // Suppress unused variable warning
|
||||
dim3 grid;
|
||||
if (!getGridFromTiles(num_slices, grid)) {
|
||||
AT_ERROR("slices are too many");
|
||||
@ -213,15 +212,13 @@ struct MedianLauncher {
|
||||
template <typename scalar_t, typename index_t, int all_dims>
|
||||
inline void launch(
|
||||
cuda::detail::TensorInfo<scalar_t, index_t> values_info,
|
||||
int collapse_values_dim,
|
||||
C10_UNUSED int collapse_values_dim,
|
||||
cuda::detail::TensorInfo<int64_t, index_t> indices_info,
|
||||
int collapse_indices_dim,
|
||||
C10_UNUSED int collapse_indices_dim,
|
||||
cuda::detail::TensorInfo<const scalar_t, index_t> self_info,
|
||||
int collapse_self_dim,
|
||||
int64_t num_slices,
|
||||
int64_t slice_size) {
|
||||
(void)collapse_values_dim; // Suppress unused variable warning
|
||||
(void)collapse_indices_dim; // Suppress unused variable warning
|
||||
dim3 grid;
|
||||
if (!getGridFromTiles(num_slices, grid)) {
|
||||
AT_ERROR("slices are too many");
|
||||
|
@ -22,6 +22,7 @@ void run_cudnn_SDP_fprop(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
Tensor& softmaxstats,
|
||||
Tensor& o,
|
||||
Tensor& dropoutseed,
|
||||
@ -43,6 +44,7 @@ void run_cudnn_SDP_bprop(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
const Tensor& o,
|
||||
const Tensor& dO,
|
||||
const Tensor& softmaxstats,
|
||||
@ -86,9 +88,9 @@ using graph_and_tensors = std::tuple<
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Q,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // K,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // V,
|
||||
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>>, // Bias
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale,
|
||||
// TODO(eqy): additional options
|
||||
// std::shared_ptr<fe::graph::Tensor_attributes>, // Bias,
|
||||
// std::shared_ptr<fe::graph::Tensor_attributes>, // SEQ_LEN_Q,
|
||||
// std::shared_ptr<fe::graph::Tensor_attributes>, // SEQ_LEN_KV,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Seed,
|
||||
@ -104,7 +106,8 @@ using graph_and_tensors_backward = std::tuple<
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Q,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // K,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // V,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale
|
||||
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>>, // Bias,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Attn_scale,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Seed,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // Offset,
|
||||
std::shared_ptr<fe::graph::Tensor_attributes>, // O,
|
||||
@ -126,6 +129,8 @@ struct MHAParams {
|
||||
std::array<int, MAX_MHA_DIM> q_stride;
|
||||
std::array<int, MAX_MHA_DIM> k_stride;
|
||||
std::array<int, MAX_MHA_DIM> v_stride;
|
||||
std::array<int, MAX_MHA_DIM> bias_dim;
|
||||
std::array<int, MAX_MHA_DIM> bias_stride;
|
||||
int64_t b;
|
||||
int64_t h;
|
||||
int64_t s_q;
|
||||
@ -135,6 +140,9 @@ struct MHAParams {
|
||||
double dropout_probability;
|
||||
bool is_causal;
|
||||
bool return_softmaxstats;
|
||||
// might be redundant if we take 0 dim/stride
|
||||
// as signaling no-bias
|
||||
bool has_attn_bias;
|
||||
};
|
||||
|
||||
void setMHAParams(
|
||||
@ -148,6 +156,7 @@ void setMHAParams(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
double dropout_probability,
|
||||
bool is_causal,
|
||||
bool return_softmaxstats) {
|
||||
@ -166,6 +175,7 @@ void setMHAParams(
|
||||
params.dropout_probability = dropout_probability;
|
||||
params.is_causal = is_causal;
|
||||
params.return_softmaxstats = return_softmaxstats;
|
||||
params.has_attn_bias = attn_bias.has_value();
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
q.sizes().size() == MAX_MHA_DIM,
|
||||
"Q tensor has unexpected number of dims, please report a bug to PyTorch.");
|
||||
@ -190,6 +200,17 @@ void setMHAParams(
|
||||
std::copy(k.strides().begin(), k.strides().end(), params.k_stride.begin());
|
||||
std::copy(v.sizes().begin(), v.sizes().end(), params.v_dim.begin());
|
||||
std::copy(v.strides().begin(), v.strides().end(), params.v_stride.begin());
|
||||
// uninit is OK as the struct is memset 0'd
|
||||
if (params.has_attn_bias) {
|
||||
std::copy(
|
||||
attn_bias.value().sizes().begin(),
|
||||
attn_bias.value().sizes().end(),
|
||||
params.bias_dim.begin());
|
||||
std::copy(
|
||||
attn_bias.value().strides().begin(),
|
||||
attn_bias.value().strides().end(),
|
||||
params.bias_stride.begin());
|
||||
}
|
||||
}
|
||||
|
||||
struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
|
||||
@ -203,6 +224,7 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
double dropout_probability,
|
||||
bool is_causal,
|
||||
bool return_softmaxstats) {
|
||||
@ -217,6 +239,7 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
attn_bias,
|
||||
dropout_probability,
|
||||
is_causal,
|
||||
return_softmaxstats);
|
||||
@ -285,6 +308,7 @@ auto build_graph_and_tensors(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
Tensor& softmaxstats,
|
||||
Tensor& o,
|
||||
Tensor& dropoutseed,
|
||||
@ -301,36 +325,6 @@ auto build_graph_and_tensors(
|
||||
mha_graph->set_io_data_type(dtype)
|
||||
.set_intermediate_data_type(fe::DataType_t::FLOAT)
|
||||
.set_compute_data_type(fe::DataType_t::FLOAT);
|
||||
auto Q = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("Q")
|
||||
.set_dim(std::vector<int64_t>(
|
||||
q.sizes().data(), q.sizes().data() + q.sizes().size()))
|
||||
.set_stride(fixSizeOneDimStrideSDPA(
|
||||
q.sizes(),
|
||||
std::vector<int64_t>(
|
||||
q.strides().data(),
|
||||
q.strides().data() + q.strides().size()))));
|
||||
auto K = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("K")
|
||||
.set_dim(std::vector<int64_t>(
|
||||
k.sizes().data(), k.sizes().data() + k.sizes().size()))
|
||||
.set_stride(fixSizeOneDimStrideSDPA(
|
||||
k.sizes(),
|
||||
std::vector<int64_t>(
|
||||
k.strides().data(),
|
||||
k.strides().data() + k.strides().size()))));
|
||||
auto V = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("V")
|
||||
.set_dim(std::vector<int64_t>(
|
||||
v.sizes().data(), v.sizes().data() + v.sizes().size()))
|
||||
.set_stride(fixSizeOneDimStrideSDPA(
|
||||
v.sizes(),
|
||||
std::vector<int64_t>(
|
||||
v.strides().data(),
|
||||
v.strides().data() + v.strides().size()))));
|
||||
auto attn_scale =
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Attn_scale")
|
||||
@ -338,11 +332,6 @@ auto build_graph_and_tensors(
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_is_pass_by_value(true)
|
||||
.set_data_type(fe::DataType_t::FLOAT));
|
||||
// TODO(eqy): support bias in the future in a follow-up PR
|
||||
// auto bias = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
// .set_name("bias")
|
||||
// .set_dim({b, 1, s_q, s_kv})
|
||||
// .set_stride({s_q * s_kv, s_q * s_kv, s_kv, 1}));
|
||||
auto seed = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Seed")
|
||||
.set_dim({1, 1, 1, 1})
|
||||
@ -360,11 +349,30 @@ auto build_graph_and_tensors(
|
||||
.set_causal_mask(is_causal)
|
||||
.set_attn_scale(attn_scale)
|
||||
.set_dropout(dropout_probability, seed, offset);
|
||||
// Optional bias in flash attention is only supported 8.9.3 onwards
|
||||
if (cudnnGetVersion() >= 8904) {
|
||||
// scaled_dot_product_flash_attention_options.set_alibi_mask(true);
|
||||
auto Q = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("Q")
|
||||
.set_dim(q.sizes().vec())
|
||||
.set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec())));
|
||||
auto K = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("K")
|
||||
.set_dim(k.sizes().vec())
|
||||
.set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec())));
|
||||
auto V = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("V")
|
||||
.set_dim(v.sizes().vec())
|
||||
.set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec())));
|
||||
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
|
||||
if (attn_bias.has_value()) {
|
||||
bias =
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("bias")
|
||||
.set_dim(attn_bias.value().sizes().vec())
|
||||
.set_stride(attn_bias.value().strides().vec()));
|
||||
scaled_dot_product_flash_attention_options.set_bias(bias.value());
|
||||
}
|
||||
|
||||
auto seq_q = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Seq_q")
|
||||
.set_dim({b, 1, 1, 1})
|
||||
@ -376,20 +384,9 @@ auto build_graph_and_tensors(
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_data_type(fe::DataType_t::INT32));
|
||||
|
||||
// if (cudnnGetVersion() >= 8903) {
|
||||
// scaled_dot_product_flash_attention_options.set_bias(bias)
|
||||
// .set_padding_mask(true)
|
||||
// .set_seq_len_q(seq_q)
|
||||
// .set_seq_len_kv(seq_kv);
|
||||
// }
|
||||
|
||||
auto [O, Stats] =
|
||||
mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options);
|
||||
O->set_output(true)
|
||||
.set_dim(std::vector<int64_t>(
|
||||
o.sizes().data(), o.sizes().data() + o.sizes().size()))
|
||||
.set_stride(std::vector<int64_t>(
|
||||
o.strides().data(), o.strides().data() + o.strides().size()));
|
||||
O->set_output(true).set_dim(o.sizes().vec()).set_stride(o.strides().vec());
|
||||
|
||||
if (Stats) {
|
||||
Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT);
|
||||
@ -407,6 +404,7 @@ auto build_graph_and_tensors(
|
||||
std::move(Q),
|
||||
std::move(K),
|
||||
std::move(V),
|
||||
std::move(bias),
|
||||
std::move(attn_scale),
|
||||
std::move(seed),
|
||||
std::move(offset),
|
||||
@ -427,6 +425,7 @@ auto build_graph_and_tensors_backward(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
const Tensor& o,
|
||||
const Tensor& dO,
|
||||
const Tensor& softmaxstats,
|
||||
@ -447,24 +446,6 @@ auto build_graph_and_tensors_backward(
|
||||
mha_graph->set_io_data_type(dtype)
|
||||
.set_intermediate_data_type(fe::DataType_t::FLOAT)
|
||||
.set_compute_data_type(fe::DataType_t::FLOAT);
|
||||
auto Q = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("Q")
|
||||
.set_dim(std::vector<int64_t>(q.sizes().begin(), q.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(q.strides().begin(), q.strides().end())));
|
||||
auto K = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("K")
|
||||
.set_dim(std::vector<int64_t>(k.sizes().begin(), k.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(k.strides().begin(), k.strides().end())));
|
||||
auto V = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("V")
|
||||
.set_dim(std::vector<int64_t>(v.sizes().begin(), v.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(v.strides().begin(), v.strides().end())));
|
||||
auto attn_scale =
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Attn_scale")
|
||||
@ -472,6 +453,31 @@ auto build_graph_and_tensors_backward(
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_is_pass_by_value(true)
|
||||
.set_data_type(fe::DataType_t::FLOAT));
|
||||
auto sdpa_backward_options = fe::graph::SDPA_backward_attributes()
|
||||
.set_name("CUDNN_SDPA_BACKWARD")
|
||||
.set_causal_mask(is_causal)
|
||||
.set_attn_scale(attn_scale);
|
||||
auto Q = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Q")
|
||||
.set_dim(q.sizes().vec())
|
||||
.set_stride(q.strides().vec()));
|
||||
auto K = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("K")
|
||||
.set_dim(k.sizes().vec())
|
||||
.set_stride(k.strides().vec()));
|
||||
auto V = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("V")
|
||||
.set_dim(v.sizes().vec())
|
||||
.set_stride(v.strides().vec()));
|
||||
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
|
||||
if (attn_bias.has_value()) {
|
||||
bias =
|
||||
mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("bias")
|
||||
.set_dim(attn_bias.value().sizes().vec())
|
||||
.set_stride(attn_bias.value().strides().vec()));
|
||||
sdpa_backward_options.set_bias(bias.value());
|
||||
}
|
||||
auto Seed = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Seed")
|
||||
.set_dim({1, 1, 1, 1})
|
||||
@ -482,47 +488,27 @@ auto build_graph_and_tensors_backward(
|
||||
.set_dim({1, 1, 1, 1})
|
||||
.set_stride({1, 1, 1, 1})
|
||||
.set_data_type(fe::DataType_t::INT32));
|
||||
auto O = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("O")
|
||||
.set_dim(std::vector<int64_t>(o.sizes().begin(), o.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(o.strides().begin(), o.strides().end())));
|
||||
auto STATS = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("Stats")
|
||||
.set_dim(std::vector<int64_t>(
|
||||
softmaxstats.sizes().begin(), softmaxstats.sizes().end()))
|
||||
.set_stride(std::vector<int64_t>(
|
||||
softmaxstats.strides().begin(), softmaxstats.strides().end()))
|
||||
.set_data_type(fe::DataType_t::FLOAT));
|
||||
auto DO = mha_graph->tensor(
|
||||
fe::graph::Tensor_attributes()
|
||||
.set_name("DO")
|
||||
.set_dim(std::vector<int64_t>(dO.sizes().begin(), dO.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(dO.strides().begin(), dO.strides().end())));
|
||||
auto sdpa_backward_options = fe::graph::SDPA_backward_attributes()
|
||||
.set_name("CUDNN_SDPA_BACKWARD")
|
||||
.set_causal_mask(is_causal)
|
||||
.set_attn_scale(attn_scale);
|
||||
auto O = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("O")
|
||||
.set_dim(o.sizes().vec())
|
||||
.set_stride(o.strides().vec()));
|
||||
auto STATS = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("Stats")
|
||||
.set_dim(softmaxstats.sizes().vec())
|
||||
.set_stride(softmaxstats.strides().vec())
|
||||
.set_data_type(fe::DataType_t::FLOAT));
|
||||
auto DO = mha_graph->tensor(fe::graph::Tensor_attributes()
|
||||
.set_name("DO")
|
||||
.set_dim(dO.sizes().vec())
|
||||
.set_stride(dO.strides().vec()));
|
||||
if (dropout_probability != 0.0f) {
|
||||
sdpa_backward_options.set_dropout(dropout_probability, Seed, Offset);
|
||||
}
|
||||
auto [DQ, DK, DV] =
|
||||
mha_graph->sdpa_backward(Q, K, V, O, DO, STATS, sdpa_backward_options);
|
||||
DQ->set_output(true)
|
||||
.set_dim(std::vector<int64_t>(dQ.sizes().begin(), dQ.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(dQ.strides().begin(), dQ.strides().end()));
|
||||
DK->set_output(true)
|
||||
.set_dim(std::vector<int64_t>(dK.sizes().begin(), dK.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(dK.strides().begin(), dK.strides().end()));
|
||||
DV->set_output(true)
|
||||
.set_dim(std::vector<int64_t>(dV.sizes().begin(), dV.sizes().end()))
|
||||
.set_stride(
|
||||
std::vector<int64_t>(dV.strides().begin(), dV.strides().end()));
|
||||
DQ->set_output(true).set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec());
|
||||
DK->set_output(true).set_dim(dK.sizes().vec()).set_stride(dK.strides().vec());
|
||||
DV->set_output(true).set_dim(dV.sizes().vec()).set_stride(dV.strides().vec());
|
||||
AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
|
||||
AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle));
|
||||
AT_CUDNN_FRONTEND_CHECK(
|
||||
@ -534,6 +520,7 @@ auto build_graph_and_tensors_backward(
|
||||
std::move(Q),
|
||||
std::move(K),
|
||||
std::move(V),
|
||||
std::move(bias),
|
||||
std::move(attn_scale),
|
||||
std::move(Seed),
|
||||
std::move(Offset),
|
||||
@ -559,6 +546,7 @@ void run_cudnn_SDP_fprop(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
Tensor& softmaxstats,
|
||||
Tensor& o,
|
||||
Tensor& dropoutseed,
|
||||
@ -573,6 +561,11 @@ void run_cudnn_SDP_fprop(
|
||||
softmaxstats = at::empty({b, h, s_q}, q.options().dtype(kFloat));
|
||||
}
|
||||
|
||||
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel()) {
return;
}
|
||||
|
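The 0-element guard above means degenerate inputs skip the cuDNN graph build and execution entirely. A rough Python-level sketch of the behaviour this protects (shapes are illustrative; assumes a CUDA build that routes scaled_dot_product_attention to the fused path):

```python
import torch
import torch.nn.functional as F

# Zero-length sequence: batch 2, 4 heads, 0 tokens, head_dim 64 (hypothetical shapes).
q = torch.randn(2, 4, 0, 64, device="cuda", dtype=torch.float16)
k = torch.randn(2, 4, 0, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 4, 0, 64, device="cuda", dtype=torch.float16)

# With the early return, the fused kernel is never launched and the call should
# simply hand back an empty output of the query's shape.
out = F.scaled_dot_product_attention(q, k, v)
print(out.shape)  # torch.Size([2, 4, 0, 64])
```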
||||
auto key = MHACacheKeyWrapper(
|
||||
b,
|
||||
h,
|
||||
@ -583,6 +576,7 @@ void run_cudnn_SDP_fprop(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
attn_bias,
|
||||
dropout_probability,
|
||||
is_causal,
|
||||
return_softmaxstats);
|
||||
@ -605,13 +599,14 @@ void run_cudnn_SDP_fprop(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
attn_bias,
|
||||
softmaxstats,
|
||||
o,
|
||||
dropoutseed,
|
||||
dropoutoffset,
|
||||
handle);
|
||||
}
|
||||
auto [mha_graph, Q, K, V, attn_scale, seed, offset, O, Stats] =
|
||||
auto [mha_graph, Q, K, V, bias, attn_scale, seed, offset, O, Stats] =
|
||||
graph_and_tensors_values;
|
||||
std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
|
||||
variant_pack = {
|
||||
@ -619,13 +614,15 @@ void run_cudnn_SDP_fprop(
|
||||
{K, k.data_ptr()},
|
||||
{V, v.data_ptr()},
|
||||
{attn_scale, &scaling_factor},
|
||||
//{bias, bias.data_ptr()},
|
||||
{seed, dropoutseed.data_ptr()},
|
||||
{offset, dropoutoffset.data_ptr()},
|
||||
{O, o.data_ptr()}};
|
||||
if (return_softmaxstats) {
|
||||
variant_pack[Stats] = softmaxstats.data_ptr();
|
||||
}
|
||||
if (attn_bias.has_value()) {
|
||||
variant_pack[bias.value()] = attn_bias.value().data_ptr();
|
||||
}
|
||||
auto workspace_size = mha_graph->get_workspace_size();
|
||||
auto workspace_ptr =
|
||||
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
|
||||
@ -647,6 +644,7 @@ void run_cudnn_SDP_bprop(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
const Tensor& o,
|
||||
const Tensor& dO,
|
||||
const Tensor& softmaxstats,
|
||||
@ -655,6 +653,12 @@ void run_cudnn_SDP_bprop(
|
||||
Tensor& dV,
|
||||
const Tensor& dropoutseed,
|
||||
const Tensor& dropoutoffset) {
|
||||
// do nothing if we got 0-element tensors
|
||||
if (!q.numel() || !k.numel() || !v.numel() || !o.numel() || !dO.numel() ||
|
||||
!softmaxstats.numel()) {
|
||||
return;
|
||||
}
|
||||
|
||||
Tensor dO_ = dO;
|
||||
if (!dO.strides()[dO.strides().size() - 1]) {
|
||||
TORCH_WARN(
|
||||
@ -694,6 +698,7 @@ void run_cudnn_SDP_bprop(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
attn_bias,
|
||||
dropout_probability,
|
||||
is_causal,
|
||||
true);
|
||||
@ -715,6 +720,7 @@ void run_cudnn_SDP_bprop(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
attn_bias,
|
||||
o,
|
||||
dO_,
|
||||
softmaxstats,
|
||||
@ -726,8 +732,20 @@ void run_cudnn_SDP_bprop(
|
||||
handle);
|
||||
}
|
||||
auto
|
||||
[mha_graph, Q, K, V, attn_scale, Seed, Offset, O, Do, Stats, Dq, Dk, Dv] =
|
||||
graph_and_tensors_backward_values;
|
||||
[mha_graph,
|
||||
Q,
|
||||
K,
|
||||
V,
|
||||
bias,
|
||||
attn_scale,
|
||||
Seed,
|
||||
Offset,
|
||||
O,
|
||||
Do,
|
||||
Stats,
|
||||
Dq,
|
||||
Dk,
|
||||
Dv] = graph_and_tensors_backward_values;
|
||||
std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
|
||||
variant_pack = {// inputs
|
||||
{Q, q.data_ptr()},
|
||||
@ -746,6 +764,9 @@ void run_cudnn_SDP_bprop(
|
||||
variant_pack[Seed] = dropoutseed.data_ptr();
|
||||
variant_pack[Offset] = dropoutoffset.data_ptr();
|
||||
}
|
||||
if (attn_bias.has_value()) {
|
||||
variant_pack[bias.value()] = attn_bias.value().data_ptr();
|
||||
}
|
||||
auto workspace_size = mha_graph->get_workspace_size();
|
||||
auto workspace_ptr =
|
||||
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
|
||||
|
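With attn_bias plumbed through run_cudnn_SDP_fprop/run_cudnn_SDP_bprop, an additive mask can now reach the cuDNN backend end to end. A hedged sketch of exercising it from Python, assuming a recent CUDA build where torch.nn.attention exposes SDPBackend.CUDNN_ATTENTION:

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import sdpa_kernel, SDPBackend

b, h, s, d = 2, 8, 128, 64  # illustrative sizes
q = torch.randn(b, h, s, d, device="cuda", dtype=torch.float16, requires_grad=True)
k = torch.randn(b, h, s, d, device="cuda", dtype=torch.float16, requires_grad=True)
v = torch.randn(b, h, s, d, device="cuda", dtype=torch.float16, requires_grad=True)

# Additive bias; 2D/3D biases are expanded to (b, h, s_q, s_k) before the
# graph sees them, so a plain (s, s) bias is enough here.
bias = torch.randn(s, s, device="cuda", dtype=torch.float16)

with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v, attn_mask=bias)
out.sum().backward()  # also exercises the bias-aware backward graph
```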
@ -18,6 +18,7 @@ void run_cudnn_SDP_fprop(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
Tensor& softmaxstats,
|
||||
Tensor& o,
|
||||
Tensor& dropoutseed,
|
||||
@ -36,6 +37,7 @@ void run_cudnn_SDP_bprop(
|
||||
const Tensor& q,
|
||||
const Tensor& k,
|
||||
const Tensor& v,
|
||||
const std::optional<Tensor>& attn_bias,
|
||||
const Tensor& o,
|
||||
const Tensor& dO,
|
||||
const Tensor& softmaxstats,
|
||||
|
@ -197,7 +197,8 @@ Tensor group_norm(

const Tensor kEmpty;
auto memory_format = input.suggest_memory_format();
const auto& X = input.device().is_cpu() ? input.contiguous(memory_format) : input.contiguous();
const auto& X = input.device().is_cpu() || input.is_privateuseone() ?
input.contiguous(memory_format) : input.contiguous();
const auto& gamma = weight.defined() ? weight.contiguous() : kEmpty;
const auto& beta = bias.defined() ? bias.contiguous() : kEmpty;
TORCH_CHECK(!gamma.defined() || gamma.sym_numel() == C);
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/native/cpu/mixed_data_type.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <ATen/OpMathType.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
@ -295,7 +296,12 @@ Tensor rms_norm(
eps_val = eps.value();
}

auto result = input.mul(at::rsqrt(at::pow(input, 2).mean(dims_to_reduce_ref, /*keep_dim=*/true).add_(eps_val)));
// upcast is needed for fp16 and bf16
c10::ScalarType opmath_t = toOpMathType(input.scalar_type());
Tensor upcasted_input = input.to(opmath_t);

Tensor rqrst_input = rsqrt(at::pow(upcasted_input, 2).mean(dims_to_reduce_ref, /*keep_dim=*/true).add_(eps_val));
Tensor result = upcasted_input.mul(rqrst_input).type_as(input);

if (weight_opt.has_value()) {
result = result.mul(weight_opt.value());
|
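The rms_norm change above computes the statistic in the op-math type (float for fp16/bf16) and casts back at the end. A small reference sketch of the same recipe using only public tensor ops:

```python
import torch

def rms_norm_ref(x: torch.Tensor, weight=None, eps: float = 1e-6) -> torch.Tensor:
    # Upcast so mean(x^2) is accumulated in float32 rather than fp16/bf16.
    x_f = x.float()
    rstd = torch.rsqrt(x_f.pow(2).mean(dim=-1, keepdim=True).add_(eps))
    out = (x_f * rstd).type_as(x)
    return out * weight if weight is not None else out

x = 100 * torch.randn(4, 1024, dtype=torch.bfloat16)  # large values stress low-precision accumulation
print(rms_norm_ref(x).dtype)  # bfloat16 output, float32 statistics
```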
@ -2,10 +2,10 @@
|
||||
|
||||
@interface MPSCNNNeuronOp : NSObject
|
||||
|
||||
+ (MPSCNNNeuronHardSigmoid*)hardSigmoid API_AVAILABLE(ios(11.0), macos(10.13));
|
||||
+ (MPSCNNNeuronReLU*)relu;
|
||||
+ (MPSCNNNeuronSigmoid*)sigmoid;
|
||||
+ (MPSCNNNeuronTanH*)tanh;
|
||||
+ (MPSCNNNeuron*)hardSigmoid API_AVAILABLE(ios(11.0), macos(10.13));
|
||||
+ (MPSCNNNeuron*)relu;
|
||||
+ (MPSCNNNeuron*)sigmoid;
|
||||
+ (MPSCNNNeuron*)tanh;
|
||||
|
||||
@end
|
||||
|
||||
|
@ -8,69 +8,67 @@ C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-declarations")
|
||||
|
||||
@implementation MPSCNNNeuronOp
|
||||
|
||||
+ (MPSCNNNeuronHardSigmoid*)hardSigmoid API_AVAILABLE(ios(11.0), macos(10.13)) {
|
||||
// Remove this once we support iOS 11.3
|
||||
#if TARGET_OS_MACCATALYST
|
||||
return nil;
|
||||
#else
|
||||
+ (MPSCNNNeuron*)hardSigmoid API_AVAILABLE(ios(11.0), macos(10.13)) {
|
||||
static MPSCNNNeuron* neuron = nil;
|
||||
static dispatch_once_t onceToken;
|
||||
static MPSCNNNeuronHardSigmoid* neuron = nil;
|
||||
dispatch_once(&onceToken, ^{
|
||||
#if TARGET_OS_MACCATALYST
|
||||
neuron = [[MPSCNNNeuron alloc] initWithDevice:[MetalContext sharedInstance].device neuronDescriptor:[MPSCNNNeuronOpDescriptor hardSigmoidDescriptor]];
|
||||
#else
|
||||
neuron = [[MPSCNNNeuronHardSigmoid alloc]
|
||||
initWithDevice:[MetalContext sharedInstance].device
|
||||
a:1.0 / 6.0
|
||||
b:0.5];
|
||||
initWithDevice:[MetalContext sharedInstance].device
|
||||
a:1.0 / 6.0
|
||||
b:0.5];
|
||||
#endif
|
||||
});
|
||||
return neuron;
|
||||
#endif
|
||||
}
|
||||
|
||||
+ (MPSCNNNeuronReLU*)relu {
|
||||
// Remove this once we support iOS 11.3
|
||||
#if TARGET_OS_MACCATALYST
|
||||
return nil;
|
||||
#else
|
||||
static MPSCNNNeuronReLU* relu = nil;
|
||||
+ (MPSCNNNeuron*)relu {
|
||||
static MPSCNNNeuron* neuron = nil;
|
||||
static dispatch_once_t onceToken;
|
||||
dispatch_once(&onceToken, ^{
|
||||
relu = [[MPSCNNNeuronReLU alloc]
|
||||
initWithDevice:[MetalContext sharedInstance].device
|
||||
a:0];
|
||||
});
|
||||
return relu;
|
||||
#if TARGET_OS_MACCATALYST
|
||||
neuron = [[MPSCNNNeuron alloc]
|
||||
initWithDevice:[MetalContext sharedInstance].device
|
||||
neuronDescriptor:[MPSCNNNeuronOpDescriptor reluDescriptor]];
|
||||
#else
|
||||
neuron = [[MPSCNNNeuronReLU alloc]
|
||||
initWithDevice:[MetalContext sharedInstance].device
|
||||
a:0];
|
||||
#endif
|
||||
});
|
||||
return neuron;
|
||||
}
|
||||
|
||||
+ (MPSCNNNeuronSigmoid*)sigmoid {
|
||||
// Remove this once we support iOS 11.3
|
||||
#if TARGET_OS_MACCATALYST
|
||||
return nil;
|
||||
#else
|
||||
+ (MPSCNNNeuron*)sigmoid {
|
||||
static MPSCNNNeuron* neuron = nil;
|
||||
static dispatch_once_t onceToken;
|
||||
static MPSCNNNeuronSigmoid* sigmoid = nil;
|
||||
dispatch_once(&onceToken, ^{
|
||||
sigmoid = [[MPSCNNNeuronSigmoid alloc]
|
||||
initWithDevice:[MetalContext sharedInstance].device];
|
||||
});
|
||||
return sigmoid;
|
||||
#if TARGET_OS_MACCATALYST
|
||||
neuron = [[MPSCNNNeuron alloc] initWithDevice:[MetalContext sharedInstance].device neuronDescriptor:[MPSCNNNeuronOpDescriptor sigmoidDescriptor]];
|
||||
#else
|
||||
neuron = [[MPSCNNNeuronSigmoid alloc]
|
||||
initWithDevice:[MetalContext sharedInstance].device];
|
||||
#endif
|
||||
});
|
||||
return neuron;
|
||||
}
|
||||
|
||||
+ (MPSCNNNeuronTanH*)tanh {
|
||||
// Remove this once we support iOS 11.3
|
||||
#if TARGET_OS_MACCATALYST
|
||||
return nil;
|
||||
#else
|
||||
+ (MPSCNNNeuron*)tanh {
|
||||
static MPSCNNNeuron* neuron = nil;
|
||||
static dispatch_once_t onceToken;
|
||||
static MPSCNNNeuronTanH* tanh = nil;
|
||||
dispatch_once(&onceToken, ^{
|
||||
tanh = [[MPSCNNNeuronTanH alloc]
|
||||
initWithDevice:[MetalContext sharedInstance].device
|
||||
a:1
|
||||
b:1];
|
||||
});
|
||||
return tanh;
|
||||
#if TARGET_OS_MACCATALYST
|
||||
neuron = [[MPSCNNNeuron alloc] initWithDevice:[MetalContext sharedInstance].device neuronDescriptor:[MPSCNNNeuronOpDescriptor tanhDescriptor]];
|
||||
#else
|
||||
neuron = [[MPSCNNNeuronTanH alloc]
|
||||
initWithDevice:[MetalContext sharedInstance].device
|
||||
a:1
|
||||
b:1];
|
||||
#endif
|
||||
});
|
||||
return neuron;
|
||||
}
|
||||
|
||||
@end
|
||||
@ -85,9 +83,9 @@ API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0))
|
||||
static MPSNNNeuronDescriptor* neuronDesc = nil;
|
||||
dispatch_once(&onceToken, ^{
|
||||
neuronDesc = [MPSNNNeuronDescriptor
|
||||
cnnNeuronDescriptorWithType:MPSCNNNeuronTypeHardSigmoid
|
||||
a:1.0 / 6.0
|
||||
b:0.5];
|
||||
cnnNeuronDescriptorWithType:MPSCNNNeuronTypeHardSigmoid
|
||||
a:1.0 / 6.0
|
||||
b:0.5];
|
||||
});
|
||||
return neuronDesc;
|
||||
}
|
||||
@ -97,8 +95,8 @@ API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0))
|
||||
static MPSNNNeuronDescriptor* neuronDesc = nil;
|
||||
dispatch_once(&onceToken, ^{
|
||||
neuronDesc =
|
||||
[MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeReLU
|
||||
a:0];
|
||||
[MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeReLU
|
||||
a:0];
|
||||
});
|
||||
return neuronDesc;
|
||||
}
|
||||
@ -108,7 +106,7 @@ API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0))
|
||||
static MPSNNNeuronDescriptor* neuronDesc = nil;
|
||||
dispatch_once(&onceToken, ^{
|
||||
neuronDesc = [MPSNNNeuronDescriptor
|
||||
cnnNeuronDescriptorWithType:MPSCNNNeuronTypeSigmoid];
|
||||
cnnNeuronDescriptorWithType:MPSCNNNeuronTypeSigmoid];
|
||||
});
|
||||
return neuronDesc;
|
||||
}
|
||||
@ -117,10 +115,9 @@ API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0))
|
||||
static dispatch_once_t onceToken;
|
||||
static MPSNNNeuronDescriptor* neuronDesc = nil;
|
||||
dispatch_once(&onceToken, ^{
|
||||
neuronDesc =
|
||||
[MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeTanH
|
||||
a:1.0
|
||||
b:1.0];
|
||||
neuronDesc = [MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeTanH
|
||||
a:1.0
|
||||
b:1.0];
|
||||
});
|
||||
return neuronDesc;
|
||||
}
|
||||
|
@ -437,6 +437,20 @@ static void nllnd_loss_forward_impl(Tensor& output,
|
||||
if (output.numel() == 0)
|
||||
return;
|
||||
|
||||
// https://github.com/pytorch/pytorch/blob/042f2f7746a064f1527d95d1f1d712b4f0b34186/aten/src/ATen/native/cuda/Loss.cu#L335-L346
if (target_arg.numel() == 0) {
// Here target (and input) have zero elements
// Mean reduction on empty tensors produces NaN. See the discussion in
// https://github.com/pytorch/pytorch/pull/64572#issuecomment-926504162
if (reduction == Reduction::Mean) {
output.fill_(std::numeric_limits<double>::quiet_NaN());
} else {
output.zero_();
}
total_weight.zero_();
return;
}
|
||||
|
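The empty-target branch mirrors the CUDA kernel linked above: a mean reduction over zero elements has no defined value, so the output is filled with NaN, while other reductions produce zeros. Roughly, at the Python level:

```python
import torch
import torch.nn.functional as F

log_probs = torch.zeros(0, 5)               # empty batch, 5 classes
target = torch.zeros(0, dtype=torch.long)

print(F.nll_loss(log_probs, target, reduction="mean"))  # nan (mean of nothing)
print(F.nll_loss(log_probs, target, reduction="sum"))   # 0.
```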
||||
struct CachedGraph : public MPSCachedGraph {
|
||||
CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
|
||||
MPSGraphTensor* inputTensor_ = nil;
|
||||
@ -537,7 +551,9 @@ static void nllnd_loss_forward_impl(Tensor& output,
|
||||
mpsGraphBatchSizeTensor = [mpsGraph reductionSumWithTensor:mpsSelectOneTensor
|
||||
axes:nil
|
||||
name:@"batchSizeReductionTensor"];
|
||||
mpsGraphReducedTensor = divisionNoNaN(mpsGraph, mpsGraphReducedTensor, mpsGraphBatchSizeTensor);
|
||||
mpsGraphReducedTensor = [mpsGraph divisionWithPrimaryTensor:mpsGraphReducedTensor
|
||||
secondaryTensor:mpsGraphBatchSizeTensor
|
||||
name:@"divisionTensor"];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -904,8 +904,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
|
||||
for (const auto idx : c10::irange(axis)) {
|
||||
stat_shape.push_back(input_shape[idx]);
|
||||
}
|
||||
for (const auto idx : c10::irange(axis, input.dim())) {
|
||||
(void)idx; // Suppress unused variable
|
||||
for (C10_UNUSED auto idx : c10::irange(axis, input.dim())) {
|
||||
stat_shape.push_back(1);
|
||||
}
|
||||
mean = mean.view(stat_shape);
|
||||
|
@ -8554,7 +8554,7 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: method, function
|
||||
dispatch:
|
||||
CPU, CUDA: __rshift__
|
||||
CPU, CUDA, MPS: __rshift__
|
||||
tags: pointwise
|
||||
|
||||
- func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
|
||||
@ -14706,6 +14706,11 @@
|
||||
CUDA: _fbgemm_dense_to_jagged_forward_symint
|
||||
CPU: _padded_dense_to_jagged_forward_cpu
|
||||
|
||||
- func: _nested_from_padded_tensor(Tensor padded, Tensor offsets, Tensor dummy, int ragged_idx=1, Tensor? min_seqlen=None, Tensor? max_seqlen=None, SymInt? sum_S=None) -> Tensor
|
||||
variants: function
|
||||
device_check: NoCheck
|
||||
dispatch: {}
|
||||
|
||||
- func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
|
||||
dispatch:
|
||||
NestedTensorCPU: NestedTensor_softmax_dropout
|
||||
|
@ -313,9 +313,9 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> deserialize_conv(
|
||||
output_padding.emplace_back(config_vals.at(idx));
|
||||
idx++;
|
||||
}
|
||||
int64_t groups = config_vals.at(idx);
|
||||
int64_t groups [[maybe_unused]] = config_vals.at(idx);
|
||||
idx++;
|
||||
int64_t flags = config_vals.at(idx);
|
||||
int64_t flags [[maybe_unused]] = config_vals.at(idx);
|
||||
idx++;
|
||||
TORCH_INTERNAL_ASSERT(idx == static_cast<int64_t>(config_vals.size()),
|
||||
"Unexpected length of config_vals, expected ",
|
||||
@ -323,7 +323,7 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> deserialize_conv(
|
||||
" got ",
|
||||
config_vals.size());
|
||||
|
||||
bool transpose = flags & (1 << 0);
|
||||
bool transpose [[maybe_unused]] = flags & (1 << 0);
|
||||
|
||||
int64_t other_flags = flags & ~(1 << 0);
|
||||
TORCH_INTERNAL_ASSERT(other_flags == 0, "Unexpected flags set in ", flags, ".");
|
||||
|
@ -251,8 +251,8 @@ TORCH_LIBRARY(_quantized, m) {
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_fbgemm_pack_gemm_matrix_fp16(Tensor W) -> Tensor"));
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_fbgemm_linear_fp16_weight(Tensor X, Tensor W, Tensor B, int out_channel) -> Tensor"));
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_quantized_linear(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor W, Tensor W_scale, Tensor W_zero_point, Tensor B, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor Y"));
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::_wrapped_linear_prepack(Tensor W, Tensor W_scale, Tensor W_zero_point, Tensor B) -> Tensor"));
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::_wrapped_quantized_linear_prepacked(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor W_prepack, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor Y"));
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::_wrapped_linear_prepack(Tensor W, Tensor W_scale, Tensor W_zero_point, Tensor B) -> Tensor"), {at::Tag::flexible_layout});
|
||||
m.def(TORCH_SELECTIVE_SCHEMA("_quantized::_wrapped_quantized_linear_prepacked(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor W_prepack, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor Y"), {at::Tag::flexible_layout});
|
||||
}
|
||||
|
||||
TORCH_LIBRARY(onednn, m) {
|
||||
|
@ -536,6 +536,24 @@ std::optional<Tensor> convert_boolean_attn_mask(const std::optional<Tensor>& att
|
||||
// Otherwise, attn_mask represents an additive attention tensor
|
||||
return attn_mask;
|
||||
}
|
||||
|
||||
// alternate version to workaround -inf issue with cuDNN
// TODO(eqy): delete this when cuDNN -inf issue is resolved
std::optional<Tensor> convert_boolean_attn_mask_cudnn(const std::optional<Tensor>& attn_mask, caffe2::TypeMeta dtype) {
// Pass through
if(!attn_mask.has_value()){
return std::nullopt;
}
// Convert boolean mask to additive mask; need to invert mask to indicate what
// to mask *out*.
if (attn_mask->dtype() == at::kBool) {
// TODO Use the max type of the input and output
return at::where(attn_mask->logical_not(), -65504.0, at::scalar_tensor(0.0, at::TensorOptions().dtype(dtype)));
}
// Otherwise, attn_mask represents an additive attention tensor
return attn_mask;
}
|
||||
|
||||
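convert_boolean_attn_mask_cudnn above swaps -inf for a large finite negative value so fully-masked rows cannot turn into NaNs inside the fused kernel. A standalone sketch of the same conversion (the helper itself is internal; -65504 is torch.finfo(torch.float16).min):

```python
import torch

def to_additive_mask(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    if mask.dtype != torch.bool:
        return mask  # already an additive bias, pass through
    # True means "attend", False means "mask out"; use a finite minimum
    # instead of -inf so softmax over an all-masked row stays finite.
    neg = torch.finfo(torch.float16).min  # -65504.0, matching the constant above
    return torch.where(mask, torch.zeros((), dtype=dtype), torch.full((), neg, dtype=dtype))

m = torch.tensor([[True, False], [False, False]])
print(to_additive_mask(m, torch.float16))
```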
// Memory Efficient Attention requires a padded attn mask bias
|
||||
// This function pads the attn_mask bias to be a multiple of 16
|
||||
// Then slices the padded bias to the original size
|
||||
@ -698,15 +716,16 @@ Tensor scaled_dot_product_attention(
|
||||
query_, key, value, attn_mask_, dropout_p, is_causal, scale, enable_gqa);
|
||||
}
|
||||
sdp::SDPBackend backend = static_cast<sdp::SDPBackend>(choice_int);
|
||||
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
|
||||
switch (backend) {
|
||||
case sdp::SDPBackend::cudnn_attention: {
|
||||
std::optional<Tensor> attn_mask = convert_boolean_attn_mask_cudnn(attn_mask_, query_.dtype());
|
||||
bool compute_logsumexp = should_compute_logsumexp(query_, key, value);
|
||||
auto out_lse_softmax = at::_scaled_dot_product_cudnn_attention(
|
||||
query_, key, value, attn_mask_, compute_logsumexp, dropout_p, is_causal, false /*return_debug_mask*/, scale);
|
||||
query_, key, value, attn_mask, compute_logsumexp, dropout_p, is_causal, false /*return_debug_mask*/, scale);
|
||||
return std::get<0>(out_lse_softmax);
|
||||
}
|
||||
case sdp::SDPBackend::flash_attention: {
|
||||
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
|
||||
if(query_.device().type() == DeviceType::CUDA){
|
||||
c10::SymInt og_size = query_.sym_size(-1);
|
||||
Tensor query_padded = pad_last_dim<8, false>(query_);
|
||||
@ -723,6 +742,7 @@ Tensor scaled_dot_product_attention(
|
||||
query_, key, value, dropout_p, is_causal, attn_mask, scale));
|
||||
}
|
||||
case sdp::SDPBackend::efficient_attention: {
|
||||
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
|
||||
bool compute_logsumexp = should_compute_logsumexp(query_, key, value);
|
||||
if (attn_mask.has_value()) {
|
||||
attn_mask.value() = preprocess_mask(attn_mask.value(), query_, key, value);;
|
||||
@ -732,11 +752,13 @@ Tensor scaled_dot_product_attention(
|
||||
return std::get<0>(out_and_lse);
|
||||
}
|
||||
case sdp::SDPBackend::overrideable: {
|
||||
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
|
||||
auto out_lse_softmax = at::_scaled_dot_product_fused_attention_overrideable(
|
||||
query_, key, value, attn_mask, dropout_p, is_causal, false /*return_debug_mask*/, scale);
|
||||
return std::get<0>(out_lse_softmax);
|
||||
}
|
||||
case sdp::SDPBackend::math:
|
||||
case sdp::SDPBackend::math: {
|
||||
std::optional<Tensor> attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype());
|
||||
if ((!GradMode::is_enabled() || (!query_.requires_grad() && !key.requires_grad() && !value.requires_grad()))
|
||||
&& query_.device().type() == DeviceType::MPS && dropout_p == 0.0
|
||||
&& query_.is_contiguous() && key.is_contiguous() && value.is_contiguous()
|
||||
@ -761,6 +783,7 @@ Tensor scaled_dot_product_attention(
|
||||
std::nullopt, /*dropout_mask*/
|
||||
scale,
|
||||
enable_gqa));
|
||||
}
|
||||
default:
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
|
@ -774,6 +774,18 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Ten
|
||||
TORCH_CHECK(
|
||||
max_seqlen_batch_k == max_seqlen_batch_v,
|
||||
"Key and Value must have the same sequence length");
|
||||
auto attn_bias_ = attn_bias;
|
||||
if (attn_bias_.has_value()) {
|
||||
const auto bias_dim = attn_bias_.value().dim();
|
||||
if (bias_dim == 2) {
|
||||
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
|
||||
} else if (bias_dim == 3) {
|
||||
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
|
||||
} else {
|
||||
attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
|
||||
TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
|
||||
}
|
||||
}
|
||||
|
||||
Tensor attention, log_sumexp;
|
||||
|
||||
@ -818,13 +830,14 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt, Tensor, Ten
|
||||
query/* Tensor q*/,
|
||||
key/* Tensor k*/,
|
||||
value/* Tensor v*/,
|
||||
attn_bias_ /* std::optional<Tensor> */,
|
||||
log_sumexp/*Tensor softmaxstats*/,
|
||||
attention/*Tensor o*/,
|
||||
cudnn_seed/*Tensor dropoutseed*/,
|
||||
cudnn_offset/*Tensor dropoutoffset*/);
|
||||
|
||||
// TODO(eqy): support debug_attn_mask
|
||||
return std::make_tuple(attention, log_sumexp, Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_k, cudnn_seed, cudnn_offset, Tensor());
|
||||
return std::make_tuple(std::move(attention), std::move(log_sumexp), Tensor(), Tensor(), max_seqlen_batch_q, max_seqlen_batch_k, std::move(cudnn_seed), std::move(cudnn_offset), Tensor());
|
||||
}
|
||||
|
||||
std::tuple<Tensor, Tensor, Tensor, Tensor> _scaled_dot_product_efficient_attention_cuda(
|
||||
@ -1102,10 +1115,13 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
|
||||
offset_t = at::empty({}, at::dtype(at::kLong).device(device));
|
||||
} else {
|
||||
auto [seed, offset] = at::cuda::philox::unpack(philox_state);
|
||||
seed_t = at::scalar_tensor(
|
||||
at::Scalar(static_cast<int64_t>(seed)), at::dtype(at::kLong));
|
||||
offset_t = at::scalar_tensor(
|
||||
at::Scalar(static_cast<int64_t>(offset)), at::dtype(at::kLong));
|
||||
#ifdef USE_ROCM
|
||||
const auto options = at::dtype(at::kLong).device(at::kCUDA);
|
||||
#else
|
||||
const auto options = at::dtype(at::kLong);
|
||||
#endif
|
||||
seed_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(seed)), options);
|
||||
offset_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(offset)), options);
|
||||
}
|
||||
} else {
|
||||
// Not using dropout
|
||||
@ -1118,7 +1134,8 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
|
||||
auto ret = aotriton::v2::flash::check_gpu(stream);
|
||||
if (hipSuccess != ret) {
|
||||
TORCH_CHECK(false,
|
||||
"[AOTriton] Accelerated SDPA only supports MI200/MI300X GPUs (gfx90a:sramecc+:xnack- or gfx94a:sramecc+:xnack-)")
|
||||
"[AOTriton] Accelerated SDPA only supports MI200/MI300X/Navi31 GPUs"
|
||||
" (gfx90a:sramecc+:xnack-/gfx942:sramecc+:xnack-/gfx1100)")
|
||||
}
|
||||
|
||||
// AOTriton may accept aligned on logsumexp tensor in the future for better
|
||||
@ -1147,8 +1164,16 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
|
||||
|
||||
using aotriton::v2::flash::attn_fwd;
|
||||
using sdp::aotriton_adapter::mk_aotensor;
|
||||
using sdp::aotriton_adapter::mk_aoscalartensor;
|
||||
using sdp::aotriton_adapter::mk_philoxtensor;
|
||||
aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, aotriton::DType::kFloat16);
|
||||
at::Tensor softmax_fa_t = at::empty({ 0, 0, 0, 0 }, query.options());
|
||||
const bool use_philox_state = in_capture_stream;
|
||||
auto seed = use_philox_state ? mk_philoxtensor(philox_state.seed_.ptr) : mk_aoscalartensor(seed_t);
|
||||
auto offset1 = use_philox_state ? mk_philoxtensor(philox_state.offset_.ptr) : mk_aoscalartensor(offset_t);
|
||||
auto offset2 = use_philox_state ? philox_state.offset_intragraph_ : 0;
|
||||
auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr<int64_t>()) : mk_philoxtensor(nullptr);
|
||||
auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr<int64_t>()) : mk_philoxtensor(nullptr);
|
||||
hipError_t err; // TODO: Error handling
|
||||
err = attn_fwd(mk_aotensor(q_t, "q"),
|
||||
mk_aotensor(k_t, "k"),
|
||||
@ -1158,8 +1183,11 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
|
||||
mk_aotensor<2>(softmax_lse, "M"),
|
||||
mk_aotensor(output_t, "Out"),
|
||||
dropout_p,
|
||||
use_dropout ? *seed_t.data_ptr<int64_t>() : 0,
|
||||
use_dropout ? *offset_t.data_ptr<int64_t>() : 0,
|
||||
seed,
|
||||
offset1,
|
||||
offset2,
|
||||
seed_output,
|
||||
offset_output,
|
||||
mk_aotensor(softmax_fa_t, "encoded_softmax"),
|
||||
is_causal,
|
||||
stream);
|
||||
|
@ -195,6 +195,27 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_backward_
|
||||
const int64_t num_heads = query.size(1);
|
||||
const int64_t head_dim_qk = query.size(3);
|
||||
const int64_t head_dim_v = value.size(3);
|
||||
const int64_t max_seqlen_batch_q = query.size(2);
|
||||
const int64_t max_seqlen_batch_k = key.size(2);
|
||||
|
||||
// This is needed because SaveVariable automatically converts
|
||||
// std::optional to undefined tensor
|
||||
std::optional<Tensor> attn_bias_;
|
||||
if (attn_bias.defined()) {
|
||||
attn_bias_ = attn_bias;
|
||||
}
|
||||
if (attn_bias_.has_value()) {
|
||||
const auto bias_dim = attn_bias_.value().dim();
|
||||
if (bias_dim == 2) {
|
||||
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
|
||||
} else if (bias_dim == 3) {
|
||||
attn_bias_ = attn_bias_.value().expand({batch_size, 1, max_seqlen_batch_q, max_seqlen_batch_k});
|
||||
} else {
|
||||
attn_bias_ = attn_bias_.value().expand({batch_size, attn_bias_.value().size(1), max_seqlen_batch_q, max_seqlen_batch_k});
|
||||
TORCH_CHECK(bias_dim == 4, "cuDNN SDPA expects either a 2D, 3D, or 4D attn_bias but got ", attn_bias_.value().dim(), "D");
|
||||
}
|
||||
}
|
||||
|
||||
const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked();
|
||||
auto dq = at::empty_like(query);
|
||||
auto dk = at::empty_like(key);
|
||||
@ -211,6 +232,7 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_backward_
|
||||
query /*const Tensor& q*/,
|
||||
key /*const Tensor& k*/,
|
||||
value /*const Tensor& v*/,
|
||||
attn_bias_ /*const std::optional<Tensor>& attn_bias*/,
|
||||
out /*const Tensor& o*/,
|
||||
grad_out/*const Tensor& dO*/,
|
||||
logsumexp.unsqueeze(-1)/*const Tensor& softmaxstats*/,
|
||||
@ -219,7 +241,7 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_cudnn_attention_backward_
|
||||
dv/*Tensor& dV*/,
|
||||
philox_seed/*Tensor& dropoutseed*/,
|
||||
philox_offset/*Tensor& dropoutoffset*/);
|
||||
return std::make_tuple(dq, dk, dv);
|
||||
return std::make_tuple(std::move(dq), std::move(dk), std::move(dv));
|
||||
}
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
|
||||
@ -394,7 +416,8 @@ _efficient_attention_backward(
|
||||
auto ret = aotriton::v2::flash::check_gpu(stream);
|
||||
if (hipSuccess != ret) {
|
||||
TORCH_CHECK(false,
|
||||
"[AOTriton] Accelerated SDPA only supports MI200/MI300X GPUs (gfx90a:sramecc+:xnack- or gfx942:sramecc+:xnack-)")
|
||||
"[AOTriton] Accelerated SDPA only supports MI200/MI300X/Navi31 GPUs"
|
||||
" (gfx90a:sramecc+:xnack-/gfx942:sramecc+:xnack-/gfx1100)")
|
||||
}
|
||||
const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked();
|
||||
bool is_causal;
|
||||
@ -419,6 +442,7 @@ _efficient_attention_backward(
|
||||
hipError_t err;
|
||||
using aotriton::v2::flash::attn_bwd;
|
||||
using sdp::aotriton_adapter::mk_aotensor;
|
||||
using sdp::aotriton_adapter::mk_aoscalartensor;
|
||||
using sdp::aotriton_adapter::cast_dtype;
|
||||
aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, cast_dtype(query.dtype()));
|
||||
err = attn_bwd(mk_aotensor(q_t, "q"),
|
||||
@ -435,8 +459,9 @@ _efficient_attention_backward(
|
||||
mk_aotensor<2>(softmax_lse, "L"),
|
||||
mk_aotensor<2>(delta, "delta"),
|
||||
float(dropout_p),
|
||||
rng_engine_inputs.seed_.val,
|
||||
rng_engine_inputs.offset_.val,
|
||||
mk_aoscalartensor(philox_seed),
|
||||
mk_aoscalartensor(philox_offset),
|
||||
0,
|
||||
is_causal,
|
||||
stream);
|
||||
#else
|
||||
|
@ -210,6 +210,7 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
|
||||
// Check that the gpu is capable of running flash attention
|
||||
using sm80 = SMVersion<8, 0>;
|
||||
using sm90 = SMVersion<9, 0>;
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
#if USE_ROCM
|
||||
#if USE_AOTRITON
|
||||
auto stream = at::cuda::getCurrentCUDAStream().stream();
|
||||
@ -221,11 +222,19 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
|
||||
}
|
||||
return false;
|
||||
}
|
||||
c10::string_view arch(dprops->gcnArchName);
|
||||
if (arch == "gfx1100") {
|
||||
static const bool enable_navi3x = c10::utils::check_env("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL") == true;
|
||||
if (!enable_navi3x) {
|
||||
TORCH_WARN_ONCE("Flash attention support on Navi31 GPU is still experimental."
|
||||
" Enable it with TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1.");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
#else
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
if (!check_sm_version<sm80, sm90>(dprops)) {
|
||||
if (debug) {
|
||||
TORCH_WARN(
|
||||
@ -245,6 +254,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug)
|
||||
// Mem Efficient attention supports hardware in the range [sm_50, sm_90]
|
||||
using sm50 = SMVersion<5, 0>;
|
||||
using sm90 = SMVersion<9, 0>;
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
#if USE_ROCM
|
||||
#if USE_AOTRITON
|
||||
auto stream = at::cuda::getCurrentCUDAStream().stream();
|
||||
@ -256,11 +266,19 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug)
|
||||
}
|
||||
return false;
|
||||
}
|
||||
c10::string_view arch(dprops->gcnArchName);
|
||||
if (arch == "gfx1100") {
|
||||
static const bool enable_navi3x = c10::utils::check_env("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL") == true;
|
||||
if (!enable_navi3x) {
|
||||
TORCH_WARN_ONCE("Memory Efficient attention on Navi31 GPU is still experimental."
|
||||
" Enable it with TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1.");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
#else
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
if (!check_sm_version<sm50, sm90>(dprops)) {
|
||||
if (debug) {
|
||||
TORCH_WARN(
|
||||
@ -561,7 +579,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
|
||||
check_cudnn_deterministic,
|
||||
// check_is_causal,
|
||||
check_dtypes_low_precision,
|
||||
check_for_attn_mask_cudnn,
|
||||
check_attn_mask_shape,
|
||||
check_cudnn_hardware_support
|
||||
);
|
||||
for (auto& constraint : general_constraints) {
|
||||
@ -616,9 +634,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) {
|
||||
}
|
||||
}
|
||||
}
|
||||
#if USE_ROCM
|
||||
constexpr bool backend_supports_grouped_query_attention = false;
|
||||
#else
|
||||
constexpr bool backend_supports_grouped_query_attention = true;
|
||||
#endif
|
||||
if (has_only_dense_inputs(params)) {
|
||||
constexpr auto dense_constraints = array_of<bool (*)(sdp_params const&, bool)>(
|
||||
check_batch_size_and_num_heads_dense<true /*supports_grouped_query_attention=*/>,
|
||||
check_batch_size_and_num_heads_dense<backend_supports_grouped_query_attention>,
|
||||
check_nonzero_sequence_lengths_dense,
|
||||
check_last_dim_stride_equals_1_dense<true /*ignore_singleton_dim=*/>);
|
||||
for (auto& constraint : dense_constraints) {
|
||||
|
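On ROCm, the two hardware checks above gate gfx1100 (Navi31) behind an opt-in flag. A sketch of turning it on before the first SDPA dispatch; the variable name comes straight from the check, everything else is illustrative:

```python
import os

# Must be set before the (cached) hardware check runs, i.e. before the first
# scaled_dot_product_attention call in the process.
os.environ["TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL"] = "1"

import torch
import torch.nn.functional as F

q = k = v = torch.randn(1, 4, 64, 64, device="cuda", dtype=torch.float16)
out = F.scaled_dot_product_attention(q, k, v)  # may now select flash/mem-efficient on gfx1100
```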
@ -115,6 +115,18 @@ aotriton::TensorView<Rank> mk_aotensor(const at::Tensor& q, c10::string_view ten
|
||||
cast_dtype(q.dtype()));
|
||||
}
|
||||
|
||||
inline aotriton::TensorView<0> mk_aoscalartensor(const at::Tensor& q)
{
return aotriton::TensorView<0>(reinterpret_cast<intptr_t>(q.data_ptr()),
cast_dtype(q.dtype()));
}

inline aotriton::TensorView<0> mk_philoxtensor(const int64_t* ptr)
{
return aotriton::TensorView<0>(reinterpret_cast<intptr_t>(ptr),
aotriton::DType::kUInt64); // AOTriton expects unsigned int64
}
|
||||
|
||||
} // namespace aotriton_adapter
|
||||
|
||||
} // namespace sdp
|
||||
|
@ -72,7 +72,8 @@ void check_gpu_arch(hipStream_t stream) {
|
||||
auto ret = aotriton::v2::flash::check_gpu(stream);
|
||||
if (hipSuccess != ret) {
|
||||
TORCH_CHECK(false,
|
||||
"FlashAttention only supports MI200/MI300X GPUs (gfx90a:sramecc+:xnack- or gfx942:sramecc+:xnack-)")
|
||||
"[AOTriton] Accelerated SDPA only supports MI200/MI300X/Navi31 GPUs"
|
||||
" (gfx90a:sramecc+:xnack-/gfx942:sramecc+:xnack-/gfx1100)")
|
||||
}
|
||||
}
|
||||
|
||||
@ -164,6 +165,8 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
|
||||
auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(std::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
|
||||
at::Tensor seed_t, offset_t;
|
||||
|
||||
at::PhiloxCudaState philox_state;
|
||||
bool use_philox_state = false;
|
||||
if (p_dropout > 0.0) {
|
||||
// number of times random will be generated per thread, to offset philox counter in thc random
|
||||
// state
|
||||
@ -171,12 +174,14 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
|
||||
int64_t counter_offset = batch_size * num_heads * 32;
|
||||
// See Note [Acquire lock when using random generators]
|
||||
std::lock_guard<std::mutex> lock(gen->mutex_);
|
||||
at::PhiloxCudaState philox_state = gen->philox_cuda_state(counter_offset);
|
||||
philox_state = gen->philox_cuda_state(counter_offset);
|
||||
if (at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None) {
|
||||
auto [seed, offset] = at::cuda::philox::unpack(philox_state);
|
||||
seed_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(seed)), at::dtype(at::kLong));
|
||||
offset_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(offset)), at::dtype(at::kLong));
|
||||
seed_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(seed)), at::dtype(at::kLong).device(at::kCUDA));
|
||||
offset_t = at::scalar_tensor(at::Scalar(static_cast<int64_t>(offset)), at::dtype(at::kLong).device(at::kCUDA));
|
||||
} else {
|
||||
// See Note [CUDA Graph-safe RNG states] about the design
|
||||
use_philox_state = true;
|
||||
seed_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
|
||||
offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
|
||||
}
|
||||
@ -185,19 +190,8 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
|
||||
seed_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
|
||||
offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
|
||||
} else {
|
||||
seed_t = at::empty({}, at::dtype(at::kLong));
|
||||
offset_t = at::empty({}, at::dtype(at::kLong));
|
||||
}
|
||||
}
|
||||
|
||||
at::PhiloxCudaState philox_args;
|
||||
if (p_dropout > 0.0) {
|
||||
if (at::cuda::currentStreamCaptureStatus() ==
|
||||
at::cuda::CaptureStatus::None)
|
||||
{
|
||||
philox_args = at::PhiloxCudaState(*seed_t.data_ptr<int64_t>(), *offset_t.data_ptr<int64_t>());
|
||||
} else { // dropout + capture
|
||||
philox_args = at::PhiloxCudaState(seed_t.data_ptr<int64_t>(), offset_t.data_ptr<int64_t>(), 0);
|
||||
seed_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
|
||||
offset_t = at::empty({}, at::dtype(at::kLong).device(at::kCUDA));
|
||||
}
|
||||
}
|
||||
|
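The seed/offset rework above keeps the Philox state in device tensors when a stream capture is active, which is what makes dropout inside attention safe under CUDA graphs. A generic capture sketch using only documented public APIs (not tied to AOTriton):

```python
import torch

x = torch.randn(8, 8, device="cuda")
drop = torch.nn.Dropout(p=0.5)

# Warm up on a side stream before capture, as recommended for CUDA graphs.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        _ = drop(x)
torch.cuda.current_stream().wait_stream(s)

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    y = drop(x)

g.replay()  # each replay advances the captured RNG offset instead of reusing it
```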
||||
@ -219,9 +213,17 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
|
||||
|
||||
hipError_t err; // TODO: Error handling
|
||||
using aotriton::v2::flash::attn_fwd;
|
||||
using aotriton::TensorView;
|
||||
using sdp::aotriton_adapter::mk_aotensor;
|
||||
using sdp::aotriton_adapter::mk_aoscalartensor;
|
||||
using sdp::aotriton_adapter::mk_philoxtensor;
|
||||
using sdp::aotriton_adapter::cast_dtype;
|
||||
aotriton::TensorView<4> empty_bias(0, {0,0,0,0}, {0,0,0,0}, cast_dtype(q.dtype()));
|
||||
auto seed = use_philox_state ? mk_philoxtensor(philox_state.seed_.ptr) : mk_aoscalartensor(seed_t);
|
||||
auto offset1 = use_philox_state ? mk_philoxtensor(philox_state.offset_.ptr) : mk_aoscalartensor(offset_t);
|
||||
auto offset2 = use_philox_state ? philox_state.offset_intragraph_ : 0;
|
||||
auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr<int64_t>()) : mk_philoxtensor(nullptr);
|
||||
auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr<int64_t>()) : mk_philoxtensor(nullptr);
|
||||
err = attn_fwd(mk_aotensor(q_t, "q"),
|
||||
mk_aotensor(k_t, "k"),
|
||||
mk_aotensor(v_t, "v"),
|
||||
@ -230,8 +232,11 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
|
||||
mk_aotensor<2>(M, "M"),
|
||||
mk_aotensor(output_t, "Out"),
|
||||
p_dropout,
|
||||
philox_args.seed_.val,
|
||||
philox_args.offset_.val,
|
||||
seed,
|
||||
offset1,
|
||||
offset2,
|
||||
seed_output,
|
||||
offset_output,
|
||||
mk_aotensor(softmax_fa_t, "encoded_softmax"),
|
||||
is_causal,
|
||||
stream);
|
||||
@ -392,17 +397,6 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
|
||||
dv_expanded = dv;
|
||||
}
|
||||
|
||||
at::PhiloxCudaState philox_args;
|
||||
if (p_dropout > 0.0) {
|
||||
if (at::cuda::currentStreamCaptureStatus() ==
|
||||
at::cuda::CaptureStatus::None)
|
||||
{
|
||||
philox_args = at::PhiloxCudaState(*philox_seed.data_ptr<int64_t>(), *philox_offset.data_ptr<int64_t>());
|
||||
} else { // dropout + capture
|
||||
philox_args = at::PhiloxCudaState(philox_seed.data_ptr<int64_t>(), philox_offset.data_ptr<int64_t>(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
at::Tensor q_t = q.permute({0,2,1,3});
|
||||
at::Tensor k_t = k.permute({0,2,1,3});
|
||||
at::Tensor v_t = v.permute({0,2,1,3});
|
||||
@ -420,6 +414,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
|
||||
{
|
||||
using aotriton::v2::flash::attn_bwd;
|
||||
using sdp::aotriton_adapter::mk_aotensor;
|
||||
using sdp::aotriton_adapter::mk_aoscalartensor;
|
||||
using sdp::aotriton_adapter::cast_dtype;
|
||||
aotriton::TensorView<4> empty_bias(0, {0,0,0,0}, {0,0,0,0}, cast_dtype(q.dtype()));
|
||||
err = attn_bwd(mk_aotensor(q_t, "q"),
|
||||
@ -436,8 +431,9 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
|
||||
mk_aotensor<2>(softmax_lse_cont, "L"),
|
||||
mk_aotensor<2>(delta, "delta"),
|
||||
p_dropout,
|
||||
philox_args.seed_.val,
|
||||
philox_args.offset_.val,
|
||||
mk_aoscalartensor(philox_seed),
|
||||
mk_aoscalartensor(philox_offset),
|
||||
0,
|
||||
is_causal,
|
||||
stream);
|
||||
}
|
||||
|
@ -275,17 +275,6 @@ inline bool check_for_attn_mask(sdp_params const& params, bool debug) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// TODO(eqy): remove this once support is added
|
||||
inline bool check_for_attn_mask_cudnn(sdp_params const& params, bool debug) {
|
||||
if (params.attn_mask.has_value()) {
|
||||
if (debug) {
|
||||
TORCH_WARN("cuDNN Attention does not support non-null attn_mask.");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool check_attn_mask_shape(sdp_params const& params, bool debug) {
|
||||
auto attn_mask = params.attn_mask;
|
||||
if (!attn_mask.has_value()) {
|
||||
|
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,25
|
||||
hf_Reformer,pass,23
|
||||
|
||||
|
||||
|
||||
|
|
@ -130,6 +130,10 @@ hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,0
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
@ -142,6 +146,10 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
@ -170,6 +178,10 @@ maml_omniglot,pass,0
|
||||
|
||||
|
||||
|
||||
mnasnet1_0,pass,0
|
||||
|
||||
|
||||
|
||||
mobilenet_v2,pass,0
|
||||
|
||||
|
||||
@ -242,6 +254,10 @@ resnext50_32x4d,pass,0
|
||||
|
||||
|
||||
|
||||
shufflenet_v2_x1_0,pass,0
|
||||
|
||||
|
||||
|
||||
soft_actor_critic,fail_to_run,0
|
||||
|
||||
|
||||
|
|
@ -126,6 +126,10 @@ MobileBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PLBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
|
@ -130,6 +130,10 @@ hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,0
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
@ -142,6 +146,10 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
@ -170,6 +178,10 @@ maml_omniglot,pass,0
|
||||
|
||||
|
||||
|
||||
mnasnet1_0,pass,0
|
||||
|
||||
|
||||
|
||||
mobilenet_v2,pass,0
|
||||
|
||||
|
||||
@ -242,6 +254,10 @@ resnext50_32x4d,pass,0
|
||||
|
||||
|
||||
|
||||
shufflenet_v2_x1_0,pass,0
|
||||
|
||||
|
||||
|
||||
soft_actor_critic,fail_to_run,0
|
||||
|
||||
|
||||
|
|
@ -130,6 +130,10 @@ MobileBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PLBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
|
@ -138,6 +138,10 @@ hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,19
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
@ -150,10 +154,18 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Longformer,pass,4
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,5
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
@ -178,6 +190,10 @@ maml,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
mnasnet1_0,pass,0
|
||||
|
||||
|
||||
|
||||
maml_omniglot,pass,0
|
||||
|
||||
|
||||
@ -258,6 +274,10 @@ resnext50_32x4d,pass,0
|
||||
|
||||
|
||||
|
||||
shufflenet_v2_x1_0,pass,0
|
||||
|
||||
|
||||
|
||||
soft_actor_critic,pass,0
|
||||
|
||||
|
||||
|
|
@ -130,6 +130,10 @@ MobileBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PLBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
|
@ -138,6 +138,10 @@ hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,19
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
@ -150,10 +154,18 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Longformer,pass,4
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,5
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
@ -182,6 +194,10 @@ maml_omniglot,pass,0
|
||||
|
||||
|
||||
|
||||
mnasnet1_0,pass,0
|
||||
|
||||
|
||||
|
||||
mobilenet_v2,pass,0
|
||||
|
||||
|
||||
@ -258,6 +274,10 @@ resnext50_32x4d,pass,0
|
||||
|
||||
|
||||
|
||||
shufflenet_v2_x1_0,pass,0
|
||||
|
||||
|
||||
|
||||
soft_actor_critic,pass,0
|
||||
|
||||
|
||||
|
|
@ -130,6 +130,10 @@ MobileBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PLBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
|
@ -138,6 +138,10 @@ hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,13
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
@ -150,10 +154,18 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Longformer,pass,4
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,5
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
|
|
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,25
|
||||
hf_Reformer,pass,23
|
||||
|
||||
|
||||
|
||||
|
|
@ -114,6 +114,10 @@ hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,0
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
@ -126,6 +130,10 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
@ -158,6 +166,10 @@ mobilenet_v2,pass,0
|
||||
|
||||
|
||||
|
||||
mnasnet1_0,pass,0
|
||||
|
||||
|
||||
|
||||
mobilenet_v2_quantized_qat,fail_to_run,0
|
||||
|
||||
|
||||
@ -226,6 +238,10 @@ resnext50_32x4d,pass,0
|
||||
|
||||
|
||||
|
||||
shufflenet_v2_x1_0,pass,0
|
||||
|
||||
|
||||
|
||||
soft_actor_critic,fail_to_run,0
|
||||
|
||||
|
||||
|
|
@ -114,6 +114,10 @@ hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,0
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
@ -126,6 +130,10 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
@ -154,6 +162,10 @@ maml_omniglot,pass,0
|
||||
|
||||
|
||||
|
||||
mnasnet1_0,pass,0
|
||||
|
||||
|
||||
|
||||
mobilenet_v2,pass,0
|
||||
|
||||
|
||||
@ -226,6 +238,10 @@ resnext50_32x4d,pass,0
|
||||
|
||||
|
||||
|
||||
shufflenet_v2_x1_0,pass,0
|
||||
|
||||
|
||||
|
||||
soft_actor_critic,fail_to_run,0
|
||||
|
||||
|
||||
|
|
@ -130,6 +130,10 @@ MobileBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
OPTForCausalLM,pass,0
|
||||
|
||||
|
||||
|
||||
PLBartForCausalLM,pass,0
|
||||
|
||||
|
||||
|
|
@ -122,6 +122,10 @@ hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,pass,13
|
||||
|
||||
|
||||
|
||||
hf_DistilBert,pass,0
|
||||
|
||||
|
||||
@ -134,10 +138,18 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Longformer,pass,4
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,5
|
||||
|
||||
|
||||
|
||||
hf_T5,pass,0
|
||||
|
||||
|
||||
|
||||
hf_T5_base,pass,0
|
||||
|
||||
|
||||
|
|
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,25
|
||||
hf_Reformer,pass,23
|
||||
|
||||
|
||||
|
||||
|
|
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,25
|
||||
hf_Reformer,pass,23
|
||||
|
||||
|
||||
|
||||
|
|
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Reformer,pass,25
|
||||
hf_Reformer,pass,23
|
||||
|
||||
|
||||
|
||||
|
|
@ -1338,6 +1338,16 @@ def try_script(model, example_inputs):
|
||||
return None
|
||||
|
||||
|
||||
def _produce_dynamic_shapes_for_export(path, x):
    # mark_dynamic() is ignored for export.
    # use this to produce dynamic_shapes spec instead.
    from torch.export.dynamic_shapes import Dim

    if not isinstance(x, torch.Tensor):
        return None
    return {i: Dim.AUTO for i in getattr(x, "_dynamo_dynamic_indices", {})}
|
||||
|
||||
|
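Since export ignores mark_dynamic, the helper above rebuilds an equivalent dynamic_shapes spec out of Dim.AUTO entries. A minimal sketch of the spec it produces for a single tensor argument (assumes a build where Dim.AUTO is available):

```python
import torch
from torch.export import Dim, export

class MulTwo(torch.nn.Module):
    def forward(self, x):
        return x * 2

x = torch.randn(4, 8)
# Equivalent of the helper's output when dim 0 of x had been marked dynamic:
# let export pick a symbolic size for that dimension automatically.
ep = export(MulTwo(), (x,), dynamic_shapes=({0: Dim.AUTO},))
print(ep)
```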
||||
class AOTInductorModelCache:
|
||||
cache = {}
|
||||
|
||||
@ -1345,6 +1355,7 @@ class AOTInductorModelCache:
|
||||
def load(cls, model, example_inputs, device):
|
||||
import torch._inductor
|
||||
import torch.export._trace
|
||||
from torch.export.dynamic_shapes import _tree_map_with_path
|
||||
|
||||
key = weakref.ref(model)
|
||||
if key not in cls.cache:
|
||||
@ -1364,10 +1375,16 @@ class AOTInductorModelCache:
|
||||
else:
|
||||
_register_dataclass_output_as_pytree(example_outputs)
|
||||
|
||||
combined_args = tuple(example_args) + tuple(example_kwargs.values())
|
||||
dynamic_shapes = _tree_map_with_path(
|
||||
_produce_dynamic_shapes_for_export, combined_args
|
||||
)
|
||||
|
||||
gm = torch.export._trace._export(
|
||||
model,
|
||||
example_args,
|
||||
example_kwargs,
|
||||
dynamic_shapes=dynamic_shapes,
|
||||
pre_dispatch=True,
|
||||
strict=False,
|
||||
).module()
|
||||
@ -1382,11 +1399,20 @@ class AOTInductorModelCache:
|
||||
|
||||
|
||||
def export(model, example_inputs):
|
||||
from torch.export.dynamic_shapes import _tree_map_with_path
|
||||
|
||||
example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
|
||||
example_outputs = model(*example_args, **example_kwargs)
|
||||
_register_dataclass_output_as_pytree(example_outputs)
|
||||
|
||||
ep = torch.export.export(model, example_args, example_kwargs)
|
||||
combined_args = tuple(example_args) + tuple(example_kwargs.values())
|
||||
dynamic_shapes = _tree_map_with_path(
|
||||
_produce_dynamic_shapes_for_export, combined_args
|
||||
)
|
||||
|
||||
ep = torch.export.export(
|
||||
model, example_args, example_kwargs, dynamic_shapes=dynamic_shapes
|
||||
)
|
||||
|
||||
def opt_export(_, example_inputs):
|
||||
example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
|
||||
@ -2370,7 +2396,11 @@ class BenchmarkRunner:
|
||||
return set()
|
||||
|
||||
@property
|
||||
def skip_models_for_freezing(self):
|
||||
def skip_models_for_freezing_cpu(self):
|
||||
return set()
|
||||
|
||||
@property
|
||||
def skip_models_for_freezing_cuda(self):
|
||||
return set()
|
||||
|
||||
@property
|
||||
@ -4275,7 +4305,6 @@ def run(runner, args, original_dir=None):
|
||||
runner.skip_models.update(runner.slow_models)
|
||||
|
||||
if args.devices == ["cpu"]:
|
||||
runner.skip_models.update(runner.very_slow_models)
|
||||
runner.skip_models.update(runner.skip_models_for_cpu)
|
||||
elif args.devices == ["cuda"]:
|
||||
runner.skip_models.update(runner.skip_models_for_cuda)
|
||||
@ -4284,7 +4313,10 @@ def run(runner, args, original_dir=None):
|
||||
runner.skip_models.update(runner.skip_multiprocess_models)
|
||||
|
||||
if args.freezing:
|
||||
runner.skip_models.update(runner.skip_models_for_freezing)
|
||||
if args.devices == ["cpu"]:
|
||||
runner.skip_models.update(runner.skip_models_for_freezing_cpu)
|
||||
elif args.devices == ["cuda"]:
|
||||
runner.skip_models.update(runner.skip_models_for_freezing_cuda)
|
||||
|
||||
if args.no_skip:
|
||||
runner.skip_models.clear()
|
||||
|
@ -505,7 +505,7 @@ class HuggingfaceRunner(BenchmarkRunner):
|
||||
return 4e-3, cosine
|
||||
if (
|
||||
current_device == "cpu"
|
||||
and name in self._config["tolerance"]["higher_inference"]
|
||||
and name in self._config["tolerance"]["higher_inference_cpu"]
|
||||
):
|
||||
return 4e-3, cosine
|
||||
return 1e-3, cosine
|
||||
|
@ -11,9 +11,7 @@ skip:
|
||||
- GPTJForQuestionAnswering
|
||||
|
||||
device:
|
||||
cpu:
|
||||
# OOMs
|
||||
- OPTForCausalLM
|
||||
cpu: []
|
||||
|
||||
control_flow:
|
||||
- AllenaiLongformerBase
|
||||
@ -71,6 +69,7 @@ batch_size:
|
||||
TrOCRForCausalLM: 2
|
||||
XGLMForCausalLM: 4
|
||||
XLNetLMHeadModel: 2
|
||||
YituTechConvBert: 2
|
||||
|
||||
|
||||
tolerance:
|
||||
|
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3

+from contextlib import nullcontext
+
 import click
 import numpy as np
 from operator_inp_utils import OperatorInputsLoader
@@ -16,11 +18,13 @@ from torch.utils._pytree import tree_map_only


 aten = torch.ops.aten
+profile_enabled = False


 def compute_speedups(
     operator, models, example_inputs, repeats, accuracy_checking=False, device="cuda"
 ):
+    global profile_enabled
     expected = models[0](*example_inputs)
     if accuracy_checking:
         for model in models[1:]:
@@ -35,20 +39,32 @@ def compute_speedups(

     timings = np.zeros((repeats, len(models)), np.float64)
     for rep in range(repeats):
-        # interleave the runs to handle frequency scaling and load changes
-        for m, model in enumerate(models):
-            if device == "cuda":
-                model(*example_inputs)
-
-                # benchmarker.benchmark_gpu() clears L2 cache to hide the latency of CPU launch time
-                # along with cuda synchronization
-                timings[rep, m] = benchmarker.benchmark_gpu(
-                    lambda: model(*example_inputs)
-                )
-            else:
-                from torch._inductor.utils import timed
-
-                timings[rep, m] = timed(model, example_inputs)
+        record_rep_context = (
+            torch.profiler.record_function(f"rep_{rep}")
+            if profile_enabled
+            else nullcontext()
+        )
+        with record_rep_context:
+            # interleave the runs to handle frequency scaling and load changes
+            for m, model in enumerate(models):
+                record_model_context = (
+                    torch.profiler.record_function(f"model_{m}")
+                    if profile_enabled
+                    else nullcontext()
+                )
+                with record_model_context:
+                    if device == "cuda":
+                        model(*example_inputs)
+
+                        # benchmarker.benchmark_gpu() clears L2 cache to hide the latency of CPU launch time
+                        # along with cuda synchronization
+                        timings[rep, m] = benchmarker.benchmark_gpu(
+                            lambda: model(*example_inputs)
+                        )
+                    else:
+                        from torch._inductor.utils import timed
+
+                        timings[rep, m] = timed(model, example_inputs)
     return np.median(timings, axis=0)


@@ -171,6 +187,7 @@ def skip_operator(operator):
 @click.option(
     "--channels-last", help="force inputs to channels last", is_flag=True, default=False
 )
+@click.option("--profile", help="profile the benchmark", is_flag=True, default=False)
 def benchmark(
     suite,
     op,
@@ -183,7 +200,9 @@ def benchmark(
     inp_file,
     start_idx,
     channels_last,
+    profile,
 ):
+    global profile_enabled
     if inp_file is not None:
         loader = OperatorInputsLoader(inp_file)
     else:
@@ -209,6 +228,8 @@ def benchmark(
         ops = [eval(op)]

     max_samples = max_samples + start_idx
+    profile_enabled = profile
+
     for operator in ops:
         if skip_operator(operator):
             continue
@@ -216,10 +237,31 @@ def benchmark(
         print(f"Running {operator}")
         inp_gen = loader.get_inputs_for_operator(operator, dtype=dtype, device=device)
         timings = []

-        for i in range(min(max_samples, 1000000)):
+        inputs_list = []
+        for _ in range(min(max_samples, 1000000)):
             try:
                 inps = next(inp_gen)
+                inputs_list.append(inps)
+            except StopIteration:
+                break
+
+        profiler_context = (
+            torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+                ],
+                record_shapes=False,
+                profile_memory=False,
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                    f"./log/operator_{operator}", use_gzip=True
+                ),
+            )
+            if profile_enabled
+            else nullcontext()
+        )
+        with profiler_context as prof:
+            for i, inps in enumerate(inputs_list):
                 if inps is None:
                     break
                 if i < start_idx:
@@ -230,28 +272,32 @@ def benchmark(
                     args, kwargs = tree_map_only(
                         torch.Tensor, to_channels_last, (args, kwargs)
                     )

-            except StopIteration:
-                break
-            try:
-                # aten, nvfuser, inductor
-                timings.append(
-                    microbenchmark(
-                        operator,
-                        args,
-                        kwargs,
-                        dtype,
-                        accuracy_checking,
-                        repeats,
-                        measure_nvfuser,
-                        device,
-                    )
-                )
-            except Exception as e:
-                print(f"error {operator}")
-                print(e)
-                # comment out this line to avoid blocking other tests
-                # raise e
+                try:
+                    iter_context = (
+                        torch.profiler.record_function(f"iter_{i}")
+                        if profile_enabled
+                        else nullcontext()
+                    )
+                    with iter_context:
+                        # aten, nvfuser, inductor
+                        timings.append(
+                            microbenchmark(
+                                operator,
+                                args,
+                                kwargs,
+                                dtype,
+                                accuracy_checking,
+                                repeats,
+                                measure_nvfuser,
+                                device,
+                            )
+                        )
+
+                except Exception as e:
+                    print(f"error {operator}")
+                    print(e)
+                    # comment out this line to avoid blocking other tests
+                    # raise e

         if not timings:
             continue
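The operatorbench hunks above gate every profiling hook behind nullcontext, so the timing path is unchanged when --profile is off. Here is a standalone sketch of that conditional-context pattern outside the benchmark harness; the workload function and trace directory below are made up for illustration.

from contextlib import nullcontext

import torch


def run_step(x):
    # stand-in workload; the real benchmark runs the operator under test
    return (x @ x).relu()


profile_enabled = True  # pretend this came from the --profile flag
x = torch.randn(128, 128)

profiler_context = (
    torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/demo"),
    )
    if profile_enabled
    else nullcontext()
)
with profiler_context:
    for i in range(3):
        # record_function adds a named range to the trace; with profiling
        # disabled, nullcontext() turns all of this into a no-op.
        iter_context = (
            torch.profiler.record_function(f"iter_{i}")
            if profile_enabled
            else nullcontext()
        )
        with iter_context:
            run_step(x)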
@@ -60,6 +60,13 @@ class BenchmarkBase(ABC):
     # TODO is there other parts we need to add ?
     _enable_compile_time_instruction_count = False

+    # number of iterations used to run when collecting instruction_count or compile_time_instruction_count.
+    _num_iterations = 5
+
+    def with_iterations(self, value):
+        self._num_iterations = value
+        return self
+
     def enable_instruction_count(self):
         self._enable_instruction_count = True
         return self
@@ -88,7 +95,7 @@ class BenchmarkBase(ABC):
     def _count_instructions(self):
         print(f"collecting instruction count for {self.name()}")
         results = []
-        for i in range(10):
+        for i in range(self._num_iterations):
             self._prepare()
             id = i_counter.start()
             self._work()
@@ -102,7 +109,7 @@ class BenchmarkBase(ABC):
         config.record_compile_time_instruction_count = True

         results = []
-        for i in range(10):
+        for i in range(self._num_iterations):
             self._prepare()
             # CompileTimeInstructionCounter.record is only called on convert_frame._compile_inner
             # hence this will only count instruction count spent in compile_inner.
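With the new with_iterations hook, the iteration count chains like the other fluent setters on BenchmarkBase. The subclass below is a hypothetical sketch meant only to show the intended call shape; with_iterations, enable_compile_time_instruction_count, collect_all, and append_results come from the code above, while the class name, workload, and result path are assumptions.

import torch
from benchmark_base import BenchmarkBase


class ToyBenchmark(BenchmarkBase):
    def name(self):
        return "toy_benchmark"

    def description(self):
        return "compile a trivial function, for illustration only"

    def _prepare_once(self):
        self.x = torch.ones(64)

    def _prepare(self):
        torch._dynamo.reset()

    def _work(self):
        torch.compile(lambda t: t.sin() + 1)(self.x)


# Collect compile-time instruction counts over 3 iterations instead of the
# default 5, then append the results to a CSV path.
ToyBenchmark().with_iterations(
    3
).enable_compile_time_instruction_count().collect_all().append_results("results.csv")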
@@ -4,21 +4,29 @@ import sys
 from benchmark_base import BenchmarkBase

 import torch
+from torch._inductor.utils import fresh_inductor_cache


 class Benchmark(BenchmarkBase):
-    def __init__(self, backend):
-        self.backend = backend
+    def __init__(self, backend, dynamic=False, is_gpu=False):
+        self._backend = backend
+        self._dynamic = dynamic
+        self._device = "cuda" if is_gpu else "cpu"

     def name(self):
-        return f"add_loop_{self.backend}"
+        prefix = f"add_loop_{self._backend}"
+        if self._dynamic:
+            prefix += "_dynamic"
+        if self._device == "cuda":
+            prefix += "_gpu"
+        return prefix

     def description(self):
         return "a loop over 100 add node"

-    def _prepare_once(self, dynamic=True):
-        self.a = torch.ones(1000)
-        self.b = torch.torch.ones(1000)
+    def _prepare_once(self):
+        self.a = torch.ones(1000, device=self._device)
+        self.b = torch.torch.ones(1000, device=self._device)

     def _prepare(self):
         torch._dynamo.reset()
@@ -26,7 +34,7 @@ class Benchmark(BenchmarkBase):
         gc.disable()

     def _work(self):
-        @torch.compile(backend=self.backend, fullgraph=True)
+        @torch.compile(backend=self._backend, fullgraph=True, dynamic=self._dynamic)
         def f(a, b):
             result = a.clone()
             for i in range(1000):
@@ -38,17 +46,24 @@ class Benchmark(BenchmarkBase):
             result = result.sin()
         return result

-        f(self.a, self.b)
+        with fresh_inductor_cache():
+            f(self.a, self.b)


 def main():
     result_path = sys.argv[1]
-    Benchmark(
-        "eager"
-    ).enable_compile_time_instruction_count().collect_all().append_results(result_path)
-    Benchmark(
-        "inductor"
-    ).enable_compile_time_instruction_count().collect_all().append_results(result_path)
+    all = [
+        Benchmark("eager"),
+        Benchmark("eager", dynamic=True),
+        Benchmark("inductor"),
+        Benchmark("inductor", is_gpu=True),
+        Benchmark("inductor", is_gpu=True, dynamic=True),
+    ]
+
+    for benchmark in all:
+        benchmark.enable_compile_time_instruction_count().collect_all().append_results(
+            result_path
+        )


 if __name__ == "__main__":
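The _work hunk above wraps the compiled call in fresh_inductor_cache so each measured run compiles into a clean temporary cache directory rather than reusing earlier artifacts. A standalone sketch of that utility with a toy function (illustrative, not part of the benchmark):

import torch
from torch._inductor.utils import fresh_inductor_cache


@torch.compile(backend="inductor", fullgraph=True)
def f(a, b):
    return (a + b).sin()


a = torch.ones(1000)
b = torch.ones(1000)

# First compile and run against a fresh, temporary Inductor cache directory.
with fresh_inductor_cache():
    f(a, b)

# Reset Dynamo so the next call recompiles instead of reusing the already
# guarded, compiled code object.
torch._dynamo.reset()

# This run also cannot pick up on-disk artifacts from the first compile.
with fresh_inductor_cache():
    f(a, b)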
@@ -32,7 +32,9 @@ class Benchmark(BenchmarkBase):

 def main():
     result_path = sys.argv[1]
-    Benchmark().enable_instruction_count().collect_all().append_results(result_path)
+    Benchmark().enable_compile_time_instruction_count().collect_all().append_results(
+        result_path
+    )


 if __name__ == "__main__":
@@ -387,11 +387,6 @@ def get_skip_tests(suite, device, is_training: bool):
             skip_tests.update(module.TorchBenchmarkRunner().skip_models_for_cpu)
         elif device == "cuda":
             skip_tests.update(module.TorchBenchmarkRunner().skip_models_for_cuda)
-    else:
-        if hasattr(module, "SKIP"):
-            skip_tests.update(module.SKIP)
-        if is_training and hasattr(module, "SKIP_TRAIN"):
-            skip_tests.update(module.SKIP_TRAIN)

     skip_tests = (f"-x {name}" for name in skip_tests)
     skip_str = " ".join(skip_tests)
@@ -438,7 +433,7 @@ def generate_commands(args, dtypes, suites, devices, compilers, output_dir):
             if args.enable_cpu_launcher:
                 launcher_cmd = f"python -m torch.backends.xeon.run_cpu {args.cpu_launcher_args}"
             cmd = f"{launcher_cmd} benchmarks/dynamo/{suite}.py --{testing} --{dtype} -d{device} --output={output_filename}"
-            cmd = f"{cmd} {base_cmd} {args.extra_args} --no-skip --dashboard"
+            cmd = f"{cmd} {base_cmd} {args.extra_args} --dashboard"
             skip_tests_str = get_skip_tests(suite, device, args.training)
             cmd = f"{cmd} {skip_tests_str}"
Some files were not shown because too many files have changed in this diff.