This PR changes empty collection factory calls to Python literals:

- `list()` -> `[]`
- `tuple()` -> `()`
- `dict()` -> `{}`

The Python literals are more performant and safer. For example, here is the bytecode for building an empty dictionary:

```bash
$ python3 -m dis - <<EOS
import collections

d1 = {}
d2 = dict()

dict = collections.OrderedDict
d3 = dict()
EOS
```

```text
  0           0 RESUME                   0

  1           2 LOAD_CONST               0 (0)
              4 LOAD_CONST               1 (None)
              6 IMPORT_NAME              0 (collections)
              8 STORE_NAME               0 (collections)

  3          10 BUILD_MAP                0
             12 STORE_NAME               1 (d1)

  4          14 PUSH_NULL
             16 LOAD_NAME                2 (dict)
             18 CALL                     0
             26 STORE_NAME               3 (d2)

  6          28 LOAD_NAME                0 (collections)
             30 LOAD_ATTR                8 (OrderedDict)
             50 STORE_NAME               2 (dict)

  7          52 PUSH_NULL
             54 LOAD_NAME                2 (dict)
             56 CALL                     0
             64 STORE_NAME               5 (d3)
             66 RETURN_CONST             1 (None)
```

The dict literal `{}` compiles to a single instruction, `BUILD_MAP`, while the factory call `dict()` takes three: `PUSH_NULL` + `LOAD_NAME` + `CALL`. The factory call is also not safe if users override the `dict` name in `locals` or `globals` (see the example of replacing it with `OrderedDict` above).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130199
Approved by: https://github.com/malfet
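As a minimal illustration of that hazard (a plain CPython session is assumed), rebinding the name `dict` changes what `dict()` constructs at runtime, while the `{}` literal always builds a builtin dict:

```python
from collections import OrderedDict

dict = OrderedDict  # shadow the builtin name, as in the dis example above

d_literal = {}   # BUILD_MAP: always a builtin dict
d_call = dict()  # LOAD_NAME resolves to OrderedDict at call time

print(type(d_literal))  # <class 'dict'>
print(type(d_call))     # <class 'collections.OrderedDict'>
```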
333 lines · 7.9 KiB · Python
import json
import locale
import os
import re
import subprocess
from collections import namedtuple
from dataclasses import dataclass
from pathlib import Path

import requests


@dataclass
class CategoryGroup:
    name: str
    categories: list


frontend_categories = [
    "meta",
    "nn",
    "linalg",
    "cpp",
    "python",
    "complex",
    "vmap",
    "autograd",
    "build",
    "memory_format",
    "foreach",
    "dataloader",
    "sparse",
    "nested tensor",
    "optimizer",
]

pytorch_2_categories = [
    "dynamo",
    "inductor",
]

# These will all get mapped to quantization
quantization = CategoryGroup(
    name="quantization",
    categories=[
        "quantization",
        "AO frontend",
        "AO Pruning",
    ],
)

# Distributed has a number of release note labels we want to map to one
distributed = CategoryGroup(
    name="distributed",
    categories=[
        "distributed",
        "distributed (c10d)",
        "distributed (composable)",
        "distributed (ddp)",
        "distributed (fsdp)",
        "distributed (rpc)",
        "distributed (sharded)",
    ],
)

categories = (
    [
        "Uncategorized",
        "lazy",
        "hub",
        "mobile",
        "jit",
        "visualization",
        "onnx",
        "caffe2",
        "amd",
        "rocm",
        "cuda",
        "cpu",
        "cudnn",
        "xla",
        "benchmark",
        "profiler",
        "performance_as_product",
        "package",
        "dispatcher",
        "releng",
        "fx",
        "code_coverage",
        "vulkan",
        "skip",
        "composability",
        # 2.0 release
        "mps",
        "intel",
        "functorch",
        "gnn",
        "distributions",
        "serialization",
    ]
    + [f"{category}_frontend" for category in frontend_categories]
    + pytorch_2_categories
    + [quantization.name]
    + [distributed.name]
)


topics = [
    "bc breaking",
    "deprecation",
    "new features",
    "improvements",
    "bug fixes",
    "performance",
    "docs",
    "devs",
    "Untopiced",
    "not user facing",
    "security",
]


Features = namedtuple(
    "Features",
    ["title", "body", "pr_number", "files_changed", "labels", "author", "accepters"],
)


def dict_to_features(dct):
    return Features(
        title=dct["title"],
        body=dct["body"],
        pr_number=dct["pr_number"],
        files_changed=dct["files_changed"],
        labels=dct["labels"],
        author=dct["author"],
        accepters=tuple(dct["accepters"]),
    )


def features_to_dict(features):
    return dict(features._asdict())
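

# Illustrative round trip (hypothetical field values, not part of the
# original script): features_to_dict and dict_to_features are inverses,
# except that accepters always comes back as a tuple.
def _example_features_round_trip():
    f = Features(
        title="Fix a bug",
        body="Pull Request resolved: https://github.com/pytorch/pytorch/pull/130199",
        pr_number="130199",
        files_changed=["torch/foo.py"],  # hypothetical path
        labels=["release notes: python_frontend"],
        author="someuser",  # hypothetical logins
        accepters=("reviewer1",),
    )
    assert dict_to_features(features_to_dict(f)) == f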


def run(command):
    """Returns (return-code, stdout, stderr)"""
    p = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
    )
    output, err = p.communicate()
    rc = p.returncode
    enc = locale.getpreferredencoding()
    output = output.decode(enc)
    err = err.decode(enc)
    return rc, output.strip(), err.strip()
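

# Illustrative usage sketch (assumes git is on PATH; not part of the
# original script): run() shells out and returns the decoded streams.
def _example_run():
    rc, out, err = run("git rev-parse HEAD")
    if rc == 0:
        print(f"HEAD is at {out}")
    else:
        print(f"git failed: {err}")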


def commit_body(commit_hash):
    cmd = f"git log -n 1 --pretty=format:%b {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_title(commit_hash):
    cmd = f"git log -n 1 --pretty=format:%s {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_files_changed(commit_hash):
    cmd = f"git diff-tree --no-commit-id --name-only -r {commit_hash}"
    ret, out, err = run(cmd)
    return out.split("\n") if ret == 0 else None
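

# Illustrative sketch (not part of the original script): the three helpers
# above pull a commit's subject, body, and touched files straight out of
# git; any ref git understands works as the argument.
def _example_commit_metadata(commit_hash="HEAD"):
    print(commit_title(commit_hash))
    print(commit_body(commit_hash))
    print(commit_files_changed(commit_hash))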


def parse_pr_number(body, commit_hash, title):
    regex = r"Pull Request resolved: https://github.com/pytorch/pytorch/pull/([0-9]+)"
    matches = re.findall(regex, body)
    if len(matches) == 0:
        if "revert" not in title.lower() and "updating submodules" not in title.lower():
            print(f"[{commit_hash}: {title}] Could not parse PR number, ignoring PR")
        return None
    if len(matches) > 1:
        print(f"[{commit_hash}: {title}] Got multiple PR numbers, using the first one")
        return matches[0]
    return matches[0]
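

# Illustrative sketch (hypothetical commit hash and title, not part of the
# original script): the PR number is recovered from the trailer that
# GitHub's merge process appends to the commit body.
def _example_parse_pr_number():
    body = "Pull Request resolved: https://github.com/pytorch/pytorch/pull/130199"
    assert parse_pr_number(body, "abc1234", "Some title") == "130199"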


def get_ghstack_token():
    pattern = "github_oauth = (.*)"
    with open(Path("~/.ghstackrc").expanduser(), "r+") as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a github oauth token")
    return matches[0]


def get_token():
    env_token = os.environ.get("GITHUB_TOKEN")
    if env_token is not None:
        print("using GITHUB_TOKEN from environment variable")
        return env_token
    else:
        return get_ghstack_token()
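

# Illustrative sketch (hypothetical token value, not part of the original
# script): get_token() prefers the GITHUB_TOKEN environment variable and
# only falls back to parsing ~/.ghstackrc when it is unset.
def _example_token_lookup():
    os.environ["GITHUB_TOKEN"] = "ghp_example"  # hypothetical placeholder
    assert get_token() == "ghp_example"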


token = get_token()

headers = {"Authorization": f"token {token}"}


def run_query(query):
    request = requests.post(
        "https://api.github.com/graphql", json={"query": query}, headers=headers
    )
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception(  # noqa: TRY002
            f"Query failed with status code {request.status_code}. {request.json()}"
        )
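

# Illustrative sketch (not part of the original script): any GraphQL query
# string can go through run_query; this one asks GitHub for the login of
# the authenticated user (a live API call, so a valid token is required).
def _example_run_query():
    result = run_query("{ viewer { login } }")
    print(result["data"]["viewer"]["login"])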


_ERRORS = []
_MAX_ERROR_LEN = 20


def github_data(pr_number):
    query = (
        """
    {
      repository(owner: "pytorch", name: "pytorch") {
        pullRequest(number: %s) {
          author {
            login
          }
          reviews(last: 5, states: APPROVED) {
            nodes {
              author {
                login
              }
            }
          }
          labels(first: 10) {
            edges {
              node {
                name
              }
            }
          }
        }
      }
    }
    """  # noqa: UP031
        % pr_number
    )
    query = run_query(query)
    if query.get("errors"):
        global _ERRORS
        _ERRORS.append(query.get("errors"))
        if len(_ERRORS) < _MAX_ERROR_LEN:
            return [], "None", ()
        else:
            raise Exception(  # noqa: TRY002
                f"Got {_MAX_ERROR_LEN} errors: {_ERRORS}, please check if"
                " there is something wrong"
            )
    edges = query["data"]["repository"]["pullRequest"]["labels"]["edges"]
    labels = [edge["node"]["name"] for edge in edges]
    author = query["data"]["repository"]["pullRequest"]["author"]["login"]
    nodes = query["data"]["repository"]["pullRequest"]["reviews"]["nodes"]

    # using set to dedup multiple accepts from same accepter
    accepters = {node["author"]["login"] for node in nodes}
    accepters = tuple(sorted(accepters))

    return labels, author, accepters
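

# Illustrative sketch (not part of the original script): github_data boils
# the GraphQL response down to the three pieces this tooling needs. PR
# 130199 is the PR shown in the commit message above.
def _example_github_data():
    labels, author, accepters = github_data("130199")
    print(labels)     # label names, e.g. the "release notes: ..." labels
    print(author)     # PR author's login
    print(accepters)  # sorted, de-duplicated logins of approving reviewers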


def get_features(commit_hash):
    title, body, files_changed = (
        commit_title(commit_hash),
        commit_body(commit_hash),
        commit_files_changed(commit_hash),
    )
    pr_number = parse_pr_number(body, commit_hash, title)
    labels = []
    author = ""
    accepters = ()
    if pr_number is not None:
        labels, author, accepters = github_data(pr_number)
    result = Features(title, body, pr_number, files_changed, labels, author, accepters)
    return result
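

# Illustrative sketch (not part of the original script): get_features ties
# the git helpers and the GitHub query together into one Features record
# for a single commit.
def _example_get_features():
    features = get_features("HEAD")
    print(features.title, features.pr_number, features.labels)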


_commit_data_cache = None


def get_commit_data_cache(path="results/data.json"):
    global _commit_data_cache
    if _commit_data_cache is None:
        _commit_data_cache = _CommitDataCache(path)
    return _commit_data_cache


class _CommitDataCache:
    def __init__(self, path):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            self.data = self.read_from_disk()
        else:
            os.makedirs(Path(path).parent, exist_ok=True)

    def get(self, commit):
        if commit not in self.data.keys():
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        with open(self.path) as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct) for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        data = {commit: features._asdict() for commit, features in self.data.items()}
        with open(self.path, "w") as f:
            json.dump(data, f)
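

# Illustrative usage sketch (hypothetical commit hash, not part of the
# original script): the module-level cache is the intended entry point;
# the first lookup for a commit hits git and the GitHub API, and the
# result is persisted to results/data.json for later runs.
def _example_cache_usage():
    cache = get_commit_data_cache()
    features = cache.get("abc1234")  # hypothetical commit hash
    print(features.title)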