# Summary

This PR makes some significant changes to the release notes scripts. At a high level:

- Turned the Quips into docs and updated the links.
- Updated the `common.categories` list in the hope of making it the source of truth for releases. This is hard since the release_notes labels can be changed at will. An alternative would be to poll the GitHub API, but I think that is overkill. The notebook does a set comparison and will show you new categories. I think we want this to stay manual so that the release notes engineer decides how to categorize.
- Created category groups after speaking with folks on distributed and AO, who told me those release categories can be merged.
- I am the newest person to Core and don't use ghstack, so I made token retrieval a little more generic.
- Added a classifier.py file. This file will train a commit categorizer for you, hopefully with decent accuracy; I was able to achieve 75% accuracy. I drop the highest-frequency class, "skip", since this creates a more useful categorizer (see the sketch below).
- Updated the categorize.py script so that the prompt defaults to what the classifier predicts, gated by a flag.
- Added a README that will hopefully help future release notes engineers.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94560
Approved by: https://github.com/albanD
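For reference, the classifier idea can be sketched with off-the-shelf tooling. The snippet below is a minimal sketch only: it assumes commits are already paired with their release_notes category and uses a scikit-learn TF-IDF + logistic-regression pipeline; the actual model and features in `classifier.py` may differ.

```python
# Hypothetical sketch of a commit categorizer; NOT the classifier.py from this PR.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

def train_commit_categorizer(commits):
    """commits: list of (commit_title, release_notes_category) pairs."""
    # Drop the dominant "skip" class so the model focuses on useful categories.
    data = [(title, cat) for title, cat in commits if cat != "skip"]
    titles = [title for title, _ in data]
    labels = [cat for _, cat in data]
    train_x, test_x, train_y, test_y = train_test_split(
        titles, labels, test_size=0.2, random_state=0)
    model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
    model.fit(train_x, train_y)
    print(f"held-out accuracy: {model.score(test_x, test_y):.2f}")
    return model
```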
302 lines | 7.3 KiB | Python
from collections import namedtuple
from pathlib import Path
import locale
import subprocess
import re
import requests
import os
import json
from dataclasses import dataclass


@dataclass
class CategoryGroup:
    name: str
    categories: list

frontend_categories = [
    'meta',
    'nn',
    'linalg',
    'cpp',
    'python',
    'complex',
    'vmap',
    'autograd',
    'build',
    'memory_format',
    'foreach',
    'dataloader',
    'sparse',
    'nested tensor',
    'optimizer'
]

pytorch_2_categories = [
    'dynamo',
    'inductor',
]

# These will all get mapped to quantization
quantization = CategoryGroup(
    name="quantization",
    categories=[
        'quantization',
        'AO frontend',
        'AO Pruning',
    ]
)

# Distributed has a number of release note labels we want to map to one
distributed = CategoryGroup(
    name="distributed",
    categories=[
        'distributed',
        'distributed (c10d)',
        'distributed (composable)',
        'distributed (ddp)',
        'distributed (fsdp)',
        'distributed (rpc)',
        'distributed (sharded)',
    ]
)
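# Illustrative note: a CategoryGroup merges several release_notes labels into a
# single reported category. Everything listed in distributed.categories (e.g.
# "distributed (fsdp)", "distributed (rpc)") is meant to be rolled up under
# distributed.name ("distributed"); quantization does the same for the AO labels.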

categories = [
    'Uncategorized',
    'lazy',
    'hub',
    'mobile',
    'jit',
    'visualization',
    'onnx',
    'caffe2',
    'amd',
    'rocm',
    'cuda',
    'cpu',
    'cudnn',
    'xla',
    'benchmark',
    'profiler',
    'performance_as_product',
    'package',
    'dispatcher',
    'releng',
    'fx',
    'code_coverage',
    'vulkan',
    'skip',
    'composability',
    # 2.0 release
    'mps',
    'intel',
    'functorch',
    'gnn',
    'distributions',
    'serialization',
] + [f'{category}_frontend' for category in frontend_categories] + pytorch_2_categories + [quantization.name] + [distributed.name]
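# The composed `categories` list therefore also contains one "<name>_frontend"
# entry per frontend category (e.g. "nn_frontend", "autograd_frontend"), the
# 2.0 categories "dynamo" and "inductor", plus the merged "quantization" and
# "distributed" group names defined above.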

topics = [
    'bc_breaking',
    'deprecations',
    'new_features',
    'improvements',
    'bug_fixes',
    'performance',
    'docs',
    'devs',
    'Untopiced',
    "not user facing",
    "security",
]
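# Topics roughly mirror the sections of the published release notes
# (BC-breaking, deprecations, new features, improvements, bug fixes, ...);
# each commit is expected to end up with one category and one topic.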

Features = namedtuple('Features', [
    'title',
    'body',
    'pr_number',
    'files_changed',
    'labels',
    'author',
    'accepters'
])


def dict_to_features(dct):
    return Features(
        title=dct['title'],
        body=dct['body'],
        pr_number=dct['pr_number'],
        files_changed=dct['files_changed'],
        labels=dct['labels'],
        author=dct['author'],
        accepters=tuple(dct['accepters']))


def features_to_dict(features):
    return dict(features._asdict())
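# Usage sketch: Features round-trips through a plain dict so it can be cached
# as JSON (see _CommitDataCache below), e.g.
#   d = features_to_dict(feats)     # Features -> dict
#   feats = dict_to_features(d)     # dict -> Features (accepters back to a tuple)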

def run(command):
    """Returns (return-code, stdout, stderr)"""
    p = subprocess.Popen(command, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, shell=True)
    output, err = p.communicate()
    rc = p.returncode
    enc = locale.getpreferredencoding()
    output = output.decode(enc)
    err = err.decode(enc)
    return rc, output.strip(), err.strip()
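# Usage sketch: the command is run through the shell and stdout/stderr come
# back decoded and stripped, e.g.
#   rc, out, err = run('git rev-parse HEAD')
#   if rc == 0:
#       print(out)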

def commit_body(commit_hash):
    cmd = f'git log -n 1 --pretty=format:%b {commit_hash}'
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_title(commit_hash):
    cmd = f'git log -n 1 --pretty=format:%s {commit_hash}'
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_files_changed(commit_hash):
    cmd = f'git diff-tree --no-commit-id --name-only -r {commit_hash}'
    ret, out, err = run(cmd)
    return out.split('\n') if ret == 0 else None


def parse_pr_number(body, commit_hash, title):
    regex = r'Pull Request resolved: https://github.com/pytorch/pytorch/pull/([0-9]+)'
    matches = re.findall(regex, body)
    if len(matches) == 0:
        if 'revert' not in title.lower() and 'updating submodules' not in title.lower():
            print(f'[{commit_hash}: {title}] Could not parse PR number, ignoring PR')
        return None
    if len(matches) > 1:
        print(f'[{commit_hash}: {title}] Got two PR numbers, using the first one')
        return matches[0]
    return matches[0]
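# Usage sketch: the PR number comes from the "Pull Request resolved: ..." line
# that the merge tooling appends to the commit body, e.g.
#   body = 'Pull Request resolved: https://github.com/pytorch/pytorch/pull/94560'
#   parse_pr_number(body, 'deadbeef', 'Some title')  # -> '94560'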

def get_ghstack_token():
    pattern = 'github_oauth = (.*)'
    with open(Path('~/.ghstackrc').expanduser(), 'r+') as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a github oauth token")
    return matches[0]


def get_token():
    env_token = os.environ.get("GITHUB_TOKEN")
    if env_token is not None:
        print("using GITHUB_TOKEN from environment variable")
        return env_token
    else:
        return get_ghstack_token()
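# Token resolution order: a GITHUB_TOKEN environment variable wins if set;
# otherwise the token is read from the `github_oauth` line in ~/.ghstackrc.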

token = get_token()

headers = {"Authorization": f"token {token}"}


def run_query(query):
    request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, request.json()))
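# Usage sketch: run_query() POSTs a GraphQL query to the GitHub API with the
# token from above and returns the decoded JSON, e.g.
#   run_query('{ viewer { login } }')  # -> {'data': {'viewer': {'login': ...}}}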

def github_data(pr_number):
    query = """
    {
      repository(owner: "pytorch", name: "pytorch") {
        pullRequest(number: %s ) {
          author {
            login
          }
          reviews(last: 5, states: APPROVED) {
            nodes {
              author {
                login
              }
            }
          }
          labels(first: 10) {
            edges {
              node {
                name
              }
            }
          }
        }
      }
    }
    """ % pr_number
    query = run_query(query)
    if query.get('errors'):
        raise Exception(query['errors'])
    edges = query['data']['repository']['pullRequest']['labels']['edges']
    labels = [edge['node']['name'] for edge in edges]
    author = query['data']['repository']['pullRequest']['author']['login']
    nodes = query['data']['repository']['pullRequest']['reviews']['nodes']

    # using set to dedup multiple accepts from same accepter
    accepters = {node["author"]["login"] for node in nodes}
    accepters = tuple(sorted(accepters))

    return labels, author, accepters
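# Usage sketch: github_data('94560') returns
#   labels    - up to the first 10 label names on the PR,
#   author    - the PR author's GitHub login,
#   accepters - a sorted, de-duplicated tuple of approving reviewers' logins.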

def get_features(commit_hash):
    title, body, files_changed = (
        commit_title(commit_hash),
        commit_body(commit_hash),
        commit_files_changed(commit_hash))
    pr_number = parse_pr_number(body, commit_hash, title)
    labels = []
    author = ""
    accepters = tuple()
    if pr_number is not None:
        labels, author, accepters = github_data(pr_number)
    result = Features(title, body, pr_number, files_changed, labels, author, accepters)
    return result


_commit_data_cache = None


def get_commit_data_cache(path='results/data.json'):
    global _commit_data_cache
    if _commit_data_cache is None:
        _commit_data_cache = _CommitDataCache(path)
    return _commit_data_cache


class _CommitDataCache:
    def __init__(self, path):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            self.data = self.read_from_disk()
        else:
            os.makedirs(Path(path).parent, exist_ok=True)

    def get(self, commit):
        if commit not in self.data.keys():
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        with open(self.path, 'r') as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct)
                    for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        data = {commit: features._asdict() for commit, features in self.data.items()}
        with open(self.path, 'w') as f:
            json.dump(data, f)
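# Usage sketch: callers normally go through the cache instead of hitting git
# and the GitHub API for every commit, e.g.
#   cache = get_commit_data_cache()      # backed by results/data.json
#   feats = cache.get('<commit sha>')    # Features, fetched once then cached
#   print(feats.title, feats.labels)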