# Summary

This PR makes some significant changes to the release notes scripts. At a high level:

- Turned the Quips into docs and updated the links.
- Updated the `common.categories` list in the hope of making it the source of truth for releases. This is hard since the release_notes labels can be changed at will. An alternative would be to poll the GitHub API, but I think that is overkill. The notebook does a set comparison and will show you new categories. I think we want this to stay manual so that the release notes engineer decides how to categorize.
- Created category groups after speaking with folks on distributed and AO, who told me those release categories can be merged.
- I am the newest person to Core and don't use ghstack, so I made token retrieval a little more generic.
- Added a classifier.py file. This file will train a commit categorizer for you, hopefully with decent accuracy; I was able to achieve 75% accuracy. I drop the highest-frequency class, "skip", since this creates a more useful categorizer (see the sketch below).
- Updated the categorize.py script so that the prompt defaults to what the classifier predicts, gated by a flag.
- Added a README that will hopefully help future release notes engineers.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94560
Approved by: https://github.com/albanD
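For reference, the classifier idea can be sketched with off-the-shelf tooling. The snippet below is a minimal sketch only: it assumes commits are already paired with their release_notes category and uses a scikit-learn TF-IDF + logistic-regression pipeline; the actual model and features in `classifier.py` may differ.

```python
# Hypothetical sketch of a commit categorizer; NOT the classifier.py from this PR.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

def train_commit_categorizer(commits):
    """commits: list of (commit_title, release_notes_category) pairs."""
    # Drop the dominant "skip" class so the model focuses on useful categories.
    data = [(title, cat) for title, cat in commits if cat != "skip"]
    titles = [title for title, _ in data]
    labels = [cat for _, cat in data]
    train_x, test_x, train_y, test_y = train_test_split(
        titles, labels, test_size=0.2, random_state=0)
    model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
    model.fit(train_x, train_y)
    print(f"held-out accuracy: {model.score(test_x, test_y):.2f}")
    return model
```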
302 lines | 7.3 KiB | Python
from collections import namedtuple
from pathlib import Path
import locale
import subprocess
import re
import requests
import os
import json
from dataclasses import dataclass


@dataclass
class CategoryGroup:
    name: str
    categories: list

frontend_categories = [
    'meta',
    'nn',
    'linalg',
    'cpp',
    'python',
    'complex',
    'vmap',
    'autograd',
    'build',
    'memory_format',
    'foreach',
    'dataloader',
    'sparse',
    'nested tensor',
    'optimizer'
]

pytorch_2_categories = [
    'dynamo',
    'inductor',
]

# These will all get mapped to quantization
quantization = CategoryGroup(
    name="quantization",
    categories=[
        'quantization',
        'AO frontend',
        'AO Pruning',
    ]
)

# Distributed has a number of release note labels we want to map to one
distributed = CategoryGroup(
    name="distributed",
    categories=[
        'distributed',
        'distributed (c10d)',
        'distributed (composable)',
        'distributed (ddp)',
        'distributed (fsdp)',
        'distributed (rpc)',
        'distributed (sharded)',
    ]
)
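# Illustrative note: a CategoryGroup merges several release_notes labels into a
# single reported category. Everything listed in distributed.categories (e.g.
# "distributed (fsdp)", "distributed (rpc)") is meant to be rolled up under
# distributed.name ("distributed"); quantization does the same for the AO labels.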

categories = [
    'Uncategorized',
    'lazy',
    'hub',
    'mobile',
    'jit',
    'visualization',
    'onnx',
    'caffe2',
    'amd',
    'rocm',
    'cuda',
    'cpu',
    'cudnn',
    'xla',
    'benchmark',
    'profiler',
    'performance_as_product',
    'package',
    'dispatcher',
    'releng',
    'fx',
    'code_coverage',
    'vulkan',
    'skip',
    'composability',
    # 2.0 release
    'mps',
    'intel',
    'functorch',
    'gnn',
    'distributions',
    'serialization',
] + [f'{category}_frontend' for category in frontend_categories] + pytorch_2_categories + [quantization.name] + [distributed.name]
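# The composed `categories` list therefore also contains one "<name>_frontend"
# entry per frontend category (e.g. "nn_frontend", "autograd_frontend"), the
# 2.0 categories "dynamo" and "inductor", plus the merged "quantization" and
# "distributed" group names defined above.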

topics = [
    'bc_breaking',
    'deprecations',
    'new_features',
    'improvements',
    'bug_fixes',
    'performance',
    'docs',
    'devs',
    'Untopiced',
    "not user facing",
    "security",
]
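# Topics roughly mirror the sections of the published release notes
# (BC-breaking, deprecations, new features, improvements, bug fixes, ...);
# each commit is expected to end up with one category and one topic.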

Features = namedtuple('Features', [
    'title',
    'body',
    'pr_number',
    'files_changed',
    'labels',
    'author',
    'accepters'
])


def dict_to_features(dct):
    return Features(
        title=dct['title'],
        body=dct['body'],
        pr_number=dct['pr_number'],
        files_changed=dct['files_changed'],
        labels=dct['labels'],
        author=dct['author'],
        accepters=tuple(dct['accepters']))


def features_to_dict(features):
    return dict(features._asdict())
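# Usage sketch: Features round-trips through a plain dict so it can be cached
# as JSON (see _CommitDataCache below), e.g.
#   d = features_to_dict(feats)     # Features -> dict
#   feats = dict_to_features(d)     # dict -> Features (accepters back to a tuple)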

def run(command):
    """Returns (return-code, stdout, stderr)"""
    p = subprocess.Popen(command, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, shell=True)
    output, err = p.communicate()
    rc = p.returncode
    enc = locale.getpreferredencoding()
    output = output.decode(enc)
    err = err.decode(enc)
    return rc, output.strip(), err.strip()
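# Usage sketch: the command is run through the shell and stdout/stderr come
# back decoded and stripped, e.g.
#   rc, out, err = run('git rev-parse HEAD')
#   if rc == 0:
#       print(out)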

def commit_body(commit_hash):
    cmd = f'git log -n 1 --pretty=format:%b {commit_hash}'
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_title(commit_hash):
    cmd = f'git log -n 1 --pretty=format:%s {commit_hash}'
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_files_changed(commit_hash):
    cmd = f'git diff-tree --no-commit-id --name-only -r {commit_hash}'
    ret, out, err = run(cmd)
    return out.split('\n') if ret == 0 else None


def parse_pr_number(body, commit_hash, title):
    regex = r'Pull Request resolved: https://github.com/pytorch/pytorch/pull/([0-9]+)'
    matches = re.findall(regex, body)
    if len(matches) == 0:
        if 'revert' not in title.lower() and 'updating submodules' not in title.lower():
            print(f'[{commit_hash}: {title}] Could not parse PR number, ignoring PR')
        return None
    if len(matches) > 1:
        print(f'[{commit_hash}: {title}] Got two PR numbers, using the first one')
        return matches[0]
    return matches[0]
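# Usage sketch: the PR number comes from the "Pull Request resolved: ..." line
# that the merge tooling appends to the commit body, e.g.
#   body = 'Pull Request resolved: https://github.com/pytorch/pytorch/pull/94560'
#   parse_pr_number(body, 'deadbeef', 'Some title')  # -> '94560'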

def get_ghstack_token():
    pattern = 'github_oauth = (.*)'
    with open(Path('~/.ghstackrc').expanduser(), 'r+') as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a github oauth token")
    return matches[0]


def get_token():
    env_token = os.environ.get("GITHUB_TOKEN")
    if env_token is not None:
        print("using GITHUB_TOKEN from environment variable")
        return env_token
    else:
        return get_ghstack_token()
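# Token resolution order: a GITHUB_TOKEN environment variable wins if set;
# otherwise the token is read from the `github_oauth` line in ~/.ghstackrc.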

token = get_token()

headers = {"Authorization": f"token {token}"}


def run_query(query):
    request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, request.json()))
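# Usage sketch: run_query() POSTs a GraphQL query to the GitHub API with the
# token from above and returns the decoded JSON, e.g.
#   run_query('{ viewer { login } }')  # -> {'data': {'viewer': {'login': ...}}}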

def github_data(pr_number):
    query = """
    {
      repository(owner: "pytorch", name: "pytorch") {
        pullRequest(number: %s ) {
          author {
            login
          }
          reviews(last: 5, states: APPROVED) {
            nodes {
              author {
                login
              }
            }
          }
          labels(first: 10) {
            edges {
              node {
                name
              }
            }
          }
        }
      }
    }
    """ % pr_number
    query = run_query(query)
    if query.get('errors'):
        raise Exception(query['errors'])
    edges = query['data']['repository']['pullRequest']['labels']['edges']
    labels = [edge['node']['name'] for edge in edges]
    author = query['data']['repository']['pullRequest']['author']['login']
    nodes = query['data']['repository']['pullRequest']['reviews']['nodes']

    # using set to dedup multiple accepts from same accepter
    accepters = {node["author"]["login"] for node in nodes}
    accepters = tuple(sorted(accepters))

    return labels, author, accepters
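# Usage sketch: github_data('94560') returns
#   labels    - up to the first 10 label names on the PR,
#   author    - the PR author's GitHub login,
#   accepters - a sorted, de-duplicated tuple of approving reviewers' logins.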

def get_features(commit_hash):
    title, body, files_changed = (
        commit_title(commit_hash),
        commit_body(commit_hash),
        commit_files_changed(commit_hash))
    pr_number = parse_pr_number(body, commit_hash, title)
    labels = []
    author = ""
    accepters = tuple()
    if pr_number is not None:
        labels, author, accepters = github_data(pr_number)
    result = Features(title, body, pr_number, files_changed, labels, author, accepters)
    return result


_commit_data_cache = None


def get_commit_data_cache(path='results/data.json'):
    global _commit_data_cache
    if _commit_data_cache is None:
        _commit_data_cache = _CommitDataCache(path)
    return _commit_data_cache


class _CommitDataCache:
    def __init__(self, path):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            self.data = self.read_from_disk()
        else:
            os.makedirs(Path(path).parent, exist_ok=True)

    def get(self, commit):
        if commit not in self.data.keys():
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        with open(self.path, 'r') as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct)
                    for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        data = {commit: features._asdict() for commit, features in self.data.items()}
        with open(self.path, 'w') as f:
            json.dump(data, f)
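# Usage sketch: callers normally go through the cache instead of hitting git
# and the GitHub API for every commit, e.g.
#   cache = get_commit_data_cache()      # backed by results/data.json
#   feats = cache.get('<commit sha>')    # Features, fetched once then cached
#   print(feats.title, feats.labels)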