pytorch/scripts/release_notes/common.py

from collections import namedtuple
from os.path import expanduser
import locale
import subprocess
import re
import requests
import os
import json

categories = [
    'Uncategorized',
    'distributed',
    'mobile',
    'jit',
    'visualization',
    'onnx',
    'caffe2',
    'quantization',
    'amd',
    'benchmark',
    'profiler',
    'dispatcher',
    'releng',
    'fx',
    'code_coverage',
    'vulkan',
    'skip',
    'cpp_frontend',
    'python_frontend',
    'complex_frontend',
    'vmap_frontend',
    'autograd_frontend',
    'build_frontend',
    'memory_format_frontend',
    'foreach_frontend',
]

topics = [
    'bc_breaking',
    'deprecations',
    'new_features',
    'improvements',
    'bug_fixes',
    'performance',
    'docs',
    'devs',
    'Untopiced',
]


Features = namedtuple('Features', [
    'title',
    'body',
    'pr_number',
    'files_changed',
    'labels',
])


def dict_to_features(dct):
    return Features(
        title=dct['title'],
        body=dct['body'],
        pr_number=dct['pr_number'],
        files_changed=dct['files_changed'],
        labels=dct['labels'])


def features_to_dict(features):
    return dict(features._asdict())


def run(command):
    """Returns (return-code, stdout, stderr)"""
    p = subprocess.Popen(command, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, shell=True)
    output, err = p.communicate()
    rc = p.returncode
    enc = locale.getpreferredencoding()
    output = output.decode(enc)
    err = err.decode(enc)
    return rc, output.strip(), err.strip()


def commit_body(commit_hash):
    cmd = f'git log -n 1 --pretty=format:%b {commit_hash}'
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_title(commit_hash):
    cmd = f'git log -n 1 --pretty=format:%s {commit_hash}'
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_files_changed(commit_hash):
    cmd = f'git diff-tree --no-commit-id --name-only -r {commit_hash}'
    ret, out, err = run(cmd)
    return out.split('\n') if ret == 0 else None


def parse_pr_number(body, commit_hash, title):
    regex = r'Pull Request resolved: https://github.com/pytorch/pytorch/pull/([0-9]+)'
    matches = re.findall(regex, body)
    if len(matches) == 0:
        if 'revert' not in title.lower() and 'updating submodules' not in title.lower():
            print(f'[{commit_hash}: {title}] Could not parse PR number, ignoring PR')
        return None
    if len(matches) > 1:
        print(f'[{commit_hash}: {title}] Got two PR numbers, using the first one')
        return matches[0]
    return matches[0]


def get_ghstack_token():
    pattern = 'github_oauth = (.*)'
    with open(expanduser('~/.ghstackrc'), 'r+') as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a github oauth token")
    return matches[0]

token = get_ghstack_token()
headers = {"Authorization": f"token {token}"}

def run_query(query):
    request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, query))


def gh_labels(pr_number):
    query = f"""
    {{
      repository(owner: "pytorch", name: "pytorch") {{
        pullRequest(number: {pr_number}) {{
          labels(first: 10) {{
            edges {{
              node {{
                name
              }}
            }}
          }}
        }}
      }}
    }}
    """
    query = run_query(query)
    edges = query['data']['repository']['pullRequest']['labels']['edges']
    return [edge['node']['name'] for edge in edges]


def get_features(commit_hash, return_dict=False):
    title, body, files_changed = (
        commit_title(commit_hash),
        commit_body(commit_hash),
        commit_files_changed(commit_hash))
    pr_number = parse_pr_number(body, commit_hash, title)
    labels = []
    if pr_number is not None:
        labels = gh_labels(pr_number)
    result = Features(title, body, pr_number, files_changed, labels)
    if return_dict:
        return features_to_dict(result)
    return result

class CommitDataCache:
    def __init__(self, path='results/data.json'):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            self.data = self.read_from_disk()

    def get(self, commit):
        if commit not in self.data.keys():
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        with open(self.path, 'r') as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct)
                    for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        data = {commit: features._asdict() for commit, features in self.data.items()}
        with open(self.path, 'w') as f:
            json.dump(data, f)