Mirror of https://github.com/huggingface/transformers.git, synced 2025-10-21 01:23:56 +08:00

Compare commits (292 commits)

SHA1 | Author | Date |
---|---|---|
0856a231c0 | |||
ab7f5d2943 | |||
b450a7faf2 | |||
d44db1145c | |||
690a0dbf36 | |||
fbb248a2e4 | |||
5ff0c60505 | |||
210d407245 | |||
b65f07d8c0 | |||
009ee86a19 | |||
ffd623823d | |||
3a2f97db6f | |||
434d15da8e | |||
5faf386652 | |||
8efaf8f176 | |||
0e774e57a6 | |||
c35d9d48d9 | |||
65df0d78ed | |||
4e56da38d9 | |||
cdcb206e10 | |||
321d70a7a9 | |||
67376c02e2 | |||
c6bea08448 | |||
e7cfc46fc1 | |||
e1b3cfb504 | |||
3c33499f87 | |||
03cdb2a390 | |||
1e71f11dec | |||
d38caba169 | |||
af62cc5f20 | |||
eebc8abbe2 | |||
81c7e3ec9f | |||
e8fe6b7140 | |||
884ca81d87 | |||
32fea876bb | |||
b31ba23913 | |||
0a9860daa7 | |||
2071a9b86e | |||
8197eb9f10 | |||
525eba68ab | |||
b514a60c36 | |||
9bdcba53fd | |||
f0bf81e141 | |||
9f9909ea2f | |||
6cd769957e | |||
1320e4ec0c | |||
f4a07a392c | |||
43b9af0cac | |||
cfcb95417c | |||
0c1a6f9b1d | |||
1756b5e956 | |||
dadd0c1b13 | |||
102c6b238c | |||
b80684b23f | |||
80607874c1 | |||
7b4b0cf966 | |||
4bbb9f2d68 | |||
5d7e845712 | |||
eccb2f0163 | |||
5adc20723b | |||
5ee4f17234 | |||
2dfaf2f227 | |||
777459b471 | |||
edcb56fd96 | |||
6bc082da0a | |||
eb8fda51f4 | |||
e77721e4fe | |||
009b581316 | |||
f99f2fb661 | |||
438db43d46 | |||
c306869ea2 | |||
d482e3d79d | |||
9c3c24800b | |||
2df41663f1 | |||
9aebc711c9 | |||
4a450b25d5 | |||
58f0a2745c | |||
7ac3311e48 | |||
ed47cb6cba | |||
973926431e | |||
ba9e4eb354 | |||
34bdb7f9cb | |||
848aae49e1 | |||
448937c00d | |||
ba37ddc5ce | |||
822915142b | |||
bd74632687 | |||
fd223374f0 | |||
d609ba24cb | |||
bde1eeebe0 | |||
3ea3b00e59 | |||
d8e3bdbb4c | |||
64ce900974 | |||
0ad9b239a1 | |||
e9e77cd3c4 | |||
1579c53635 | |||
f3bda2352a | |||
6179f537a3 | |||
850da1cc36 | |||
01a3966bc6 | |||
05f961840b | |||
aa90e0c36a | |||
8f8bbd4a4c | |||
e2d53d95b0 | |||
7e0b415ab4 | |||
ce75b169bd | |||
9bf528877e | |||
af2b78601b | |||
0dd2b750ca | |||
5169069997 | |||
3a848111e6 | |||
98c96fb1a7 | |||
5456d82311 | |||
9b2540b5a7 | |||
bd3b3aee9c | |||
a45a9cc0e1 | |||
b12616fd8e | |||
d77dd62ff8 | |||
9c6a48c8c3 | |||
01ff4f82ba | |||
4eb2a49d41 | |||
0a9d7c7edb | |||
be9fa192f0 | |||
9c35c132fa | |||
b9c77b98d5 | |||
f040a43cb3 | |||
35115eaf93 | |||
009101de12 | |||
fea15cc9f5 | |||
a28dfc8659 | |||
c03c12687f | |||
8831c68803 | |||
bcd4aa8fe0 | |||
a69ec2c722 | |||
7d03c53718 | |||
3a9c88377f | |||
647c983530 | |||
4e0cba1053 | |||
c94455651e | |||
25eae7b0ae | |||
cd30565aed | |||
8edc898f63 | |||
6c65cb2492 | |||
a2da2b4109 | |||
35becc6d84 | |||
506e5bb0c8 | |||
e485829a41 | |||
7e60205bd3 | |||
64326dccfb | |||
e5c78c6684 | |||
fa5222c296 | |||
0dd5f55ac8 | |||
b3628f117e | |||
ab90d4cddd | |||
dc5df92fa8 | |||
3cf12b235a | |||
eed51c5bdf | |||
3f60a60eed | |||
751beb9e73 | |||
793dcd236b | |||
2e4db64cab | |||
c9fd350567 | |||
93f563b8a8 | |||
e048c7f1c8 | |||
d3d56f9a0b | |||
766c6b2ce3 | |||
77966a43a4 | |||
bcd607542c | |||
2e8c5c00ec | |||
2860377021 | |||
c18bdb4433 | |||
d0d9b384f2 | |||
ca4e7aaa72 | |||
193e2df8ba | |||
c64de50ea4 | |||
b96149a19b | |||
be3b9bcf4d | |||
186f75342e | |||
e626eecc25 | |||
99709ee61d | |||
8da280ebbe | |||
e5fc98c542 | |||
7176674849 | |||
7fb94ab934 | |||
2feb29c0ff | |||
2c9991496b | |||
17595ef2de | |||
67f4dd56a3 | |||
ecf3ea197e | |||
87c1244c7d | |||
b3d86162b0 | |||
d57763f582 | |||
78cf7b4ab4 | |||
a58361f197 | |||
786cc41299 | |||
ecc0b54bec | |||
8b1b93947f | |||
8809eb6c93 | |||
e1bfad4846 | |||
d821358884 | |||
37378898a2 | |||
4a4b0e5783 | |||
ae88eb88a4 | |||
e1eab59aac | |||
087798b7fa | |||
0f544625f4 | |||
0cf88ff084 | |||
52c53f39d0 | |||
4946c2c500 | |||
d23eed85bb | |||
1cbb32a542 | |||
ce52177638 | |||
d3fcec1a3e | |||
93f335ef86 | |||
b3caec5a56 | |||
85fff78c2d | |||
13bf0d4659 | |||
91aab2a6d3 | |||
32a227f507 | |||
ffe9075f48 | |||
3b0a14b761 | |||
dcb50eaa4b | |||
c8ea286048 | |||
485adde742 | |||
bc659f86ad | |||
1df6f26214 | |||
770f805ae5 | |||
ed3b62cd3b | |||
632f2d2df9 | |||
b13abfa9fe | |||
270fa2f20b | |||
a3a3180c86 | |||
e7c0a8ddce | |||
e622790a93 | |||
df34f22854 | |||
0876b77f7f | |||
81e1e2489f | |||
174cdbccde | |||
1db916b5be | |||
68f77303b2 | |||
a2b6918a11 | |||
5c858448d3 | |||
c9f67e037c | |||
150f3cd9fa | |||
d429c15f25 | |||
4fa7892d64 | |||
6a26e19ea3 | |||
63c45056aa | |||
fc5a38ac92 | |||
c45d8ac554 | |||
0812aee2c3 | |||
f2b873e995 | |||
83fdbd6043 | |||
7183cded4e | |||
fa7daa247d | |||
a994bf4076 | |||
c6d9d5394e | |||
793262e8ec | |||
3ba5470eb8 | |||
0a7c8bdcac | |||
3113e967db | |||
04826b0f2c | |||
e60e8a6068 | |||
063be09b71 | |||
4450f5ef6b | |||
dc13e276ee | |||
8a8aa59d8c | |||
836b40be82 | |||
66d50ca6ae | |||
f9f3bdd60b | |||
52ff0590ff | |||
511bce58bd | |||
258eb50086 | |||
d787c6be8c | |||
ed302a73f4 | |||
89d47230d7 | |||
7f7c41b0c1 | |||
be57c8eeef | |||
8c7267f1cf | |||
7b3bb8c00f | |||
257a35134a | |||
c588453a0f | |||
d6f06c03f4 | |||
532a81d3d6 | |||
296f006132 | |||
298107fed7 | |||
0541442558 | |||
3951c2c189 | |||
ec2c339b53 | |||
21f0196412 | |||
0aaedcc02f | |||
32167cdf4b |
29  .circleci/config.yml  Normal file

@ -0,0 +1,29 @@
version: 2
jobs:
  build_py3:
    working_directory: ~/pytorch-pretrained-BERT
    docker:
      - image: circleci/python:3.5
    steps:
      - checkout
      - run: sudo pip install --progress-bar off .
      - run: sudo pip install pytest ftfy spacy
      - run: sudo python -m spacy download en
      - run: python -m pytest -sv tests/
  build_py2:
    working_directory: ~/pytorch-pretrained-BERT
    docker:
      - image: circleci/python:2.7
    steps:
      - checkout
      - run: sudo pip install --progress-bar off .
      - run: sudo pip install pytest spacy
      - run: sudo pip install ftfy==4.4.3
      - run: sudo python -m spacy download en
      - run: python -m pytest -sv tests/
workflows:
  version: 2
  build_and_test:
    jobs:
      - build_py3
      - build_py2
5  .gitignore  vendored

@ -119,4 +119,7 @@ dmypy.json
.vscode

# TF code
tensorflow_code

# Models
models
1  MANIFEST.in  Normal file

@ -0,0 +1 @@
include LICENSE

@ -1,2 +0,0 @@
#!/bin/sh
python -m pytorch_pretrained_bert "$@"
7  docker/Dockerfile  Normal file

@ -0,0 +1,7 @@
FROM pytorch/pytorch:latest

RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext

RUN pip install pytorch-pretrained-bert

WORKDIR /workspace
@ -28,7 +28,7 @@ import torch
|
||||
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
|
||||
from pytorch_pretrained_bert.tokenization import convert_to_unicode, BertTokenizer
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
from pytorch_pretrained_bert.modeling import BertModel
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
@ -80,10 +80,10 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
|
||||
# The convention in BERT is:
|
||||
# (a) For sequence pairs:
|
||||
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
|
||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
||||
# (b) For single sequences:
|
||||
# tokens: [CLS] the dog is hairy . [SEP]
|
||||
# type_ids: 0 0 0 0 0 0 0
|
||||
# type_ids: 0 0 0 0 0 0 0
|
||||
#
|
||||
# Where "type_ids" are used to indicate whether this is the first
|
||||
# sequence or the second sequence. The embedding vectors for `type=0` and
|
||||
@ -168,9 +168,9 @@ def read_examples(input_file):
|
||||
"""Read a list of `InputExample`s from an input file."""
|
||||
examples = []
|
||||
unique_id = 0
|
||||
with open(input_file, "r") as reader:
|
||||
with open(input_file, "r", encoding='utf-8') as reader:
|
||||
while True:
|
||||
line = convert_to_unicode(reader.readline())
|
||||
line = reader.readline()
|
||||
if not line:
|
||||
break
|
||||
line = line.strip()
|
||||
@ -199,6 +199,7 @@ def main():
|
||||
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
|
||||
parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
|
||||
parser.add_argument("--max_seq_length", default=128, type=int,
|
||||
help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
|
||||
@ -209,7 +210,6 @@ def main():
|
||||
default=-1,
|
||||
help = "local_rank for distributed training on gpus")
|
||||
parser.add_argument("--no_cuda",
|
||||
default=False,
|
||||
action='store_true',
|
||||
help="Whether not to use CUDA when available")
|
||||
|
||||
@ -227,7 +227,7 @@ def main():
|
||||
|
||||
layer_indexes = [int(x) for x in args.layers.split(",")]
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
examples = read_examples(args.input_file)
|
||||
|
||||
|
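The hunks above drop the old `convert_to_unicode` helper (files are now opened with an explicit `encoding='utf-8'`) and thread the new `--do_lower_case` flag through to the tokenizer instead of relying on the model name. A minimal sketch of the updated usage; the model name and input file are placeholders:

```python
from pytorch_pretrained_bert.tokenization import BertTokenizer

# Lowercasing is now an explicit flag rather than being inferred from the checkpoint name.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Reading the file as UTF-8 makes the convert_to_unicode() call unnecessary.
with open("input.txt", "r", encoding="utf-8") as reader:
    for line in reader:
        tokens = tokenizer.tokenize(line.strip())
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
```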
@ -1,5 +1,6 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@ -14,27 +15,28 @@
|
||||
# limitations under the License.
|
||||
"""BERT finetuning runner."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import csv
|
||||
import os
|
||||
import logging
|
||||
import argparse
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from tqdm import tqdm, trange
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_pretrained_bert.tokenization import printable_text, convert_to_unicode, BertTokenizer
|
||||
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
|
||||
from pytorch_pretrained_bert.optimization import BertAdam
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -93,6 +95,8 @@ class DataProcessor(object):
|
||||
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
|
||||
lines = []
|
||||
for line in reader:
|
||||
if sys.version_info[0] == 2:
|
||||
line = list(unicode(cell, 'utf-8') for cell in line)
|
||||
lines.append(line)
|
||||
return lines
|
||||
|
||||
@ -122,9 +126,9 @@ class MrpcProcessor(DataProcessor):
|
||||
if i == 0:
|
||||
continue
|
||||
guid = "%s-%s" % (set_type, i)
|
||||
text_a = convert_to_unicode(line[3])
|
||||
text_b = convert_to_unicode(line[4])
|
||||
label = convert_to_unicode(line[0])
|
||||
text_a = line[3]
|
||||
text_b = line[4]
|
||||
label = line[0]
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
|
||||
return examples
|
||||
@ -154,10 +158,10 @@ class MnliProcessor(DataProcessor):
|
||||
for (i, line) in enumerate(lines):
|
||||
if i == 0:
|
||||
continue
|
||||
guid = "%s-%s" % (set_type, convert_to_unicode(line[0]))
|
||||
text_a = convert_to_unicode(line[8])
|
||||
text_b = convert_to_unicode(line[9])
|
||||
label = convert_to_unicode(line[-1])
|
||||
guid = "%s-%s" % (set_type, line[0])
|
||||
text_a = line[8]
|
||||
text_b = line[9]
|
||||
label = line[-1]
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
|
||||
return examples
|
||||
@ -185,8 +189,8 @@ class ColaProcessor(DataProcessor):
|
||||
examples = []
|
||||
for (i, line) in enumerate(lines):
|
||||
guid = "%s-%s" % (set_type, i)
|
||||
text_a = convert_to_unicode(line[3])
|
||||
label = convert_to_unicode(line[1])
|
||||
text_a = line[3]
|
||||
label = line[1]
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
|
||||
return examples
|
||||
@ -195,9 +199,7 @@ class ColaProcessor(DataProcessor):
|
||||
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
|
||||
"""Loads a data file into a list of `InputBatch`s."""
|
||||
|
||||
label_map = {}
|
||||
for (i, label) in enumerate(label_list):
|
||||
label_map[label] = i
|
||||
label_map = {label : i for i, label in enumerate(label_list)}
|
||||
|
||||
features = []
|
||||
for (ex_index, example) in enumerate(examples):
|
||||
@ -206,8 +208,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
||||
tokens_b = None
|
||||
if example.text_b:
|
||||
tokens_b = tokenizer.tokenize(example.text_b)
|
||||
|
||||
if tokens_b:
|
||||
# Modifies `tokens_a` and `tokens_b` in place so that the total
|
||||
# length is less than the specified length.
|
||||
# Account for [CLS], [SEP], [SEP] with "- 3"
|
||||
@ -215,7 +215,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
||||
else:
|
||||
# Account for [CLS] and [SEP] with "- 2"
|
||||
if len(tokens_a) > max_seq_length - 2:
|
||||
tokens_a = tokens_a[0:(max_seq_length - 2)]
|
||||
tokens_a = tokens_a[:(max_seq_length - 2)]
|
||||
|
||||
# The convention in BERT is:
|
||||
# (a) For sequence pairs:
|
||||
@ -235,22 +235,12 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
||||
# For classification tasks, the first vector (corresponding to [CLS]) is
|
||||
# used as as the "sentence vector". Note that this only makes sense because
|
||||
# the entire model is fine-tuned.
|
||||
tokens = []
|
||||
segment_ids = []
|
||||
tokens.append("[CLS]")
|
||||
segment_ids.append(0)
|
||||
for token in tokens_a:
|
||||
tokens.append(token)
|
||||
segment_ids.append(0)
|
||||
tokens.append("[SEP]")
|
||||
segment_ids.append(0)
|
||||
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
|
||||
segment_ids = [0] * len(tokens)
|
||||
|
||||
if tokens_b:
|
||||
for token in tokens_b:
|
||||
tokens.append(token)
|
||||
segment_ids.append(1)
|
||||
tokens.append("[SEP]")
|
||||
segment_ids.append(1)
|
||||
tokens += tokens_b + ["[SEP]"]
|
||||
segment_ids += [1] * (len(tokens_b) + 1)
|
||||
|
||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
|
||||
@ -259,10 +249,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
||||
input_mask = [1] * len(input_ids)
|
||||
|
||||
# Zero-pad up to the sequence length.
|
||||
while len(input_ids) < max_seq_length:
|
||||
input_ids.append(0)
|
||||
input_mask.append(0)
|
||||
segment_ids.append(0)
|
||||
padding = [0] * (max_seq_length - len(input_ids))
|
||||
input_ids += padding
|
||||
input_mask += padding
|
||||
segment_ids += padding
|
||||
|
||||
assert len(input_ids) == max_seq_length
|
||||
assert len(input_mask) == max_seq_length
|
||||
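Taken together, the hunks above replace the token-by-token loops with list concatenation and do the zero-padding in one step. Written out in isolation (with `tokens_a`, `tokens_b`, `tokenizer`, and `max_seq_length` supplied by the surrounding function), the refactored logic is roughly:

```python
# Build the [CLS] ... [SEP] ... [SEP] layout and matching segment ids.
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
segment_ids = [0] * len(tokens)
if tokens_b:
    tokens += tokens_b + ["[SEP]"]
    segment_ids += [1] * (len(tokens_b) + 1)

input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)   # 1 for real tokens, 0 for padding

# Pad all three lists to max_seq_length in a single step.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding
```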
@ -273,7 +263,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
||||
logger.info("*** Example ***")
|
||||
logger.info("guid: %s" % (example.guid))
|
||||
logger.info("tokens: %s" % " ".join(
|
||||
[printable_text(x) for x in tokens]))
|
||||
[str(x) for x in tokens]))
|
||||
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
|
||||
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
|
||||
logger.info(
|
||||
@ -308,35 +298,6 @@ def accuracy(out, labels):
|
||||
outputs = np.argmax(out, axis=1)
|
||||
return np.sum(outputs == labels)
|
||||
|
||||
def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
|
||||
""" Utility function for optimize_on_cpu and 16-bits training.
|
||||
Copy the parameters optimized on CPU/RAM back to the model on GPU
|
||||
"""
|
||||
for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
|
||||
if name_opti != name_model:
|
||||
logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
|
||||
raise ValueError
|
||||
param_model.data.copy_(param_opti.data)
|
||||
|
||||
def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
|
||||
""" Utility function for optimize_on_cpu and 16-bits training.
|
||||
Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model
|
||||
"""
|
||||
is_nan = False
|
||||
for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
|
||||
if name_opti != name_model:
|
||||
logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
|
||||
raise ValueError
|
||||
if param_model.grad is not None:
|
||||
if test_nan and torch.isnan(param_model.grad).sum() > 0:
|
||||
is_nan = True
|
||||
if param_opti.grad is None:
|
||||
param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
|
||||
param_opti.grad.data.copy_(param_model.grad.data)
|
||||
else:
|
||||
param_opti.grad = None
|
||||
return is_nan
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
@ -348,7 +309,8 @@ def main():
|
||||
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
|
||||
parser.add_argument("--bert_model", default=None, type=str, required=True,
|
||||
help="Bert pre-trained model selected in the list: bert-base-uncased, "
|
||||
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
|
||||
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
|
||||
"bert-base-multilingual-cased, bert-base-chinese.")
|
||||
parser.add_argument("--task_name",
|
||||
default=None,
|
||||
type=str,
|
||||
@ -358,9 +320,13 @@ def main():
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model checkpoints will be written.")
|
||||
help="The output directory where the model predictions and checkpoints will be written.")
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--cache_dir",
|
||||
default="",
|
||||
type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3")
|
||||
parser.add_argument("--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
@ -368,13 +334,14 @@ def main():
|
||||
"Sequences longer than this will be truncated, and sequences shorter \n"
|
||||
"than this will be padded.")
|
||||
parser.add_argument("--do_train",
|
||||
default=False,
|
||||
action='store_true',
|
||||
help="Whether to run training.")
|
||||
parser.add_argument("--do_eval",
|
||||
default=False,
|
||||
action='store_true',
|
||||
help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--do_lower_case",
|
||||
action='store_true',
|
||||
help="Set this flag if you are using an uncased model.")
|
||||
parser.add_argument("--train_batch_size",
|
||||
default=32,
|
||||
type=int,
|
||||
@ -397,59 +364,68 @@ def main():
|
||||
help="Proportion of training to perform linear learning rate warmup for. "
|
||||
"E.g., 0.1 = 10%% of training.")
|
||||
parser.add_argument("--no_cuda",
|
||||
default=False,
|
||||
action='store_true',
|
||||
help="Whether not to use CUDA when available")
|
||||
parser.add_argument("--local_rank",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="local_rank for distributed training on gpus")
|
||||
parser.add_argument('--seed',
|
||||
type=int,
|
||||
parser.add_argument('--seed',
|
||||
type=int,
|
||||
default=42,
|
||||
help="random seed for initialization")
|
||||
parser.add_argument('--gradient_accumulation_steps',
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumualte before performing a backward/update pass.")
|
||||
parser.add_argument('--optimize_on_cpu',
|
||||
default=False,
|
||||
action='store_true',
|
||||
help="Whether to perform optimization and keep the optimizer averages on CPU")
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||
parser.add_argument('--fp16',
|
||||
default=False,
|
||||
action='store_true',
|
||||
help="Whether to use 16-bit float precision instead of 32-bit")
|
||||
parser.add_argument('--loss_scale',
|
||||
type=float, default=128,
|
||||
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
|
||||
|
||||
type=float, default=0,
|
||||
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
|
||||
"0 (default value): dynamic loss scaling.\n"
|
||||
"Positive power of 2: static loss scaling value.\n")
|
||||
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
processors = {
|
||||
"cola": ColaProcessor,
|
||||
"mnli": MnliProcessor,
|
||||
"mrpc": MrpcProcessor,
|
||||
}
|
||||
|
||||
num_labels_task = {
|
||||
"cola": 2,
|
||||
"mnli": 3,
|
||||
"mrpc": 2,
|
||||
}
|
||||
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
n_gpu = torch.cuda.device_count()
|
||||
else:
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
n_gpu = 1
|
||||
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
torch.distributed.init_process_group(backend='nccl')
|
||||
if args.fp16:
|
||||
logger.info("16-bits training currently not supported in distributed training")
|
||||
args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
|
||||
logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
|
||||
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
|
||||
device, n_gpu, bool(args.local_rank != -1), args.fp16))
|
||||
|
||||
if args.gradient_accumulation_steps < 1:
|
||||
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
|
||||
args.gradient_accumulation_steps))
|
||||
|
||||
args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
|
||||
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
|
||||
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
@ -460,9 +436,10 @@ def main():
|
||||
if not args.do_train and not args.do_eval:
|
||||
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
|
||||
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
task_name = args.task_name.lower()
|
||||
|
||||
@ -470,56 +447,77 @@ def main():
|
||||
raise ValueError("Task not found: %s" % (task_name))
|
||||
|
||||
processor = processors[task_name]()
|
||||
num_labels = num_labels_task[task_name]
|
||||
label_list = processor.get_labels()
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
train_examples = None
|
||||
num_train_steps = None
|
||||
num_train_optimization_steps = None
|
||||
if args.do_train:
|
||||
train_examples = processor.get_train_examples(args.data_dir)
|
||||
num_train_steps = int(
|
||||
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
|
||||
num_train_optimization_steps = int(
|
||||
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
|
||||
if args.local_rank != -1:
|
||||
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
|
||||
|
||||
# Prepare model
|
||||
model = BertForSequenceClassification.from_pretrained(args.bert_model, len(label_list),
|
||||
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
|
||||
cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
|
||||
model = BertForSequenceClassification.from_pretrained(args.bert_model,
|
||||
cache_dir=cache_dir,
|
||||
num_labels = num_labels)
|
||||
if args.fp16:
|
||||
model.half()
|
||||
model.to(device)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
||||
output_device=args.local_rank)
|
||||
try:
|
||||
from apex.parallel import DistributedDataParallel as DDP
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||
|
||||
model = DDP(model)
|
||||
elif n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Prepare optimizer
|
||||
if args.fp16:
|
||||
param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
|
||||
for n, param in model.named_parameters()]
|
||||
elif args.optimize_on_cpu:
|
||||
param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
|
||||
for n, param in model.named_parameters()]
|
||||
else:
|
||||
param_optimizer = list(model.named_parameters())
|
||||
no_decay = ['bias', 'gamma', 'beta']
|
||||
param_optimizer = list(model.named_parameters())
|
||||
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
|
||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
|
||||
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
|
||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
optimizer = BertAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
warmup=args.warmup_proportion,
|
||||
t_total=num_train_steps)
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex.optimizers import FP16_Optimizer
|
||||
from apex.optimizers import FusedAdam
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||
|
||||
optimizer = FusedAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
bias_correction=False,
|
||||
max_grad_norm=1.0)
|
||||
if args.loss_scale == 0:
|
||||
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
|
||||
else:
|
||||
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
|
||||
|
||||
else:
|
||||
optimizer = BertAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
warmup=args.warmup_proportion,
|
||||
t_total=num_train_optimization_steps)
|
||||
|
||||
global_step = 0
|
||||
nb_tr_steps = 0
|
||||
tr_loss = 0
|
||||
if args.do_train:
|
||||
train_features = convert_examples_to_features(
|
||||
train_examples, label_list, args.max_seq_length, tokenizer)
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_examples))
|
||||
logger.info(" Batch size = %d", args.train_batch_size)
|
||||
logger.info(" Num steps = %d", num_train_steps)
|
||||
logger.info(" Num steps = %d", num_train_optimization_steps)
|
||||
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
|
||||
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
|
||||
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
|
||||
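The optimizer hunk above renames `weight_decay_rate` to `weight_decay` and switches the `no_decay` filter from `gamma`/`beta` to the LayerNorm parameter names. As a standalone sketch (the learning rate and warmup proportion below are example values; `model` and `num_train_optimization_steps` come from the surrounding script):

```python
from pytorch_pretrained_bert.optimization import BertAdam

# Exclude biases and LayerNorm weights from weight decay, as in the hunk above.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=5e-5,      # example value
                     warmup=0.1,   # example value
                     t_total=num_train_optimization_steps)
```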
@ -538,40 +536,49 @@ def main():
|
||||
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
input_ids, input_mask, segment_ids, label_ids = batch
|
||||
loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
|
||||
loss = model(input_ids, segment_ids, input_mask, label_ids)
|
||||
if n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu.
|
||||
if args.fp16 and args.loss_scale != 1.0:
|
||||
# rescale loss for fp16 training
|
||||
# see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
|
||||
loss = loss * args.loss_scale
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
loss.backward()
|
||||
|
||||
if args.fp16:
|
||||
optimizer.backward(loss)
|
||||
else:
|
||||
loss.backward()
|
||||
|
||||
tr_loss += loss.item()
|
||||
nb_tr_examples += input_ids.size(0)
|
||||
nb_tr_steps += 1
|
||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||
if args.fp16 or args.optimize_on_cpu:
|
||||
if args.fp16 and args.loss_scale != 1.0:
|
||||
# scale down gradients for fp16 training
|
||||
for param in model.parameters():
|
||||
if param.grad is not None:
|
||||
param.grad.data = param.grad.data / args.loss_scale
|
||||
is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
|
||||
if is_nan:
|
||||
logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
|
||||
args.loss_scale = args.loss_scale / 2
|
||||
model.zero_grad()
|
||||
continue
|
||||
optimizer.step()
|
||||
copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
|
||||
else:
|
||||
optimizer.step()
|
||||
model.zero_grad()
|
||||
if args.fp16:
|
||||
# modify learning rate with special warm up BERT uses
|
||||
# if args.fp16 is False, BertAdam is used that handles this automatically
|
||||
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
|
||||
for param_group in optimizer.param_groups:
|
||||
param_group['lr'] = lr_this_step
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
if args.do_eval:
|
||||
if args.do_train:
|
||||
# Save a trained model and the associated configuration
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||
torch.save(model_to_save.state_dict(), output_model_file)
|
||||
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
||||
with open(output_config_file, 'w') as f:
|
||||
f.write(model_to_save.config.to_json_string())
|
||||
|
||||
# Load a trained model and config that you have fine-tuned
|
||||
config = BertConfig(output_config_file)
|
||||
model = BertForSequenceClassification(config, num_labels=num_labels)
|
||||
model.load_state_dict(torch.load(output_model_file))
|
||||
else:
|
||||
model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
|
||||
model.to(device)
|
||||
|
||||
if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
eval_examples = processor.get_dev_examples(args.data_dir)
|
||||
eval_features = convert_examples_to_features(
|
||||
eval_examples, label_list, args.max_seq_length, tokenizer)
|
||||
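After training, the script now writes both the weights and the configuration to `--output_dir` and reloads them before evaluation. Pulled out of the hunk above (with `args`, `model`, and `num_labels` as defined earlier in the script):

```python
import os
import torch
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME

# Unwrap DataParallel/DistributedDataParallel before saving, then store weights and config separately.
model_to_save = model.module if hasattr(model, 'module') else model
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
with open(output_config_file, 'w') as f:
    f.write(model_to_save.config.to_json_string())

# Rebuild the model from the saved config and weights for evaluation.
config = BertConfig(output_config_file)
model = BertForSequenceClassification(config, num_labels=num_labels)
model.load_state_dict(torch.load(output_model_file))
```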
@ -583,23 +590,23 @@ def main():
|
||||
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
|
||||
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
|
||||
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
|
||||
if args.local_rank == -1:
|
||||
eval_sampler = SequentialSampler(eval_data)
|
||||
else:
|
||||
eval_sampler = DistributedSampler(eval_data)
|
||||
# Run prediction for full data
|
||||
eval_sampler = SequentialSampler(eval_data)
|
||||
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
|
||||
model.eval()
|
||||
eval_loss, eval_accuracy = 0, 0
|
||||
nb_eval_steps, nb_eval_examples = 0, 0
|
||||
for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
|
||||
|
||||
for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
|
||||
input_ids = input_ids.to(device)
|
||||
input_mask = input_mask.to(device)
|
||||
segment_ids = segment_ids.to(device)
|
||||
label_ids = label_ids.to(device)
|
||||
|
||||
with torch.no_grad():
|
||||
tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)
|
||||
tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
|
||||
logits = model(input_ids, segment_ids, input_mask)
|
||||
|
||||
logits = logits.detach().cpu().numpy()
|
||||
label_ids = label_ids.to('cpu').numpy()
|
||||
@ -613,11 +620,11 @@ def main():
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
eval_accuracy = eval_accuracy / nb_eval_examples
|
||||
|
||||
loss = tr_loss/nb_tr_steps if args.do_train else None
|
||||
result = {'eval_loss': eval_loss,
|
||||
'eval_accuracy': eval_accuracy,
|
||||
'global_step': global_step,
|
||||
'loss': tr_loss/nb_tr_steps}
|
||||
'loss': loss}
|
||||
|
||||
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
|
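One detail worth calling out from the training loop above: with `--fp16`, `FusedAdam` replaces `BertAdam`, so the linear warmup that `BertAdam` applies internally has to be reproduced by hand before each optimizer step. Condensed from the loop (with `args`, `optimizer`, `global_step`, and `num_train_optimization_steps` from the surrounding code):

```python
from pytorch_pretrained_bert.optimization import warmup_linear

if args.fp16:
    # BertAdam handles warmup itself; FusedAdam does not, so scale the LR manually.
    lr_this_step = args.learning_rate * warmup_linear(
        global_step / num_train_optimization_steps, args.warmup_proportion)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
```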
105  examples/run_gpt2.py  Normal file
@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from tqdm import trange
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import numpy as np
|
||||
|
||||
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def top_k_logits(logits, k):
|
||||
if k == 0:
|
||||
return logits
|
||||
values, _ = torch.topk(logits, k)
|
||||
min_values = values[:, -1]
|
||||
return torch.where(logits < min_values, torch.ones_like(logits, dtype=logits.dtype) * -1e10, logits)
|
||||
|
||||
def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True):
|
||||
if start_token is None:
|
||||
assert context is not None, 'Specify exactly one of start_token and context!'
|
||||
context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
|
||||
else:
|
||||
assert context is None, 'Specify exactly one of start_token and context!'
|
||||
context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)
|
||||
prev = context
|
||||
output = context
|
||||
past = None
|
||||
with torch.no_grad():
|
||||
for i in trange(length):
|
||||
logits, past = model(prev, past=past)
|
||||
logits = logits[:, -1, :] / temperature
|
||||
logits = top_k_logits(logits, k=top_k)
|
||||
log_probs = F.softmax(logits, dim=-1)
|
||||
if sample:
|
||||
prev = torch.multinomial(log_probs, num_samples=1)
|
||||
else:
|
||||
_, prev = torch.topk(log_probs, k=1, dim=-1)
|
||||
output = torch.cat((output, prev), dim=1)
|
||||
return output
|
||||
|
||||
def run_model():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint')
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--nsamples", type=int, default=1)
|
||||
parser.add_argument("--batch_size", type=int, default=-1)
|
||||
parser.add_argument("--length", type=int, default=-1)
|
||||
parser.add_argument("--temperature", type=int, default=1)
|
||||
parser.add_argument("--top_k", type=int, default=0)
|
||||
parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
if args.batch_size == -1:
|
||||
args.batch_size = 1
|
||||
assert args.nsamples % args.batch_size == 0
|
||||
|
||||
np.random.seed(args.seed)
|
||||
torch.random.manual_seed(args.seed)
|
||||
torch.cuda.manual_seed(args.seed)
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
|
||||
model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
|
||||
model.to(device)
|
||||
model.eval()
|
||||
|
||||
if args.length == -1:
|
||||
args.length = model.config.n_ctx // 2
|
||||
elif args.length > model.config.n_ctx:
|
||||
raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
|
||||
|
||||
while not args.unconditional:
|
||||
if not args.unconditional:
|
||||
raw_text = input("Model prompt >>> ")
|
||||
while not raw_text:
|
||||
print('Prompt should not be empty!')
|
||||
raw_text = input("Model prompt >>> ")
|
||||
context_tokens = enc.encode(raw_text)
|
||||
generated = 0
|
||||
for _ in range(args.nsamples // args.batch_size):
|
||||
out = sample_sequence(
|
||||
model=model, length=args.length,
|
||||
context=context_tokens if not args.unconditional else None,
|
||||
start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
|
||||
batch_size=args.batch_size,
|
||||
temperature=args.temperature, top_k=args.top_k, device=device
|
||||
)
|
||||
out = out[:, len(context_tokens):].tolist()
|
||||
for i in range(args.batch_size):
|
||||
generated += 1
|
||||
text = enc.decode(out[i])
|
||||
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
|
||||
print(text)
|
||||
print("=" * 80)
|
||||
|
||||
if __name__ == '__main__':
|
||||
run_model()
|
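`top_k_logits` in the script above keeps only the `k` largest logits per row and pushes everything else to a large negative value so it vanishes after the softmax. A tiny self-contained check (the function is copied from the script; the logits are made up):

```python
import torch
import torch.nn.functional as F

def top_k_logits(logits, k):
    # Copied from examples/run_gpt2.py above.
    if k == 0:
        return logits
    values, _ = torch.topk(logits, k)
    min_values = values[:, -1]
    return torch.where(logits < min_values,
                       torch.ones_like(logits, dtype=logits.dtype) * -1e10,
                       logits)

logits = torch.tensor([[2.0, 0.5, -1.0, 3.0]])
probs = F.softmax(top_k_logits(logits, k=2), dim=-1)
print(probs)  # only the 3.0 and 2.0 entries keep non-negligible probability
```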
88  examples/run_gpt2_generate_unconditional_samples.py  Normal file
@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import numpy as np
|
||||
from tqdm import trange
|
||||
|
||||
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def top_k_logits(logits, k):
|
||||
if k == 0:
|
||||
return logits
|
||||
values, _ = torch.topk(logits, k)
|
||||
min_values = values[:, -1]
|
||||
return torch.where(logits < min_values, torch.ones_like(logits, dtype=logits.dtype) * -1e10, logits)
|
||||
|
||||
def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda'):
|
||||
if start_token is None:
|
||||
assert context is not None, 'Specify exactly one of start_token and context!'
|
||||
context = torch.tensor(context, device=device, dtype=torch.long)
|
||||
else:
|
||||
assert context is None, 'Specify exactly one of start_token and context!'
|
||||
context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)
|
||||
prev = context
|
||||
output = context
|
||||
past = None
|
||||
with torch.no_grad():
|
||||
for i in trange(length):
|
||||
logits, past = model(prev, past=past)
|
||||
logits = logits[:, -1, :] / temperature
|
||||
logits = top_k_logits(logits, k=top_k)
|
||||
log_probs = F.softmax(logits, dim=-1)
|
||||
prev = torch.multinomial(log_probs, num_samples=1)
|
||||
output = torch.cat((output, prev), dim=1)
|
||||
return output
|
||||
|
||||
def sample_model():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint')
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--nsamples", type=int, default=0)
|
||||
parser.add_argument("--batch_size", type=int, default=1)
|
||||
parser.add_argument("--length", type=int, default=-1)
|
||||
parser.add_argument("--temperature", type=int, default=1)
|
||||
parser.add_argument("--top_k", type=int, default=0)
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
np.random.seed(args.seed)
|
||||
torch.random.manual_seed(args.seed)
|
||||
torch.cuda.manual_seed(args.seed)
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
|
||||
model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
|
||||
model.to(device)
|
||||
model.eval()
|
||||
|
||||
if args.length == -1:
|
||||
args.length = model.config.n_ctx
|
||||
elif args.length > model.config.n_ctx:
|
||||
raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
|
||||
|
||||
generated = 0
|
||||
while args.nsamples == 0 or generated < args.nsamples:
|
||||
out = sample_sequence(
|
||||
model=model, length=args.length,
|
||||
start_token=enc.encoder['<|endoftext|>'],
|
||||
batch_size=args.batch_size,
|
||||
temperature=args.temperature, top_k=args.top_k, device=device
|
||||
)
|
||||
out = out.tolist()
|
||||
for i in range(args.batch_size):
|
||||
generated += args.batch_size
|
||||
text = enc.decode(out[i])
|
||||
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
|
||||
print(text)
|
||||
|
||||
if __name__ == '__main__':
|
||||
sample_model()
|
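Both GPT-2 examples above share the same loading pattern; only the sampling loop differs. A minimal sketch of loading the model and greedily extending a prompt (the prompt text and the 10-token horizon are arbitrary):

```python
import torch
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
enc = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.to(device)
model.eval()

# Greedy continuation: re-feed the full context and take the arg-max token each step.
context = torch.tensor([enc.encode("The weather today is")], device=device)
with torch.no_grad():
    for _ in range(10):
        logits, _ = model(context)                    # second return value is the key/value cache
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        context = torch.cat((context, next_token), dim=1)
print(enc.decode(context[0].tolist()))
```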
645  examples/run_lm_finetuning.py  Normal file
@ -0,0 +1,645 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""BERT finetuning runner."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from io import open
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, Dataset, RandomSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_pretrained_bert.modeling import BertForPreTraining
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||
|
||||
from torch.utils.data import Dataset
|
||||
import random
|
||||
|
||||
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt='%m/%d/%Y %H:%M:%S',
|
||||
level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BERTDataset(Dataset):
|
||||
def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True):
|
||||
self.vocab = tokenizer.vocab
|
||||
self.tokenizer = tokenizer
|
||||
self.seq_len = seq_len
|
||||
self.on_memory = on_memory
|
||||
self.corpus_lines = corpus_lines # number of non-empty lines in input corpus
|
||||
self.corpus_path = corpus_path
|
||||
self.encoding = encoding
|
||||
self.current_doc = 0 # to avoid random sentence from same doc
|
||||
|
||||
# for loading samples directly from file
|
||||
self.sample_counter = 0 # used to keep track of full epochs on file
|
||||
self.line_buffer = None # keep second sentence of a pair in memory and use as first sentence in next pair
|
||||
|
||||
# for loading samples in memory
|
||||
self.current_random_doc = 0
|
||||
self.num_docs = 0
|
||||
self.sample_to_doc = [] # map sample index to doc and line
|
||||
|
||||
# load samples into memory
|
||||
if on_memory:
|
||||
self.all_docs = []
|
||||
doc = []
|
||||
self.corpus_lines = 0
|
||||
with open(corpus_path, "r", encoding=encoding) as f:
|
||||
for line in tqdm(f, desc="Loading Dataset", total=corpus_lines):
|
||||
line = line.strip()
|
||||
if line == "":
|
||||
self.all_docs.append(doc)
|
||||
doc = []
|
||||
#remove last added sample because there won't be a subsequent line anymore in the doc
|
||||
self.sample_to_doc.pop()
|
||||
else:
|
||||
#store as one sample
|
||||
sample = {"doc_id": len(self.all_docs),
|
||||
"line": len(doc)}
|
||||
self.sample_to_doc.append(sample)
|
||||
doc.append(line)
|
||||
self.corpus_lines = self.corpus_lines + 1
|
||||
|
||||
# if last row in file is not empty
|
||||
if self.all_docs[-1] != doc:
|
||||
self.all_docs.append(doc)
|
||||
self.sample_to_doc.pop()
|
||||
|
||||
self.num_docs = len(self.all_docs)
|
||||
|
||||
# load samples later lazily from disk
|
||||
else:
|
||||
if self.corpus_lines is None:
|
||||
with open(corpus_path, "r", encoding=encoding) as f:
|
||||
self.corpus_lines = 0
|
||||
for line in tqdm(f, desc="Loading Dataset", total=corpus_lines):
|
||||
if line.strip() == "":
|
||||
self.num_docs += 1
|
||||
else:
|
||||
self.corpus_lines += 1
|
||||
|
||||
# if doc does not end with empty line
|
||||
if line.strip() != "":
|
||||
self.num_docs += 1
|
||||
|
||||
self.file = open(corpus_path, "r", encoding=encoding)
|
||||
self.random_file = open(corpus_path, "r", encoding=encoding)
|
||||
|
||||
def __len__(self):
|
||||
# last line of doc won't be used, because there's no "nextSentence". Additionally, we start counting at 0.
|
||||
return self.corpus_lines - self.num_docs - 1
|
||||
|
||||
def __getitem__(self, item):
|
||||
cur_id = self.sample_counter
|
||||
self.sample_counter += 1
|
||||
if not self.on_memory:
|
||||
# after one epoch we start again from beginning of file
|
||||
if cur_id != 0 and (cur_id % len(self) == 0):
|
||||
self.file.close()
|
||||
self.file = open(self.corpus_path, "r", encoding=self.encoding)
|
||||
|
||||
t1, t2, is_next_label = self.random_sent(item)
|
||||
|
||||
# tokenize
|
||||
tokens_a = self.tokenizer.tokenize(t1)
|
||||
tokens_b = self.tokenizer.tokenize(t2)
|
||||
|
||||
# combine to one sample
|
||||
cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label)
|
||||
|
||||
# transform sample to features
|
||||
cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer)
|
||||
|
||||
cur_tensors = (torch.tensor(cur_features.input_ids),
|
||||
torch.tensor(cur_features.input_mask),
|
||||
torch.tensor(cur_features.segment_ids),
|
||||
torch.tensor(cur_features.lm_label_ids),
|
||||
torch.tensor(cur_features.is_next))
|
||||
|
||||
return cur_tensors
|
||||
|
||||
def random_sent(self, index):
|
||||
"""
|
||||
Get one sample from corpus consisting of two sentences. With prob. 50% these are two subsequent sentences
|
||||
from one doc. With 50% the second sentence will be a random one from another doc.
|
||||
:param index: int, index of sample.
|
||||
:return: (str, str, int), sentence 1, sentence 2, isNextSentence Label
|
||||
"""
|
||||
t1, t2 = self.get_corpus_line(index)
|
||||
if random.random() > 0.5:
|
||||
label = 0
|
||||
else:
|
||||
t2 = self.get_random_line()
|
||||
label = 1
|
||||
|
||||
assert len(t1) > 0
|
||||
assert len(t2) > 0
|
||||
return t1, t2, label
|
||||
|
||||
def get_corpus_line(self, item):
|
||||
"""
|
||||
Get one sample from corpus consisting of a pair of two subsequent lines from the same doc.
|
||||
:param item: int, index of sample.
|
||||
:return: (str, str), two subsequent sentences from corpus
|
||||
"""
|
||||
t1 = ""
|
||||
t2 = ""
|
||||
assert item < self.corpus_lines
|
||||
if self.on_memory:
|
||||
sample = self.sample_to_doc[item]
|
||||
t1 = self.all_docs[sample["doc_id"]][sample["line"]]
|
||||
t2 = self.all_docs[sample["doc_id"]][sample["line"]+1]
|
||||
# used later to avoid random nextSentence from same doc
|
||||
self.current_doc = sample["doc_id"]
|
||||
return t1, t2
|
||||
else:
|
||||
if self.line_buffer is None:
|
||||
# read first non-empty line of file
|
||||
while t1 == "" :
|
||||
t1 = next(self.file).strip()
|
||||
t2 = next(self.file).strip()
|
||||
else:
|
||||
# use t2 from previous iteration as new t1
|
||||
t1 = self.line_buffer
|
||||
t2 = next(self.file).strip()
|
||||
# skip empty rows that are used for separating documents and keep track of current doc id
|
||||
while t2 == "" or t1 == "":
|
||||
t1 = next(self.file).strip()
|
||||
t2 = next(self.file).strip()
|
||||
self.current_doc = self.current_doc+1
|
||||
self.line_buffer = t2
|
||||
|
||||
assert t1 != ""
|
||||
assert t2 != ""
|
||||
return t1, t2
|
||||
|
||||
def get_random_line(self):
|
||||
"""
|
||||
Get random line from another document for nextSentence task.
|
||||
:return: str, content of one line
|
||||
"""
|
||||
# Similar to original tf repo: This outer loop should rarely go for more than one iteration for large
|
||||
# corpora. However, just to be careful, we try to make sure that
|
||||
# the random document is not the same as the document we're processing.
|
||||
for _ in range(10):
|
||||
if self.on_memory:
|
||||
rand_doc_idx = random.randint(0, len(self.all_docs)-1)
|
||||
rand_doc = self.all_docs[rand_doc_idx]
|
||||
line = rand_doc[random.randrange(len(rand_doc))]
|
||||
else:
|
||||
rand_index = random.randint(1, self.corpus_lines if self.corpus_lines < 1000 else 1000)
|
||||
#pick random line
|
||||
for _ in range(rand_index):
|
||||
line = self.get_next_line()
|
||||
#check if our picked random line is really from another doc like we want it to be
|
||||
if self.current_random_doc != self.current_doc:
|
||||
break
|
||||
return line
|
||||
|
||||
def get_next_line(self):
|
||||
""" Gets next line of random_file and starts over when reaching end of file"""
|
||||
try:
|
||||
line = next(self.random_file).strip()
|
||||
#keep track of which document we are currently looking at to later avoid having the same doc as t1
|
||||
if line == "":
|
||||
self.current_random_doc = self.current_random_doc + 1
|
||||
line = next(self.random_file).strip()
|
||||
except StopIteration:
|
||||
self.random_file.close()
|
||||
self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
|
||||
line = next(self.random_file).strip()
|
||||
return line
|
||||
|
||||
|
||||
class InputExample(object):
|
||||
"""A single training/test example for the language model."""
|
||||
|
||||
def __init__(self, guid, tokens_a, tokens_b=None, is_next=None, lm_labels=None):
|
||||
"""Constructs a InputExample.
|
||||
|
||||
Args:
|
||||
guid: Unique id for the example.
|
||||
tokens_a: string. The untokenized text of the first sequence. For single
|
||||
sequence tasks, only this sequence must be specified.
|
||||
tokens_b: (Optional) string. The untokenized text of the second sequence.
|
||||
Only must be specified for sequence pair tasks.
|
||||
label: (Optional) string. The label of the example. This should be
|
||||
specified for train and dev examples, but not for test examples.
|
||||
"""
|
||||
self.guid = guid
|
||||
self.tokens_a = tokens_a
|
||||
self.tokens_b = tokens_b
|
||||
self.is_next = is_next # nextSentence
|
||||
self.lm_labels = lm_labels # masked words for language model
|
||||
|
||||
|
||||
class InputFeatures(object):
|
||||
"""A single set of features of data."""
|
||||
|
||||
def __init__(self, input_ids, input_mask, segment_ids, is_next, lm_label_ids):
|
||||
self.input_ids = input_ids
|
||||
self.input_mask = input_mask
|
||||
self.segment_ids = segment_ids
|
||||
self.is_next = is_next
|
||||
self.lm_label_ids = lm_label_ids
|
||||
|
||||
|
||||
def random_word(tokens, tokenizer):
|
||||
"""
|
||||
Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
|
||||
:param tokens: list of str, tokenized sentence.
|
||||
:param tokenizer: Tokenizer, object used for tokenization (we need it's vocab here)
|
||||
:return: (list of str, list of int), masked tokens and related labels for LM prediction
|
||||
"""
|
||||
output_label = []
|
||||
|
||||
for i, token in enumerate(tokens):
|
||||
prob = random.random()
|
||||
# mask token with 15% probability
|
||||
if prob < 0.15:
|
||||
prob /= 0.15
|
||||
|
||||
# 80% randomly change token to mask token
|
||||
if prob < 0.8:
|
||||
tokens[i] = "[MASK]"
|
||||
|
||||
# 10% randomly change token to random token
|
||||
elif prob < 0.9:
|
||||
tokens[i] = random.choice(list(tokenizer.vocab.items()))[0]
|
||||
|
||||
# -> rest 10% randomly keep current token
|
||||
|
||||
# append current token to output (we will predict these later)
|
||||
try:
|
||||
output_label.append(tokenizer.vocab[token])
|
||||
except KeyError:
|
||||
# For unknown words (should not occur with BPE vocab)
|
||||
output_label.append(tokenizer.vocab["[UNK]"])
|
||||
logger.warning("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token))
|
||||
else:
|
||||
# no masking token (will be ignored by loss function later)
|
||||
output_label.append(-1)
|
||||
|
||||
return tokens, output_label
|
||||
|
||||
|
||||
def convert_example_to_features(example, max_seq_length, tokenizer):
|
||||
"""
|
||||
Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
|
||||
IDs, LM labels, input_mask, CLS and SEP tokens etc.
|
||||
:param example: InputExample, containing sentence input as strings and is_next label
|
||||
:param max_seq_length: int, maximum length of sequence.
|
||||
:param tokenizer: Tokenizer
|
||||
:return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
|
||||
"""
|
||||
tokens_a = example.tokens_a
|
||||
tokens_b = example.tokens_b
|
||||
# Modifies `tokens_a` and `tokens_b` in place so that the total
|
||||
# length is less than the specified length.
|
||||
# Account for [CLS], [SEP], [SEP] with "- 3"
|
||||
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
|
||||
|
||||
tokens_a, t1_label = random_word(tokens_a, tokenizer)
|
||||
tokens_b, t2_label = random_word(tokens_b, tokenizer)
|
||||
# concatenate lm labels and account for CLS, SEP, SEP
|
||||
lm_label_ids = ([-1] + t1_label + [-1] + t2_label + [-1])
|
||||
|
||||
# The convention in BERT is:
|
||||
# (a) For sequence pairs:
|
||||
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
|
||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
||||
# (b) For single sequences:
|
||||
# tokens: [CLS] the dog is hairy . [SEP]
|
||||
# type_ids: 0 0 0 0 0 0 0
|
||||
#
|
||||
# Where "type_ids" are used to indicate whether this is the first
|
||||
# sequence or the second sequence. The embedding vectors for `type=0` and
|
||||
# `type=1` were learned during pre-training and are added to the wordpiece
|
||||
# embedding vector (and position vector). This is not *strictly* necessary
|
||||
# since the [SEP] token unambiguously separates the sequences, but it makes
|
||||
# it easier for the model to learn the concept of sequences.
|
||||
#
|
||||
# For classification tasks, the first vector (corresponding to [CLS]) is
|
||||
# used as the "sentence vector". Note that this only makes sense because
|
||||
# the entire model is fine-tuned.
|
||||
tokens = []
|
||||
segment_ids = []
|
||||
tokens.append("[CLS]")
|
||||
segment_ids.append(0)
|
||||
for token in tokens_a:
|
||||
tokens.append(token)
|
||||
segment_ids.append(0)
|
||||
tokens.append("[SEP]")
|
||||
segment_ids.append(0)
|
||||
|
||||
assert len(tokens_b) > 0
|
||||
for token in tokens_b:
|
||||
tokens.append(token)
|
||||
segment_ids.append(1)
|
||||
tokens.append("[SEP]")
|
||||
segment_ids.append(1)
|
||||
|
||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
|
||||
# The mask has 1 for real tokens and 0 for padding tokens. Only real
|
||||
# tokens are attended to.
|
||||
input_mask = [1] * len(input_ids)
|
||||
|
||||
# Zero-pad up to the sequence length.
|
||||
while len(input_ids) < max_seq_length:
|
||||
input_ids.append(0)
|
||||
input_mask.append(0)
|
||||
segment_ids.append(0)
|
||||
lm_label_ids.append(-1)
|
||||
|
||||
assert len(input_ids) == max_seq_length
|
||||
assert len(input_mask) == max_seq_length
|
||||
assert len(segment_ids) == max_seq_length
|
||||
assert len(lm_label_ids) == max_seq_length
|
||||
|
||||
if example.guid < 5:
|
||||
logger.info("*** Example ***")
|
||||
logger.info("guid: %s" % (example.guid))
|
||||
logger.info("tokens: %s" % " ".join(
|
||||
[str(x) for x in tokens]))
|
||||
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
|
||||
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
|
||||
logger.info(
|
||||
"segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
|
||||
logger.info("LM label: %s " % (lm_label_ids))
|
||||
logger.info("Is next sentence label: %s " % (example.is_next))
|
||||
|
||||
features = InputFeatures(input_ids=input_ids,
|
||||
input_mask=input_mask,
|
||||
segment_ids=segment_ids,
|
||||
lm_label_ids=lm_label_ids,
|
||||
is_next=example.is_next)
|
||||
return features
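For intuition, a small self-contained check (illustrative only, values are made up) of how the LM labels built above line up with the [CLS]/[SEP] slots and the padding:

_t1_label = [5, -1, -1]      # e.g. labels from random_word on tokens_a
_t2_label = [-1, 7, -1]      # e.g. labels from random_word on tokens_b
_lm_label_ids = [-1] + _t1_label + [-1] + _t2_label + [-1]
_max_seq_length = 12
_lm_label_ids += [-1] * (_max_seq_length - len(_lm_label_ids))
assert len(_lm_label_ids) == _max_seq_length
assert _lm_label_ids[0] == _lm_label_ids[4] == _lm_label_ids[8] == -1  # [CLS], [SEP], [SEP]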
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
## Required parameters
|
||||
parser.add_argument("--train_file",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The input train corpus.")
|
||||
parser.add_argument("--bert_model", default=None, type=str, required=True,
|
||||
help="Bert pre-trained model selected in the list: bert-base-uncased, "
|
||||
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
|
||||
parser.add_argument("--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model checkpoints will be written.")
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after WordPiece tokenization. \n"
|
||||
"Sequences longer than this will be truncated, and sequences shorter \n"
|
||||
"than this will be padded.")
|
||||
parser.add_argument("--do_train",
|
||||
action='store_true',
|
||||
help="Whether to run training.")
|
||||
parser.add_argument("--train_batch_size",
|
||||
default=32,
|
||||
type=int,
|
||||
help="Total batch size for training.")
|
||||
parser.add_argument("--learning_rate",
|
||||
default=3e-5,
|
||||
type=float,
|
||||
help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--num_train_epochs",
|
||||
default=3.0,
|
||||
type=float,
|
||||
help="Total number of training epochs to perform.")
|
||||
parser.add_argument("--warmup_proportion",
|
||||
default=0.1,
|
||||
type=float,
|
||||
help="Proportion of training to perform linear learning rate warmup for. "
|
||||
"E.g., 0.1 = 10%% of training.")
|
||||
parser.add_argument("--no_cuda",
|
||||
action='store_true',
|
||||
help="Whether not to use CUDA when available")
|
||||
parser.add_argument("--on_memory",
|
||||
action='store_true',
|
||||
help="Whether to load train samples into memory or use disk")
|
||||
parser.add_argument("--do_lower_case",
|
||||
action='store_true',
|
||||
help="Whether to lower case the input text. True for uncased models, False for cased models.")
|
||||
parser.add_argument("--local_rank",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="local_rank for distributed training on gpus")
|
||||
parser.add_argument('--seed',
|
||||
type=int,
|
||||
default=42,
|
||||
help="random seed for initialization")
|
||||
parser.add_argument('--gradient_accumulation_steps',
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumualte before performing a backward/update pass.")
|
||||
parser.add_argument('--fp16',
|
||||
action='store_true',
|
||||
help="Whether to use 16-bit float precision instead of 32-bit")
|
||||
parser.add_argument('--loss_scale',
|
||||
type = float, default = 0,
|
||||
help = "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
|
||||
"0 (default value): dynamic loss scaling.\n"
|
||||
"Positive power of 2: static loss scaling value.\n")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
n_gpu = torch.cuda.device_count()
|
||||
else:
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
n_gpu = 1
|
||||
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.distributed.init_process_group(backend='nccl')
|
||||
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
|
||||
device, n_gpu, bool(args.local_rank != -1), args.fp16))
|
||||
|
||||
if args.gradient_accumulation_steps < 1:
|
||||
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
|
||||
args.gradient_accumulation_steps))
|
||||
|
||||
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
|
||||
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
torch.manual_seed(args.seed)
|
||||
if n_gpu > 0:
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
if not args.do_train:
|
||||
raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
|
||||
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
#train_examples = None
|
||||
num_train_optimization_steps = None
|
||||
if args.do_train:
|
||||
print("Loading Train Dataset", args.train_file)
|
||||
train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length,
|
||||
corpus_lines=None, on_memory=args.on_memory)
|
||||
num_train_optimization_steps = int(
|
||||
len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
|
||||
if args.local_rank != -1:
|
||||
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
|
||||
|
||||
# Prepare model
|
||||
model = BertForPreTraining.from_pretrained(args.bert_model)
|
||||
if args.fp16:
|
||||
model.half()
|
||||
model.to(device)
|
||||
if args.local_rank != -1:
|
||||
try:
|
||||
from apex.parallel import DistributedDataParallel as DDP
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||
model = DDP(model)
|
||||
elif n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Prepare optimizer
|
||||
param_optimizer = list(model.named_parameters())
|
||||
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
|
||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex.optimizers import FP16_Optimizer
|
||||
from apex.optimizers import FusedAdam
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||
|
||||
optimizer = FusedAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
bias_correction=False,
|
||||
max_grad_norm=1.0)
|
||||
if args.loss_scale == 0:
|
||||
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
|
||||
else:
|
||||
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
|
||||
|
||||
else:
|
||||
optimizer = BertAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
warmup=args.warmup_proportion,
|
||||
t_total=num_train_optimization_steps)
|
||||
|
||||
global_step = 0
|
||||
if args.do_train:
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_dataset))
|
||||
logger.info(" Batch size = %d", args.train_batch_size)
|
||||
logger.info(" Num steps = %d", num_train_optimization_steps)
|
||||
|
||||
if args.local_rank == -1:
|
||||
train_sampler = RandomSampler(train_dataset)
|
||||
else:
|
||||
#TODO: check if this works with current data generator from disk that relies on next(file)
|
||||
# (it doesn't return item back by index)
|
||||
train_sampler = DistributedSampler(train_dataset)
|
||||
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
model.train()
|
||||
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
|
||||
tr_loss = 0
|
||||
nb_tr_examples, nb_tr_steps = 0, 0
|
||||
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
|
||||
loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
|
||||
if n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu.
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
if args.fp16:
|
||||
optimizer.backward(loss)
|
||||
else:
|
||||
loss.backward()
|
||||
tr_loss += loss.item()
|
||||
nb_tr_examples += input_ids.size(0)
|
||||
nb_tr_steps += 1
|
||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||
if args.fp16:
|
||||
# modify learning rate with special warm up BERT uses
|
||||
# if args.fp16 is False, BertAdam is used that handles this automatically
|
||||
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
|
||||
for param_group in optimizer.param_groups:
|
||||
param_group['lr'] = lr_this_step
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
# Save a trained model
|
||||
logger.info("** ** * Saving fine - tuned model ** ** * ")
|
||||
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
|
||||
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
|
||||
if args.do_train:
|
||||
torch.save(model_to_save.state_dict(), output_model_file)
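The fp16 branch in the training loop above rescales the learning rate by hand with warmup_linear; the sketch below is my own hedged illustration of such a linear warmup-then-decay schedule (it is not guaranteed to match the library's exact implementation), included only to make the intent of that block explicit.

def _warmup_linear_sketch(progress, warmup=0.1):
    # progress = global_step / num_train_optimization_steps, in [0, 1]
    if progress < warmup:
        return progress / warmup        # ramp up linearly to the peak rate
    return max(0.0, 1.0 - progress)     # then decay linearly towards zero

# e.g. with learning_rate=3e-5 and warmup_proportion=0.1:
#   progress=0.05 -> multiplier 0.5 (warming up), progress=0.5 -> multiplier 0.5 (decaying)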
|
||||
|
||||
|
||||
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
|
||||
"""Truncates a sequence pair in place to the maximum length."""
|
||||
|
||||
# This is a simple heuristic which will always truncate the longer sequence
|
||||
# one token at a time. This makes more sense than truncating an equal percent
|
||||
# of tokens from each, since if one sequence is very short then each token
|
||||
# that's truncated likely contains more information than a longer sequence.
|
||||
while True:
|
||||
total_length = len(tokens_a) + len(tokens_b)
|
||||
if total_length <= max_length:
|
||||
break
|
||||
if len(tokens_a) > len(tokens_b):
|
||||
tokens_a.pop()
|
||||
else:
|
||||
tokens_b.pop()
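A quick usage sketch (illustrative only) of the truncation heuristic above: the longer side loses tokens one at a time until the pair fits the budget.

_a = ["tok_a"] * 7
_b = ["tok_b"] * 3
_truncate_seq_pair(_a, _b, 8)
assert len(_a) + len(_b) == 8 and len(_a) == 5 and len(_b) == 3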
|
||||
|
||||
|
||||
def accuracy(out, labels):
|
||||
outputs = np.argmax(out, axis=1)
|
||||
return np.sum(outputs == labels)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
examples/run_openai_gpt.py (new file, 259 lines)
@@ -0,0 +1,259 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" OpenAI GPT model fine-tuning script.
|
||||
Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
|
||||
Itself adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
|
||||
|
||||
This script with default values fine-tunes and evaluates a pretrained OpenAI GPT on the RocStories dataset
|
||||
"""
|
||||
import argparse
|
||||
import os
|
||||
import csv
|
||||
import random
|
||||
import logging
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
|
||||
from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam, cached_path
|
||||
|
||||
ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def accuracy(out, labels):
|
||||
outputs = np.argmax(out, axis=1)
|
||||
return np.sum(outputs == labels)
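A tiny illustration (not in the original) of the accuracy helper: it counts how many argmax predictions match the gold labels over a batch of multiple-choice logits.

_logits = np.array([[0.2, 0.8], [0.9, 0.1]])
_gold = np.array([1, 1])
assert accuracy(_logits, _gold) == 1   # only the first prediction is correct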
|
||||
|
||||
def load_rocstories_dataset(dataset_path):
|
||||
""" Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
|
||||
with open(dataset_path, encoding='utf_8') as f:
|
||||
f = csv.reader(f)
|
||||
output = []
|
||||
next(f) # skip the first line
|
||||
for line in tqdm(f):
|
||||
output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
|
||||
return output
|
||||
|
||||
def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
|
||||
""" Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
|
||||
|
||||
into Transformer inputs of shape (n_batch, n_alternatives, length), comprising for each batch and continuation:
|
||||
input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
|
||||
"""
|
||||
tensor_datasets = []
|
||||
for dataset in encoded_datasets:
|
||||
n_batch = len(dataset)
|
||||
input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
|
||||
mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
|
||||
lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
|
||||
mc_labels = np.zeros((n_batch,), dtype=np.int64)
|
||||
for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
|
||||
with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
|
||||
with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
|
||||
input_ids[i, 0, :len(with_cont1)] = with_cont1
|
||||
input_ids[i, 1, :len(with_cont2)] = with_cont2
|
||||
mc_token_ids[i, 0] = len(with_cont1) - 1
|
||||
mc_token_ids[i, 1] = len(with_cont2) - 1
|
||||
lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:]
|
||||
lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:]
|
||||
mc_labels[i] = mc_label
|
||||
all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
|
||||
tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
|
||||
return tensor_datasets
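To make the tensor layout concrete, a minimal runnable sketch (token ids below are made up) of what pre_process_datasets produces for a single example with two candidate endings:

_start, _delim, _clf = 1, 2, 3                    # hypothetical special-token ids
_toy = [[([10, 11], [20], [30, 31], 1)]]          # one dataset holding one example
(_tensors,) = pre_process_datasets(_toy, input_len=8, cap_length=4,
                                    start_token=_start, delimiter_token=_delim, clf_token=_clf)
_input_ids, _mc_token_ids, _lm_labels, _mc_labels = _tensors
assert _input_ids[0, 0].tolist()[:6] == [1, 10, 11, 2, 20, 3]
assert _mc_token_ids[0].tolist() == [5, 6]        # index of the _clf token in each candidate
assert _mc_labels[0].item() == 1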
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--model_name', type=str, default='openai-gpt',
|
||||
help='pretrained model name')
|
||||
parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.")
|
||||
parser.add_argument('--train_dataset', type=str, default='')
|
||||
parser.add_argument('--eval_dataset', type=str, default='')
|
||||
parser.add_argument('--seed', type=int, default=42)
|
||||
parser.add_argument('--num_train_epochs', type=int, default=3)
|
||||
parser.add_argument('--train_batch_size', type=int, default=8)
|
||||
parser.add_argument('--eval_batch_size', type=int, default=16)
|
||||
parser.add_argument('--max_grad_norm', type=int, default=1)
|
||||
parser.add_argument('--learning_rate', type=float, default=6.25e-5)
|
||||
parser.add_argument('--warmup_proportion', type=float, default=0.002)
|
||||
parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
|
||||
parser.add_argument('--weight_decay', type=float, default=0.01)
|
||||
parser.add_argument('--lm_coef', type=float, default=0.9)
|
||||
parser.add_argument('--n_valid', type=int, default=374)
|
||||
|
||||
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
torch.manual_seed(args.seed)
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
n_gpu = torch.cuda.device_count()
|
||||
logger.info("device: {}, n_gpu {}".format(device, n_gpu))
|
||||
|
||||
if not args.do_train and not args.do_eval:
|
||||
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
|
||||
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
# Load tokenizer and model
|
||||
# This loading function also adds new tokens and embeddings called `special tokens`
|
||||
# These new embeddings will be fine-tuned on the RocStories dataset
|
||||
special_tokens = ['_start_', '_delimiter_', '_classify_']
|
||||
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
|
||||
special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
|
||||
model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
|
||||
model.to(device)
|
||||
|
||||
# Load and encode the datasets
|
||||
if not args.train_dataset and not args.eval_dataset:
|
||||
roc_stories = cached_path(ROCSTORIES_URL)
|
||||
def tokenize_and_encode(obj):
|
||||
""" Tokenize and encode a nested object """
|
||||
if isinstance(obj, str):
|
||||
return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
|
||||
elif isinstance(obj, int):
|
||||
return obj
|
||||
return list(tokenize_and_encode(o) for o in obj)
|
||||
logger.info("Encoding dataset...")
|
||||
train_dataset = load_rocstories_dataset(args.train_dataset)
|
||||
eval_dataset = load_rocstories_dataset(args.eval_dataset)
|
||||
datasets = (train_dataset, eval_dataset)
|
||||
encoded_datasets = tokenize_and_encode(datasets)
|
||||
|
||||
# Compute the max input length for the Transformer
|
||||
max_length = model.config.n_positions // 2 - 2
|
||||
input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \
|
||||
for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
|
||||
input_length = min(input_length, model.config.n_positions) # Max size of input for the pre-trained model
|
||||
|
||||
# Prepare inputs tensors and dataloaders
|
||||
tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
|
||||
train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
|
||||
|
||||
train_data = TensorDataset(*train_tensor_dataset)
|
||||
train_sampler = RandomSampler(train_data)
|
||||
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
eval_data = TensorDataset(*eval_tensor_dataset)
|
||||
eval_sampler = SequentialSampler(eval_data)
|
||||
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
|
||||
# Prepare optimizer
|
||||
param_optimizer = list(model.named_parameters())
|
||||
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
|
||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
|
||||
optimizer = OpenAIAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
warmup=args.warmup_proportion,
|
||||
max_grad_norm=args.max_grad_norm,
|
||||
weight_decay=args.weight_decay,
|
||||
t_total=num_train_optimization_steps)
|
||||
|
||||
if args.do_train:
|
||||
nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
|
||||
model.train()
|
||||
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
|
||||
tr_loss = 0
|
||||
nb_tr_steps = 0
|
||||
tqdm_bar = tqdm(train_dataloader, desc="Training")
|
||||
for step, batch in enumerate(tqdm_bar):
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
input_ids, mc_token_ids, lm_labels, mc_labels = batch
|
||||
losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
|
||||
loss = args.lm_coef * losses[0] + losses[1]
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
tr_loss += loss.item()
|
||||
exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
|
||||
nb_tr_steps += 1
|
||||
tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])
|
||||
|
||||
# Save a trained model
|
||||
if args.do_train:
|
||||
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
|
||||
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
|
||||
config = model.config
|
||||
torch.save(model_to_save.state_dict(), output_model_file)
|
||||
|
||||
# Load a trained model that you have fine-tuned
|
||||
model_state_dict = torch.load(output_model_file)
|
||||
model = OpenAIGPTDoubleHeadsModel(config)
|
||||
model.load_state_dict(model_state_dict)
|
||||
model.to(device)
|
||||
|
||||
if args.do_eval:
|
||||
model.eval()
|
||||
eval_loss, eval_accuracy = 0, 0
|
||||
nb_eval_steps, nb_eval_examples = 0, 0
|
||||
for batch in tqdm(eval_dataloader, desc="Evaluating"):
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
input_ids, mc_token_ids, lm_labels, mc_labels = batch
|
||||
with torch.no_grad():
|
||||
_, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)
|
||||
_, mc_logits = model(input_ids, mc_token_ids)
|
||||
|
||||
mc_logits = mc_logits.detach().cpu().numpy()
|
||||
mc_labels = mc_labels.to('cpu').numpy()
|
||||
tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
|
||||
|
||||
eval_loss += mc_loss.mean().item()
|
||||
eval_accuracy += tmp_eval_accuracy
|
||||
|
||||
nb_eval_examples += input_ids.size(0)
|
||||
nb_eval_steps += 1
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
eval_accuracy = eval_accuracy / nb_eval_examples
|
||||
train_loss = tr_loss/nb_tr_steps if args.do_train else None
|
||||
result = {'eval_loss': eval_loss,
|
||||
'eval_accuracy': eval_accuracy,
|
||||
'train_loss': train_loss}
|
||||
|
||||
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
logger.info("***** Eval results *****")
|
||||
for key in sorted(result.keys()):
|
||||
logger.info(" %s = %s", key, str(result[key]))
|
||||
writer.write("%s = %s\n" % (key, str(result[key])))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -1,5 +1,6 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@ -14,36 +15,48 @@
|
||||
# limitations under the License.
|
||||
"""Run BERT on SQuAD."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import collections
|
||||
import logging
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
from tqdm import tqdm, trange
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_pretrained_bert.tokenization import printable_text, whitespace_tokenize, BasicTokenizer, BertTokenizer
|
||||
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
|
||||
from pytorch_pretrained_bert.optimization import BertAdam
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME
|
||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
|
||||
BertTokenizer,
|
||||
whitespace_tokenize)
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
if sys.version_info[0] == 2:
|
||||
import cPickle as pickle
|
||||
else:
|
||||
import pickle
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SquadExample(object):
|
||||
"""A single training/test example for simple sequence classification."""
|
||||
"""
|
||||
A single training/test example for the Squad dataset.
|
||||
For examples without an answer, the start and end position are -1.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
qas_id,
|
||||
@ -51,27 +64,31 @@ class SquadExample(object):
|
||||
doc_tokens,
|
||||
orig_answer_text=None,
|
||||
start_position=None,
|
||||
end_position=None):
|
||||
end_position=None,
|
||||
is_impossible=None):
|
||||
self.qas_id = qas_id
|
||||
self.question_text = question_text
|
||||
self.doc_tokens = doc_tokens
|
||||
self.orig_answer_text = orig_answer_text
|
||||
self.start_position = start_position
|
||||
self.end_position = end_position
|
||||
self.is_impossible = is_impossible
|
||||
|
||||
def __str__(self):
|
||||
return self.__repr__()
|
||||
|
||||
def __repr__(self):
|
||||
s = ""
|
||||
s += "qas_id: %s" % (printable_text(self.qas_id))
|
||||
s += "qas_id: %s" % (self.qas_id)
|
||||
s += ", question_text: %s" % (
|
||||
printable_text(self.question_text))
|
||||
self.question_text)
|
||||
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
|
||||
if self.start_position:
|
||||
s += ", start_position: %d" % (self.start_position)
|
||||
if self.start_position:
|
||||
s += ", end_position: %d" % (self.end_position)
|
||||
if self.start_position:
|
||||
s += ", is_impossible: %r" % (self.is_impossible)
|
||||
return s
|
||||
|
||||
|
||||
@ -89,7 +106,8 @@ class InputFeatures(object):
|
||||
input_mask,
|
||||
segment_ids,
|
||||
start_position=None,
|
||||
end_position=None):
|
||||
end_position=None,
|
||||
is_impossible=None):
|
||||
self.unique_id = unique_id
|
||||
self.example_index = example_index
|
||||
self.doc_span_index = doc_span_index
|
||||
@ -101,11 +119,12 @@ class InputFeatures(object):
|
||||
self.segment_ids = segment_ids
|
||||
self.start_position = start_position
|
||||
self.end_position = end_position
|
||||
self.is_impossible = is_impossible
|
||||
|
||||
|
||||
def read_squad_examples(input_file, is_training):
|
||||
def read_squad_examples(input_file, is_training, version_2_with_negative):
|
||||
"""Read a SQuAD json file into a list of SquadExample."""
|
||||
with open(input_file, "r") as reader:
|
||||
with open(input_file, "r", encoding='utf-8') as reader:
|
||||
input_data = json.load(reader)["data"]
|
||||
|
||||
def is_whitespace(c):
|
||||
@ -137,29 +156,37 @@ def read_squad_examples(input_file, is_training):
|
||||
start_position = None
|
||||
end_position = None
|
||||
orig_answer_text = None
|
||||
is_impossible = False
|
||||
if is_training:
|
||||
if len(qa["answers"]) != 1:
|
||||
if version_2_with_negative:
|
||||
is_impossible = qa["is_impossible"]
|
||||
if (len(qa["answers"]) != 1) and (not is_impossible):
|
||||
raise ValueError(
|
||||
"For training, each question should have exactly 1 answer.")
|
||||
answer = qa["answers"][0]
|
||||
orig_answer_text = answer["text"]
|
||||
answer_offset = answer["answer_start"]
|
||||
answer_length = len(orig_answer_text)
|
||||
start_position = char_to_word_offset[answer_offset]
|
||||
end_position = char_to_word_offset[answer_offset + answer_length - 1]
|
||||
# Only add answers where the text can be exactly recovered from the
|
||||
# document. If this CAN'T happen it's likely due to weird Unicode
|
||||
# stuff so we will just skip the example.
|
||||
#
|
||||
# Note that this means for training mode, every example is NOT
|
||||
# guaranteed to be preserved.
|
||||
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
|
||||
cleaned_answer_text = " ".join(
|
||||
whitespace_tokenize(orig_answer_text))
|
||||
if actual_text.find(cleaned_answer_text) == -1:
|
||||
logger.warning("Could not find answer: '%s' vs. '%s'",
|
||||
if not is_impossible:
|
||||
answer = qa["answers"][0]
|
||||
orig_answer_text = answer["text"]
|
||||
answer_offset = answer["answer_start"]
|
||||
answer_length = len(orig_answer_text)
|
||||
start_position = char_to_word_offset[answer_offset]
|
||||
end_position = char_to_word_offset[answer_offset + answer_length - 1]
|
||||
# Only add answers where the text can be exactly recovered from the
|
||||
# document. If this CAN'T happen it's likely due to weird Unicode
|
||||
# stuff so we will just skip the example.
|
||||
#
|
||||
# Note that this means for training mode, every example is NOT
|
||||
# guaranteed to be preserved.
|
||||
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
|
||||
cleaned_answer_text = " ".join(
|
||||
whitespace_tokenize(orig_answer_text))
|
||||
if actual_text.find(cleaned_answer_text) == -1:
|
||||
logger.warning("Could not find answer: '%s' vs. '%s'",
|
||||
actual_text, cleaned_answer_text)
|
||||
continue
|
||||
continue
|
||||
else:
|
||||
start_position = -1
|
||||
end_position = -1
|
||||
orig_answer_text = ""
|
||||
|
||||
example = SquadExample(
|
||||
qas_id=qas_id,
|
||||
@ -167,7 +194,8 @@ def read_squad_examples(input_file, is_training):
|
||||
doc_tokens=doc_tokens,
|
||||
orig_answer_text=orig_answer_text,
|
||||
start_position=start_position,
|
||||
end_position=end_position)
|
||||
end_position=end_position,
|
||||
is_impossible=is_impossible)
|
||||
examples.append(example)
|
||||
return examples
|
||||
|
||||
@ -197,7 +225,10 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
|
||||
tok_start_position = None
|
||||
tok_end_position = None
|
||||
if is_training:
|
||||
if is_training and example.is_impossible:
|
||||
tok_start_position = -1
|
||||
tok_end_position = -1
|
||||
if is_training and not example.is_impossible:
|
||||
tok_start_position = orig_to_tok_index[example.start_position]
|
||||
if example.end_position < len(example.doc_tokens) - 1:
|
||||
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
|
||||
@ -269,27 +300,31 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
|
||||
start_position = None
|
||||
end_position = None
|
||||
if is_training:
|
||||
if is_training and not example.is_impossible:
|
||||
# For training, if our document chunk does not contain an annotation
|
||||
# we throw it out, since there is nothing to predict.
|
||||
doc_start = doc_span.start
|
||||
doc_end = doc_span.start + doc_span.length - 1
|
||||
if (example.start_position < doc_start or
|
||||
example.end_position < doc_start or
|
||||
example.start_position > doc_end or example.end_position > doc_end):
|
||||
continue
|
||||
|
||||
doc_offset = len(query_tokens) + 2
|
||||
start_position = tok_start_position - doc_start + doc_offset
|
||||
end_position = tok_end_position - doc_start + doc_offset
|
||||
|
||||
out_of_span = False
|
||||
if not (tok_start_position >= doc_start and
|
||||
tok_end_position <= doc_end):
|
||||
out_of_span = True
|
||||
if out_of_span:
|
||||
start_position = 0
|
||||
end_position = 0
|
||||
else:
|
||||
doc_offset = len(query_tokens) + 2
|
||||
start_position = tok_start_position - doc_start + doc_offset
|
||||
end_position = tok_end_position - doc_start + doc_offset
|
||||
if is_training and example.is_impossible:
|
||||
start_position = 0
|
||||
end_position = 0
|
||||
if example_index < 20:
|
||||
logger.info("*** Example ***")
|
||||
logger.info("unique_id: %s" % (unique_id))
|
||||
logger.info("example_index: %s" % (example_index))
|
||||
logger.info("doc_span_index: %s" % (doc_span_index))
|
||||
logger.info("tokens: %s" % " ".join(
|
||||
[printable_text(x) for x in tokens]))
|
||||
logger.info("tokens: %s" % " ".join(tokens))
|
||||
logger.info("token_to_orig_map: %s" % " ".join([
|
||||
"%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
|
||||
logger.info("token_is_max_context: %s" % " ".join([
|
||||
@ -300,12 +335,14 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
"input_mask: %s" % " ".join([str(x) for x in input_mask]))
|
||||
logger.info(
|
||||
"segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
|
||||
if is_training:
|
||||
if is_training and example.is_impossible:
|
||||
logger.info("impossible example")
|
||||
if is_training and not example.is_impossible:
|
||||
answer_text = " ".join(tokens[start_position:(end_position + 1)])
|
||||
logger.info("start_position: %d" % (start_position))
|
||||
logger.info("end_position: %d" % (end_position))
|
||||
logger.info(
|
||||
"answer: %s" % (printable_text(answer_text)))
|
||||
"answer: %s" % (answer_text))
|
||||
|
||||
features.append(
|
||||
InputFeatures(
|
||||
@ -319,7 +356,8 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
input_mask=input_mask,
|
||||
segment_ids=segment_ids,
|
||||
start_position=start_position,
|
||||
end_position=end_position))
|
||||
end_position=end_position,
|
||||
is_impossible=example.is_impossible))
|
||||
unique_id += 1
|
||||
|
||||
return features
|
||||
@ -399,15 +437,15 @@ def _check_is_max_context(doc_spans, cur_span_index, position):
|
||||
return cur_span_index == best_span_index
|
||||
|
||||
|
||||
|
||||
RawResult = collections.namedtuple("RawResult",
|
||||
["unique_id", "start_logits", "end_logits"])
|
||||
|
||||
|
||||
def write_predictions(all_examples, all_features, all_results, n_best_size,
|
||||
max_answer_length, do_lower_case, output_prediction_file,
|
||||
output_nbest_file, verbose_logging):
|
||||
"""Write final predictions to the json file."""
|
||||
output_nbest_file, output_null_log_odds_file, verbose_logging,
|
||||
version_2_with_negative, null_score_diff_threshold):
|
||||
"""Write final predictions to the json file and log-odds of null if needed."""
|
||||
logger.info("Writing predictions to: %s" % (output_prediction_file))
|
||||
logger.info("Writing nbest to: %s" % (output_nbest_file))
|
||||
|
||||
@ -425,15 +463,29 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
|
||||
|
||||
all_predictions = collections.OrderedDict()
|
||||
all_nbest_json = collections.OrderedDict()
|
||||
scores_diff_json = collections.OrderedDict()
|
||||
|
||||
for (example_index, example) in enumerate(all_examples):
|
||||
features = example_index_to_features[example_index]
|
||||
|
||||
prelim_predictions = []
|
||||
# keep track of the minimum score of null start+end of position 0
|
||||
score_null = 1000000 # large and positive
|
||||
min_null_feature_index = 0 # the paragraph slice with min null score
|
||||
null_start_logit = 0 # the start logit at the slice with min null score
|
||||
null_end_logit = 0 # the end logit at the slice with min null score
|
||||
for (feature_index, feature) in enumerate(features):
|
||||
result = unique_id_to_result[feature.unique_id]
|
||||
|
||||
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
|
||||
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
|
||||
# if we could have irrelevant answers, get the min score of irrelevant
|
||||
if version_2_with_negative:
|
||||
feature_null_score = result.start_logits[0] + result.end_logits[0]
|
||||
if feature_null_score < score_null:
|
||||
score_null = feature_null_score
|
||||
min_null_feature_index = feature_index
|
||||
null_start_logit = result.start_logits[0]
|
||||
null_end_logit = result.end_logits[0]
|
||||
for start_index in start_indexes:
|
||||
for end_index in end_indexes:
|
||||
# We could hypothetically create invalid predictions, e.g., predict
|
||||
@ -461,7 +513,14 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
|
||||
end_index=end_index,
|
||||
start_logit=result.start_logits[start_index],
|
||||
end_logit=result.end_logits[end_index]))
|
||||
|
||||
if version_2_with_negative:
|
||||
prelim_predictions.append(
|
||||
_PrelimPrediction(
|
||||
feature_index=min_null_feature_index,
|
||||
start_index=0,
|
||||
end_index=0,
|
||||
start_logit=null_start_logit,
|
||||
end_logit=null_end_logit))
|
||||
prelim_predictions = sorted(
|
||||
prelim_predictions,
|
||||
key=lambda x: (x.start_logit + x.end_logit),
|
||||
@ -476,33 +535,51 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
|
||||
if len(nbest) >= n_best_size:
|
||||
break
|
||||
feature = features[pred.feature_index]
|
||||
if pred.start_index > 0: # this is a non-null prediction
|
||||
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
|
||||
orig_doc_start = feature.token_to_orig_map[pred.start_index]
|
||||
orig_doc_end = feature.token_to_orig_map[pred.end_index]
|
||||
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
|
||||
tok_text = " ".join(tok_tokens)
|
||||
|
||||
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
|
||||
orig_doc_start = feature.token_to_orig_map[pred.start_index]
|
||||
orig_doc_end = feature.token_to_orig_map[pred.end_index]
|
||||
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
|
||||
tok_text = " ".join(tok_tokens)
|
||||
# De-tokenize WordPieces that have been split off.
|
||||
tok_text = tok_text.replace(" ##", "")
|
||||
tok_text = tok_text.replace("##", "")
|
||||
|
||||
# De-tokenize WordPieces that have been split off.
|
||||
tok_text = tok_text.replace(" ##", "")
|
||||
tok_text = tok_text.replace("##", "")
|
||||
# Clean whitespace
|
||||
tok_text = tok_text.strip()
|
||||
tok_text = " ".join(tok_text.split())
|
||||
orig_text = " ".join(orig_tokens)
|
||||
|
||||
# Clean whitespace
|
||||
tok_text = tok_text.strip()
|
||||
tok_text = " ".join(tok_text.split())
|
||||
orig_text = " ".join(orig_tokens)
|
||||
final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
|
||||
if final_text in seen_predictions:
|
||||
continue
|
||||
|
||||
final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
|
||||
if final_text in seen_predictions:
|
||||
continue
|
||||
seen_predictions[final_text] = True
|
||||
else:
|
||||
final_text = ""
|
||||
seen_predictions[final_text] = True
|
||||
|
||||
seen_predictions[final_text] = True
|
||||
nbest.append(
|
||||
_NbestPrediction(
|
||||
text=final_text,
|
||||
start_logit=pred.start_logit,
|
||||
end_logit=pred.end_logit))
|
||||
|
||||
# if we didn't include the empty option in the n-best, include it
|
||||
if version_2_with_negative:
|
||||
if "" not in seen_predictions:
|
||||
nbest.append(
|
||||
_NbestPrediction(
|
||||
text="",
|
||||
start_logit=null_start_logit,
|
||||
end_logit=null_end_logit))
|
||||
|
||||
# In very rare edge cases we could only have single null prediction.
|
||||
# So we just create a nonce prediction in this case to avoid failure.
|
||||
if len(nbest)==1:
|
||||
nbest.insert(0,
|
||||
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
|
||||
|
||||
# In very rare edge cases we could have no valid predictions. So we
|
||||
# just create a nonce prediction in this case to avoid failure.
|
||||
if not nbest:
|
||||
@ -512,8 +589,12 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
|
||||
assert len(nbest) >= 1
|
||||
|
||||
total_scores = []
|
||||
best_non_null_entry = None
|
||||
for entry in nbest:
|
||||
total_scores.append(entry.start_logit + entry.end_logit)
|
||||
if not best_non_null_entry:
|
||||
if entry.text:
|
||||
best_non_null_entry = entry
|
||||
|
||||
probs = _compute_softmax(total_scores)
|
||||
|
||||
@ -528,8 +609,18 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
|
||||
|
||||
assert len(nbest_json) >= 1
|
||||
|
||||
all_predictions[example.qas_id] = nbest_json[0]["text"]
|
||||
all_nbest_json[example.qas_id] = nbest_json
|
||||
if not version_2_with_negative:
|
||||
all_predictions[example.qas_id] = nbest_json[0]["text"]
|
||||
else:
|
||||
# predict "" iff the null score - the score of best non-null > threshold
|
||||
score_diff = score_null - best_non_null_entry.start_logit - (
|
||||
best_non_null_entry.end_logit)
|
||||
scores_diff_json[example.qas_id] = score_diff
|
||||
if score_diff > null_score_diff_threshold:
|
||||
all_predictions[example.qas_id] = ""
|
||||
else:
|
||||
all_predictions[example.qas_id] = best_non_null_entry.text
|
||||
all_nbest_json[example.qas_id] = nbest_json
|
||||
|
||||
with open(output_prediction_file, "w") as writer:
|
||||
writer.write(json.dumps(all_predictions, indent=4) + "\n")
|
||||
@ -537,6 +628,10 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
|
||||
with open(output_nbest_file, "w") as writer:
|
||||
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
|
||||
|
||||
if version_2_with_negative:
|
||||
with open(output_null_log_odds_file, "w") as writer:
|
||||
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
|
||||
|
||||
|
||||
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
|
||||
"""Project the tokenized prediction back to the original text."""
|
||||
@ -599,7 +694,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
|
||||
if len(orig_ns_text) != len(tok_ns_text):
|
||||
if verbose_logging:
|
||||
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
|
||||
orig_ns_text, tok_ns_text)
|
||||
orig_ns_text, tok_ns_text)
|
||||
return orig_text
|
||||
|
||||
# We then project the characters in `pred_text` back to `orig_text` using
|
||||
@ -668,44 +763,16 @@ def _compute_softmax(scores):
|
||||
probs.append(score / total_sum)
|
||||
return probs
|
||||
|
||||
def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
|
||||
""" Utility function for optimize_on_cpu and 16-bits training.
|
||||
Copy the parameters optimized on CPU/RAM back to the model on GPU
|
||||
"""
|
||||
for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
|
||||
if name_opti != name_model:
|
||||
logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
|
||||
raise ValueError
|
||||
param_model.data.copy_(param_opti.data)
|
||||
|
||||
def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
|
||||
""" Utility function for optimize_on_cpu and 16-bits training.
|
||||
Copy the gradient of the GPU parameters to the CPU/RAM copy of the model
|
||||
"""
|
||||
is_nan = False
|
||||
for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
|
||||
if name_opti != name_model:
|
||||
logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
|
||||
raise ValueError
|
||||
if param_model.grad is not None:
|
||||
if test_nan and torch.isnan(param_model.grad).sum() > 0:
|
||||
is_nan = True
|
||||
if param_opti.grad is None:
|
||||
param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
|
||||
param_opti.grad.data.copy_(param_model.grad.data)
|
||||
else:
|
||||
param_opti.grad = None
|
||||
return is_nan
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
## Required parameters
|
||||
parser.add_argument("--bert_model", default=None, type=str, required=True,
|
||||
help="Bert pre-trained model selected in the list: bert-base-uncased, "
|
||||
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
|
||||
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
|
||||
"bert-base-multilingual-cased, bert-base-chinese.")
|
||||
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||
help="The output directory where the model checkpoints will be written.")
|
||||
help="The output directory where the model checkpoints and predictions will be written.")
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
|
||||
@ -719,15 +786,15 @@ def main():
|
||||
parser.add_argument("--max_query_length", default=64, type=int,
|
||||
help="The maximum number of tokens for the question. Questions longer than this will "
|
||||
"be truncated to this length.")
|
||||
parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
|
||||
parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
|
||||
parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
|
||||
parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--num_train_epochs", default=3.0, type=float,
|
||||
help="Total number of training epochs to perform.")
|
||||
parser.add_argument("--warmup_proportion", default=0.1, type=float,
|
||||
help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
|
||||
help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
|
||||
"of training.")
|
||||
parser.add_argument("--n_best_size", default=20, type=int,
|
||||
help="The total number of n-best predictions to generate in the nbest_predictions.json "
|
||||
@ -735,58 +802,60 @@ def main():
|
||||
parser.add_argument("--max_answer_length", default=30, type=int,
|
||||
help="The maximum length of an answer that can be generated. This is needed because the start "
|
||||
"and end predictions are not conditioned on one another.")
|
||||
parser.add_argument("--verbose_logging", default=False, action='store_true',
|
||||
parser.add_argument("--verbose_logging", action='store_true',
|
||||
help="If true, all of the warnings related to data processing will be printed. "
|
||||
"A number of warnings are expected for a normal SQuAD evaluation.")
|
||||
parser.add_argument("--no_cuda",
|
||||
default=False,
|
||||
action='store_true',
|
||||
help="Whether not to use CUDA when available")
|
||||
parser.add_argument('--seed',
|
||||
type=int,
|
||||
parser.add_argument('--seed',
|
||||
type=int,
|
||||
default=42,
|
||||
help="random seed for initialization")
|
||||
parser.add_argument('--gradient_accumulation_steps',
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||
parser.add_argument("--do_lower_case",
|
||||
action='store_true',
|
||||
help="Whether to lower case the input text. True for uncased models, False for cased models.")
|
||||
parser.add_argument("--local_rank",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="local_rank for distributed training on gpus")
|
||||
parser.add_argument('--optimize_on_cpu',
|
||||
default=False,
|
||||
action='store_true',
|
||||
help="Whether to perform optimization and keep the optimizer averages on CPU")
|
||||
parser.add_argument('--fp16',
|
||||
default=False,
|
||||
action='store_true',
|
||||
help="Whether to use 16-bit float precision instead of 32-bit")
|
||||
parser.add_argument('--loss_scale',
|
||||
type=float, default=128,
|
||||
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
|
||||
|
||||
type=float, default=0,
|
||||
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
|
||||
"0 (default value): dynamic loss scaling.\n"
|
||||
"Positive power of 2: static loss scaling value.\n")
|
||||
parser.add_argument('--version_2_with_negative',
|
||||
action='store_true',
|
||||
help='If true, the SQuAD examples contain some that do not have an answer.')
|
||||
parser.add_argument('--null_score_diff_threshold',
|
||||
type=float, default=0.0,
|
||||
help="If null_score - best_non_null is greater than the threshold predict null.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
n_gpu = torch.cuda.device_count()
|
||||
else:
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
n_gpu = 1
|
||||
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.distributed.init_process_group(backend='nccl')
|
||||
if args.fp16:
|
||||
logger.info("16-bits training currently not supported in distributed training")
|
||||
args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
|
||||
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}".format(
|
||||
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
|
||||
device, n_gpu, bool(args.local_rank != -1), args.fp16))
|
||||
|
||||
if args.gradient_accumulation_steps < 1:
|
||||
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
|
||||
args.gradient_accumulation_steps))
|
||||
|
||||
args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
|
||||
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
|
||||
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
@ -806,65 +875,99 @@ def main():
|
||||
raise ValueError(
|
||||
"If `do_predict` is True, then `predict_file` must be specified.")
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
|
||||
raise ValueError("Output directory () already exists and is not empty.")
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
train_examples = None
|
||||
num_train_steps = None
|
||||
num_train_optimization_steps = None
|
||||
if args.do_train:
|
||||
train_examples = read_squad_examples(
|
||||
input_file=args.train_file, is_training=True)
|
||||
num_train_steps = int(
|
||||
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
|
||||
input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
|
||||
num_train_optimization_steps = int(
|
||||
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
|
||||
if args.local_rank != -1:
|
||||
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
|
||||
|
||||
# Prepare model
|
||||
model = BertForQuestionAnswering.from_pretrained(args.bert_model,
|
||||
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
|
||||
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)))
|
||||
|
||||
if args.fp16:
|
||||
model.half()
|
||||
model.to(device)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
||||
output_device=args.local_rank)
|
||||
try:
|
||||
from apex.parallel import DistributedDataParallel as DDP
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||
|
||||
model = DDP(model)
|
||||
elif n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Prepare optimizer
|
||||
if args.fp16:
|
||||
param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
|
||||
for n, param in model.named_parameters()]
|
||||
elif args.optimize_on_cpu:
|
||||
param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
|
||||
for n, param in model.named_parameters()]
|
||||
else:
|
||||
param_optimizer = list(model.named_parameters())
|
||||
no_decay = ['bias', 'gamma', 'beta']
|
||||
param_optimizer = list(model.named_parameters())
|
||||
|
||||
# hack to remove pooler, which is not used
|
||||
# thus it produces a None grad that breaks apex
|
||||
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
|
||||
|
||||
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
|
||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
|
||||
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
|
||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
optimizer = BertAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
warmup=args.warmup_proportion,
|
||||
t_total=num_train_steps)
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex.optimizers import FP16_Optimizer
|
||||
from apex.optimizers import FusedAdam
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||
|
||||
optimizer = FusedAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
bias_correction=False,
|
||||
max_grad_norm=1.0)
|
||||
if args.loss_scale == 0:
|
||||
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
|
||||
else:
|
||||
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
|
||||
else:
|
||||
optimizer = BertAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
warmup=args.warmup_proportion,
|
||||
t_total=num_train_optimization_steps)
|
||||
|
||||
global_step = 0
|
||||
if args.do_train:
|
||||
train_features = convert_examples_to_features(
|
||||
examples=train_examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=args.max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=args.max_query_length,
|
||||
is_training=True)
|
||||
cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
|
||||
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
|
||||
train_features = None
|
||||
try:
|
||||
with open(cached_train_features_file, "rb") as reader:
|
||||
train_features = pickle.load(reader)
|
||||
except:
|
||||
train_features = convert_examples_to_features(
|
||||
examples=train_examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=args.max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=args.max_query_length,
|
||||
is_training=True)
|
||||
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
|
||||
logger.info(" Saving train features into cached file %s", cached_train_features_file)
|
||||
with open(cached_train_features_file, "wb") as writer:
|
||||
pickle.dump(train_features, writer)
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num orig examples = %d", len(train_examples))
|
||||
logger.info(" Num split examples = %d", len(train_features))
|
||||
logger.info(" Batch size = %d", args.train_batch_size)
|
||||
logger.info(" Num steps = %d", num_train_steps)
|
||||
logger.info(" Num steps = %d", num_train_optimization_steps)
|
||||
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
|
||||
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
|
||||
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
|
||||
@@ -887,36 +990,45 @@ def main():
|
||||
loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
|
||||
if n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu.
|
||||
if args.fp16 and args.loss_scale != 1.0:
|
||||
# rescale loss for fp16 training
|
||||
# see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
|
||||
loss = loss * args.loss_scale
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
loss.backward()
|
||||
|
||||
if args.fp16:
|
||||
optimizer.backward(loss)
|
||||
else:
|
||||
loss.backward()
|
||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||
if args.fp16 or args.optimize_on_cpu:
|
||||
if args.fp16 and args.loss_scale != 1.0:
|
||||
# scale down gradients for fp16 training
|
||||
for param in model.parameters():
|
||||
if param.grad is not None:
|
||||
param.grad.data = param.grad.data / args.loss_scale
|
||||
is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
|
||||
if is_nan:
|
||||
logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
|
||||
args.loss_scale = args.loss_scale / 2
|
||||
model.zero_grad()
|
||||
continue
|
||||
optimizer.step()
|
||||
copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
|
||||
else:
|
||||
optimizer.step()
|
||||
model.zero_grad()
|
||||
if args.fp16:
|
||||
# modify learning rate with special warm up BERT uses
|
||||
# if args.fp16 is False, BertAdam is used and handles this automatically
|
||||
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
|
||||
for param_group in optimizer.param_groups:
|
||||
param_group['lr'] = lr_this_step
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
if args.do_predict:
|
||||
if args.do_train:
|
||||
# Save a trained model and the associated configuration
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
|
||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||
torch.save(model_to_save.state_dict(), output_model_file)
|
||||
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
||||
with open(output_config_file, 'w') as f:
|
||||
f.write(model_to_save.config.to_json_string())
|
||||
|
||||
# Load a trained model and config that you have fine-tuned
|
||||
config = BertConfig(output_config_file)
|
||||
model = BertForQuestionAnswering(config)
|
||||
model.load_state_dict(torch.load(output_model_file))
|
||||
else:
|
||||
model = BertForQuestionAnswering.from_pretrained(args.bert_model)
|
||||
|
||||
model.to(device)
|
||||
|
||||
if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
eval_examples = read_squad_examples(
|
||||
input_file=args.predict_file, is_training=False)
|
||||
input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
|
||||
eval_features = convert_examples_to_features(
|
||||
examples=eval_examples,
|
||||
tokenizer=tokenizer,
|
||||
@@ -935,10 +1047,8 @@ def main():
|
||||
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
|
||||
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
|
||||
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
|
||||
if args.local_rank == -1:
|
||||
eval_sampler = SequentialSampler(eval_data)
|
||||
else:
|
||||
eval_sampler = DistributedSampler(eval_data)
|
||||
# Run prediction for full data
|
||||
eval_sampler = SequentialSampler(eval_data)
|
||||
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
|
||||
|
||||
model.eval()
|
||||
@@ -962,10 +1072,12 @@ def main():
|
||||
end_logits=end_logits))
|
||||
output_prediction_file = os.path.join(args.output_dir, "predictions.json")
|
||||
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
|
||||
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
|
||||
write_predictions(eval_examples, eval_features, all_results,
|
||||
args.n_best_size, args.max_answer_length,
|
||||
args.do_lower_case, output_prediction_file,
|
||||
output_nbest_file, args.verbose_logging)
|
||||
output_nbest_file, output_null_log_odds_file, args.verbose_logging,
|
||||
args.version_2_with_negative, args.null_score_diff_threshold)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
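The run_squad.py changes above thread two new options, version_2_with_negative and null_score_diff_threshold, through read_squad_examples and write_predictions. A minimal sketch of the SQuAD 2.0 answerability rule those options conventionally express; the function and variable names below are illustrative, not taken from the repository:

def pick_answer(best_non_null_text, best_non_null_score, null_score, threshold):
    # SQuAD 2.0 convention: predict "no answer" when the null span outscores
    # the best non-null span by more than the tuned threshold.
    if null_score - best_non_null_score > threshold:
        return ""                  # unanswerable
    return best_non_null_text      # answerable

# Hypothetical scores for a single example:
print(pick_answer("in 1856", best_non_null_score=4.2, null_score=6.1, threshold=0.0))  # -> ""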
547
examples/run_swag.py
Normal file
@@ -0,0 +1,547 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""BERT finetuning runner."""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.modeling import BertForMultipleChoice
|
||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SwagExample(object):
|
||||
"""A single training/test example for the SWAG dataset."""
|
||||
def __init__(self,
|
||||
swag_id,
|
||||
context_sentence,
|
||||
start_ending,
|
||||
ending_0,
|
||||
ending_1,
|
||||
ending_2,
|
||||
ending_3,
|
||||
label = None):
|
||||
self.swag_id = swag_id
|
||||
self.context_sentence = context_sentence
|
||||
self.start_ending = start_ending
|
||||
self.endings = [
|
||||
ending_0,
|
||||
ending_1,
|
||||
ending_2,
|
||||
ending_3,
|
||||
]
|
||||
self.label = label
|
||||
|
||||
def __str__(self):
|
||||
return self.__repr__()
|
||||
|
||||
def __repr__(self):
|
||||
l = [
|
||||
"swag_id: {}".format(self.swag_id),
|
||||
"context_sentence: {}".format(self.context_sentence),
|
||||
"start_ending: {}".format(self.start_ending),
|
||||
"ending_0: {}".format(self.endings[0]),
|
||||
"ending_1: {}".format(self.endings[1]),
|
||||
"ending_2: {}".format(self.endings[2]),
|
||||
"ending_3: {}".format(self.endings[3]),
|
||||
]
|
||||
|
||||
if self.label is not None:
|
||||
l.append("label: {}".format(self.label))
|
||||
|
||||
return ", ".join(l)
|
||||
|
||||
|
||||
class InputFeatures(object):
|
||||
def __init__(self,
|
||||
example_id,
|
||||
choices_features,
|
||||
label
|
||||
|
||||
):
|
||||
self.example_id = example_id
|
||||
self.choices_features = [
|
||||
{
|
||||
'input_ids': input_ids,
|
||||
'input_mask': input_mask,
|
||||
'segment_ids': segment_ids
|
||||
}
|
||||
for _, input_ids, input_mask, segment_ids in choices_features
|
||||
]
|
||||
self.label = label
|
||||
|
||||
|
||||
def read_swag_examples(input_file, is_training):
|
||||
with open(input_file, 'r', encoding='utf-8') as f:
|
||||
reader = csv.reader(f)
|
||||
lines = []
|
||||
for line in reader:
|
||||
if sys.version_info[0] == 2:
|
||||
line = list(unicode(cell, 'utf-8') for cell in line)
|
||||
lines.append(line)
|
||||
|
||||
if is_training and lines[0][-1] != 'label':
|
||||
raise ValueError(
|
||||
"For training, the input file must contain a label column."
|
||||
)
|
||||
|
||||
examples = [
|
||||
SwagExample(
|
||||
swag_id = line[2],
|
||||
context_sentence = line[4],
|
||||
start_ending = line[5], # in the swag dataset, the
|
||||
# common beginning of each
|
||||
# choice is stored in "sent2".
|
||||
ending_0 = line[7],
|
||||
ending_1 = line[8],
|
||||
ending_2 = line[9],
|
||||
ending_3 = line[10],
|
||||
label = int(line[11]) if is_training else None
|
||||
) for line in lines[1:] # we skip the line with the column names
|
||||
]
|
||||
|
||||
return examples
|
||||
|
||||
def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
is_training):
|
||||
"""Loads a data file into a list of `InputBatch`s."""
|
||||
|
||||
# Swag is a multiple choice task. To perform this task using Bert,
|
||||
# we will use the formatting proposed in "Improving Language
|
||||
# Understanding by Generative Pre-Training" and suggested by
|
||||
# @jacobdevlin-google in this issue
|
||||
# https://github.com/google-research/bert/issues/38.
|
||||
#
|
||||
# Each choice will correspond to a sample on which we run the
|
||||
# inference. For a given Swag example, we will create the 4
|
||||
# following inputs:
|
||||
# - [CLS] context [SEP] choice_1 [SEP]
|
||||
# - [CLS] context [SEP] choice_2 [SEP]
|
||||
# - [CLS] context [SEP] choice_3 [SEP]
|
||||
# - [CLS] context [SEP] choice_4 [SEP]
|
||||
# The model will output a single value for each input. To get the
|
||||
# final decision of the model, we will run a softmax over these 4
|
||||
# outputs.
|
||||
features = []
|
||||
for example_index, example in enumerate(examples):
|
||||
context_tokens = tokenizer.tokenize(example.context_sentence)
|
||||
start_ending_tokens = tokenizer.tokenize(example.start_ending)
|
||||
|
||||
choices_features = []
|
||||
for ending_index, ending in enumerate(example.endings):
|
||||
# We create a copy of the context tokens in order to be
|
||||
# able to shrink it according to ending_tokens
|
||||
context_tokens_choice = context_tokens[:]
|
||||
ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
|
||||
# Modifies `context_tokens_choice` and `ending_tokens` in
|
||||
# place so that the total length is less than the
|
||||
# specified length. Account for [CLS], [SEP], [SEP] with
|
||||
# "- 3"
|
||||
_truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
|
||||
|
||||
tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
|
||||
segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
|
||||
|
||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
input_mask = [1] * len(input_ids)
|
||||
|
||||
# Zero-pad up to the sequence length.
|
||||
padding = [0] * (max_seq_length - len(input_ids))
|
||||
input_ids += padding
|
||||
input_mask += padding
|
||||
segment_ids += padding
|
||||
|
||||
assert len(input_ids) == max_seq_length
|
||||
assert len(input_mask) == max_seq_length
|
||||
assert len(segment_ids) == max_seq_length
|
||||
|
||||
choices_features.append((tokens, input_ids, input_mask, segment_ids))
|
||||
|
||||
label = example.label
|
||||
if example_index < 5:
|
||||
logger.info("*** Example ***")
|
||||
logger.info("swag_id: {}".format(example.swag_id))
|
||||
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
|
||||
logger.info("choice: {}".format(choice_idx))
|
||||
logger.info("tokens: {}".format(' '.join(tokens)))
|
||||
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
|
||||
logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
|
||||
logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
|
||||
if is_training:
|
||||
logger.info("label: {}".format(label))
|
||||
|
||||
features.append(
|
||||
InputFeatures(
|
||||
example_id = example.swag_id,
|
||||
choices_features = choices_features,
|
||||
label = label
|
||||
)
|
||||
)
|
||||
|
||||
return features
|
||||
|
||||
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
|
||||
"""Truncates a sequence pair in place to the maximum length."""
|
||||
|
||||
# This is a simple heuristic which will always truncate the longer sequence
|
||||
# one token at a time. This makes more sense than truncating an equal percent
|
||||
# of tokens from each, since if one sequence is very short then each token
|
||||
# that's truncated likely contains more information than a longer sequence.
|
||||
while True:
|
||||
total_length = len(tokens_a) + len(tokens_b)
|
||||
if total_length <= max_length:
|
||||
break
|
||||
if len(tokens_a) > len(tokens_b):
|
||||
tokens_a.pop()
|
||||
else:
|
||||
tokens_b.pop()
|
||||
|
||||
def accuracy(out, labels):
|
||||
outputs = np.argmax(out, axis=1)
|
||||
return np.sum(outputs == labels)
|
||||
|
||||
def select_field(features, field):
|
||||
return [
|
||||
[
|
||||
choice[field]
|
||||
for choice in feature.choices_features
|
||||
]
|
||||
for feature in features
|
||||
]
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
## Required parameters
|
||||
parser.add_argument("--data_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The input data dir. Should contain the .csv files (or other data files) for the task.")
|
||||
parser.add_argument("--bert_model", default=None, type=str, required=True,
|
||||
help="Bert pre-trained model selected in the list: bert-base-uncased, "
|
||||
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
|
||||
"bert-base-multilingual-cased, bert-base-chinese.")
|
||||
parser.add_argument("--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model checkpoints will be written.")
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after WordPiece tokenization. \n"
|
||||
"Sequences longer than this will be truncated, and sequences shorter \n"
|
||||
"than this will be padded.")
|
||||
parser.add_argument("--do_train",
|
||||
action='store_true',
|
||||
help="Whether to run training.")
|
||||
parser.add_argument("--do_eval",
|
||||
action='store_true',
|
||||
help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--do_lower_case",
|
||||
action='store_true',
|
||||
help="Set this flag if you are using an uncased model.")
|
||||
parser.add_argument("--train_batch_size",
|
||||
default=32,
|
||||
type=int,
|
||||
help="Total batch size for training.")
|
||||
parser.add_argument("--eval_batch_size",
|
||||
default=8,
|
||||
type=int,
|
||||
help="Total batch size for eval.")
|
||||
parser.add_argument("--learning_rate",
|
||||
default=5e-5,
|
||||
type=float,
|
||||
help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--num_train_epochs",
|
||||
default=3.0,
|
||||
type=float,
|
||||
help="Total number of training epochs to perform.")
|
||||
parser.add_argument("--warmup_proportion",
|
||||
default=0.1,
|
||||
type=float,
|
||||
help="Proportion of training to perform linear learning rate warmup for. "
|
||||
"E.g., 0.1 = 10%% of training.")
|
||||
parser.add_argument("--no_cuda",
|
||||
action='store_true',
|
||||
help="Whether not to use CUDA when available")
|
||||
parser.add_argument("--local_rank",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="local_rank for distributed training on gpus")
|
||||
parser.add_argument('--seed',
|
||||
type=int,
|
||||
default=42,
|
||||
help="random seed for initialization")
|
||||
parser.add_argument('--gradient_accumulation_steps',
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||
parser.add_argument('--fp16',
|
||||
action='store_true',
|
||||
help="Whether to use 16-bit float precision instead of 32-bit")
|
||||
parser.add_argument('--loss_scale',
|
||||
type=float, default=0,
|
||||
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
|
||||
"0 (default value): dynamic loss scaling.\n"
|
||||
"Positive power of 2: static loss scaling value.\n")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
n_gpu = torch.cuda.device_count()
|
||||
else:
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
n_gpu = 1
|
||||
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.distributed.init_process_group(backend='nccl')
|
||||
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
|
||||
device, n_gpu, bool(args.local_rank != -1), args.fp16))
|
||||
|
||||
if args.gradient_accumulation_steps < 1:
|
||||
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
|
||||
args.gradient_accumulation_steps))
|
||||
|
||||
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
|
||||
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
torch.manual_seed(args.seed)
|
||||
if n_gpu > 0:
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
if not args.do_train and not args.do_eval:
|
||||
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
|
||||
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
train_examples = None
|
||||
num_train_optimization_steps = None
|
||||
if args.do_train:
|
||||
train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
|
||||
num_train_optimization_steps = int(
|
||||
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
|
||||
if args.local_rank != -1:
|
||||
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
|
||||
|
||||
# Prepare model
|
||||
model = BertForMultipleChoice.from_pretrained(args.bert_model,
|
||||
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
|
||||
num_choices=4)
|
||||
if args.fp16:
|
||||
model.half()
|
||||
model.to(device)
|
||||
if args.local_rank != -1:
|
||||
try:
|
||||
from apex.parallel import DistributedDataParallel as DDP
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||
|
||||
model = DDP(model)
|
||||
elif n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Prepare optimizer
|
||||
param_optimizer = list(model.named_parameters())
|
||||
|
||||
# hack to remove pooler, which is not used
|
||||
# thus it produces a None grad that breaks apex
|
||||
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
|
||||
|
||||
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
|
||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex.optimizers import FP16_Optimizer
|
||||
from apex.optimizers import FusedAdam
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||
|
||||
optimizer = FusedAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
bias_correction=False,
|
||||
max_grad_norm=1.0)
|
||||
if args.loss_scale == 0:
|
||||
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
|
||||
else:
|
||||
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
|
||||
else:
|
||||
optimizer = BertAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
warmup=args.warmup_proportion,
|
||||
t_total=num_train_optimization_steps)
|
||||
|
||||
global_step = 0
|
||||
if args.do_train:
|
||||
train_features = convert_examples_to_features(
|
||||
train_examples, tokenizer, args.max_seq_length, True)
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_examples))
|
||||
logger.info(" Batch size = %d", args.train_batch_size)
|
||||
logger.info(" Num steps = %d", num_train_optimization_steps)
|
||||
all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
|
||||
all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
|
||||
all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
|
||||
all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
|
||||
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
|
||||
if args.local_rank == -1:
|
||||
train_sampler = RandomSampler(train_data)
|
||||
else:
|
||||
train_sampler = DistributedSampler(train_data)
|
||||
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
model.train()
|
||||
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
|
||||
tr_loss = 0
|
||||
nb_tr_examples, nb_tr_steps = 0, 0
|
||||
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
input_ids, input_mask, segment_ids, label_ids = batch
|
||||
loss = model(input_ids, segment_ids, input_mask, label_ids)
|
||||
if n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu.
|
||||
if args.fp16 and args.loss_scale != 1.0:
|
||||
# rescale loss for fp16 training
|
||||
# see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
|
||||
loss = loss * args.loss_scale
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
tr_loss += loss.item()
|
||||
nb_tr_examples += input_ids.size(0)
|
||||
nb_tr_steps += 1
|
||||
|
||||
if args.fp16:
|
||||
optimizer.backward(loss)
|
||||
else:
|
||||
loss.backward()
|
||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||
if args.fp16:
|
||||
# modify learning rate with special warm up BERT uses
|
||||
# if args.fp16 is False, BertAdam is used and handles this automatically
|
||||
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
|
||||
for param_group in optimizer.param_groups:
|
||||
param_group['lr'] = lr_this_step
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
|
||||
if args.do_train:
|
||||
# Save a trained model and the associated configuration
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
|
||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||
torch.save(model_to_save.state_dict(), output_model_file)
|
||||
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
||||
with open(output_config_file, 'w') as f:
|
||||
f.write(model_to_save.config.to_json_string())
|
||||
|
||||
# Load a trained model and config that you have fine-tuned
|
||||
config = BertConfig(output_config_file)
|
||||
model = BertForMultipleChoice(config, num_choices=4)
|
||||
model.load_state_dict(torch.load(output_model_file))
|
||||
else:
|
||||
model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
|
||||
model.to(device)
|
||||
|
||||
|
||||
if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)
|
||||
eval_features = convert_examples_to_features(
|
||||
eval_examples, tokenizer, args.max_seq_length, True)
|
||||
logger.info("***** Running evaluation *****")
|
||||
logger.info(" Num examples = %d", len(eval_examples))
|
||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
||||
all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
|
||||
all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
|
||||
all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
|
||||
all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
|
||||
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
|
||||
# Run prediction for full data
|
||||
eval_sampler = SequentialSampler(eval_data)
|
||||
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
|
||||
model.eval()
|
||||
eval_loss, eval_accuracy = 0, 0
|
||||
nb_eval_steps, nb_eval_examples = 0, 0
|
||||
for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
|
||||
input_ids = input_ids.to(device)
|
||||
input_mask = input_mask.to(device)
|
||||
segment_ids = segment_ids.to(device)
|
||||
label_ids = label_ids.to(device)
|
||||
|
||||
with torch.no_grad():
|
||||
tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
|
||||
logits = model(input_ids, segment_ids, input_mask)
|
||||
|
||||
logits = logits.detach().cpu().numpy()
|
||||
label_ids = label_ids.to('cpu').numpy()
|
||||
tmp_eval_accuracy = accuracy(logits, label_ids)
|
||||
|
||||
eval_loss += tmp_eval_loss.mean().item()
|
||||
eval_accuracy += tmp_eval_accuracy
|
||||
|
||||
nb_eval_examples += input_ids.size(0)
|
||||
nb_eval_steps += 1
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
eval_accuracy = eval_accuracy / nb_eval_examples
|
||||
|
||||
result = {'eval_loss': eval_loss,
|
||||
'eval_accuracy': eval_accuracy,
|
||||
'global_step': global_step,
|
||||
'loss': tr_loss/nb_tr_steps}
|
||||
|
||||
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
logger.info("***** Eval results *****")
|
||||
for key in sorted(result.keys()):
|
||||
logger.info(" %s = %s", key, str(result[key]))
|
||||
writer.write("%s = %s\n" % (key, str(result[key])))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
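As the comments in convert_examples_to_features describe, each SWAG example becomes four [CLS] context [SEP] ending [SEP] sequences and the model emits one score per ending. A small self-contained sketch of the resulting tensor layout and the final argmax, with toy values rather than repository data:

import torch

batch_size, num_choices, seq_len = 2, 4, 8
# select_field(...) returns one list per example containing one token-id list per
# choice, so the stacked tensor is (batch, num_choices, seq_len).
input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)

# BertForMultipleChoice yields a single logit per choice; fake logits stand in here.
logits = torch.randn(batch_size, num_choices)
predicted_ending = logits.argmax(dim=-1)   # shape (batch,), index of the chosen ending
print(predicted_ending.shape)              # torch.Size([2])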
152
examples/run_transfo_xl.py
Normal file
@@ -0,0 +1,152 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" PyTorch Transformer XL model evaluation script.
|
||||
Adapted from https://github.com/kimiyoung/transformer-xl.
|
||||
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
|
||||
|
||||
This script with default values evaluates a pretrained Transformer-XL on WikiText 103
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
import math
|
||||
|
||||
import torch
|
||||
|
||||
from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
|
||||
parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
|
||||
help='pretrained model name')
|
||||
parser.add_argument('--split', type=str, default='test',
|
||||
choices=['all', 'valid', 'test'],
|
||||
help='which split to evaluate')
|
||||
parser.add_argument('--batch_size', type=int, default=10,
|
||||
help='batch size')
|
||||
parser.add_argument('--tgt_len', type=int, default=128,
|
||||
help='number of tokens to predict')
|
||||
parser.add_argument('--ext_len', type=int, default=0,
|
||||
help='length of the extended context')
|
||||
parser.add_argument('--mem_len', type=int, default=1600,
|
||||
help='length of the retained previous heads')
|
||||
parser.add_argument('--clamp_len', type=int, default=1000,
|
||||
help='max positional embedding index')
|
||||
parser.add_argument('--no_cuda', action='store_true',
|
||||
help='Do not use CUDA even though CUDA is available')
|
||||
parser.add_argument('--work_dir', type=str, required=True,
|
||||
help='path to the work_dir')
|
||||
parser.add_argument('--no_log', action='store_true',
|
||||
help='do not log the eval result')
|
||||
parser.add_argument('--same_length', action='store_true',
|
||||
help='set same length attention with masking')
|
||||
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
assert args.ext_len >= 0, 'extended context length must be non-negative'
|
||||
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
logger.info("device: {}".format(device))
|
||||
|
||||
# Load a pre-processed dataset
|
||||
# You can also build the corpus yourself using TransfoXLCorpus methods
|
||||
# The pre-processing involves computing word frequencies to prepare the Adaptive input and SoftMax
|
||||
# and tokenizing the dataset
|
||||
# The pre-processed corpus is a conversion (using the conversion script)
|
||||
corpus = TransfoXLCorpus.from_pretrained(args.model_name)
|
||||
ntokens = len(corpus.vocab)
|
||||
|
||||
va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
|
||||
device=device, ext_len=args.ext_len)
|
||||
te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
|
||||
device=device, ext_len=args.ext_len)
|
||||
|
||||
# Load a pre-trained model
|
||||
model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
|
||||
model = model.to(device)
|
||||
|
||||
logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
|
||||
args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
|
||||
|
||||
model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
|
||||
if args.clamp_len > 0:
|
||||
model.clamp_len = args.clamp_len
|
||||
if args.same_length:
|
||||
model.same_length = True
|
||||
|
||||
###############################################################################
|
||||
# Evaluation code
|
||||
###############################################################################
|
||||
def evaluate(eval_iter):
|
||||
# Turn on evaluation mode which disables dropout.
|
||||
model.eval()
|
||||
total_len, total_loss = 0, 0.
|
||||
start_time = time.time()
|
||||
with torch.no_grad():
|
||||
mems = None
|
||||
for idx, (data, target, seq_len) in enumerate(eval_iter):
|
||||
ret = model(data, target, mems)
|
||||
loss, mems = ret
|
||||
loss = loss.mean()
|
||||
total_loss += seq_len * loss.item()
|
||||
total_len += seq_len
|
||||
total_time = time.time() - start_time
|
||||
logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
|
||||
total_time, 1000 * total_time / (idx+1)))
|
||||
return total_loss / total_len
|
||||
|
||||
# Run on test data.
|
||||
if args.split == 'all':
|
||||
test_loss = evaluate(te_iter)
|
||||
valid_loss = evaluate(va_iter)
|
||||
elif args.split == 'valid':
|
||||
valid_loss = evaluate(va_iter)
|
||||
test_loss = None
|
||||
elif args.split == 'test':
|
||||
test_loss = evaluate(te_iter)
|
||||
valid_loss = None
|
||||
|
||||
def format_log(loss, split):
|
||||
log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
|
||||
split, loss, math.exp(loss))
|
||||
return log_str
|
||||
|
||||
log_str = ''
|
||||
if valid_loss is not None:
|
||||
log_str += format_log(valid_loss, 'valid')
|
||||
if test_loss is not None:
|
||||
log_str += format_log(test_loss, 'test')
|
||||
|
||||
logger.info('=' * 100)
|
||||
logger.info(log_str)
|
||||
logger.info('=' * 100)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
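evaluate() above returns the length-weighted average token loss, and format_log reports exp(loss) as perplexity. A one-line check with a hypothetical loss value:

import math

avg_token_loss = 3.2                     # hypothetical average cross-entropy on the test split
perplexity = math.exp(avg_token_loss)    # ~24.5
print('{:.2f}'.format(perplexity))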
@@ -133,7 +133,7 @@
|
||||
" unique_id = 0\n",
|
||||
" with tf.gfile.GFile(input_file, \"r\") as reader:\n",
|
||||
" while True:\n",
|
||||
" line = reader.readline()#tokenization.convert_to_unicode(reader.readline())\n",
|
||||
" line = reader.readline()\n",
|
||||
" if not line:\n",
|
||||
" break\n",
|
||||
" line = line.strip()\n",
|
||||
|
@@ -1,6 +1,24 @@
|
||||
__version__ = "0.6.0"
|
||||
from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
||||
from .tokenization_openai import OpenAIGPTTokenizer
|
||||
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
|
||||
from .tokenization_gpt2 import GPT2Tokenizer
|
||||
|
||||
from .modeling import (BertConfig, BertModel, BertForPreTraining,
|
||||
BertForMaskedLM, BertForNextSentencePrediction,
|
||||
BertForSequenceClassification, BertForQuestionAnswering)
|
||||
BertForSequenceClassification, BertForMultipleChoice,
|
||||
BertForTokenClassification, BertForQuestionAnswering,
|
||||
load_tf_weights_in_bert)
|
||||
from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
|
||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
|
||||
load_tf_weights_in_openai_gpt)
|
||||
from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
|
||||
load_tf_weights_in_transfo_xl)
|
||||
from .modeling_gpt2 import (GPT2Config, GPT2Model,
|
||||
GPT2LMHeadModel, GPT2DoubleHeadsModel,
|
||||
load_tf_weights_in_gpt2)
|
||||
|
||||
from .optimization import BertAdam
|
||||
from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from .optimization_openai import OpenAIAdam
|
||||
|
||||
from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path
|
||||
|
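With the enlarged package __init__ above, the OpenAI GPT, Transformer-XL and GPT-2 classes are importable from the package root. A minimal usage sketch, assuming the standard 'gpt2' pretrained shortcut is reachable for download:

from pytorch_pretrained_bert import GPT2Tokenizer, GPT2LMHeadModel

# Both calls download and cache the pretrained files under PYTORCH_PRETRAINED_BERT_CACHE.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()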
@@ -1,19 +1,83 @@
|
||||
# coding: utf8
|
||||
if __name__ == '__main__':
|
||||
def main():
|
||||
import sys
|
||||
try:
|
||||
from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
|
||||
except ModuleNotFoundError:
|
||||
print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
||||
"In that case, it requires TensorFlow to be installed. Please see "
|
||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||
raise
|
||||
|
||||
if len(sys.argv) != 5:
|
||||
# pylint: disable=line-too-long
|
||||
print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
|
||||
if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [
|
||||
"convert_tf_checkpoint_to_pytorch",
|
||||
"convert_openai_checkpoint",
|
||||
"convert_transfo_xl_checkpoint",
|
||||
"convert_gpt2_checkpoint",
|
||||
]:
|
||||
print(
|
||||
"Should be used as one of: \n"
|
||||
">> `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
|
||||
">> `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
|
||||
">> `pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
|
||||
">> `pytorch_pretrained_bert convert_gpt2_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]`")
|
||||
else:
|
||||
PYTORCH_DUMP_OUTPUT = sys.argv.pop()
|
||||
TF_CONFIG = sys.argv.pop()
|
||||
TF_CHECKPOINT = sys.argv.pop()
|
||||
convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
|
||||
if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
|
||||
try:
|
||||
from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
|
||||
except ImportError:
|
||||
print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
||||
"In that case, it requires TensorFlow to be installed. Please see "
|
||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||
raise
|
||||
|
||||
if len(sys.argv) != 5:
|
||||
# pylint: disable=line-too-long
|
||||
print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
|
||||
else:
|
||||
PYTORCH_DUMP_OUTPUT = sys.argv.pop()
|
||||
TF_CONFIG = sys.argv.pop()
|
||||
TF_CHECKPOINT = sys.argv.pop()
|
||||
convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
|
||||
elif sys.argv[1] == "convert_openai_checkpoint":
|
||||
from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
|
||||
OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
|
||||
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
||||
if len(sys.argv) == 5:
|
||||
OPENAI_GPT_CONFIG = sys.argv[4]
|
||||
else:
|
||||
OPENAI_GPT_CONFIG = ""
|
||||
convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
|
||||
OPENAI_GPT_CONFIG,
|
||||
PYTORCH_DUMP_OUTPUT)
|
||||
elif sys.argv[1] == "convert_transfo_xl_checkpoint":
|
||||
try:
|
||||
from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
|
||||
except ImportError:
|
||||
print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
||||
"In that case, it requires TensorFlow to be installed. Please see "
|
||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||
raise
|
||||
|
||||
if 'ckpt' in sys.argv[2].lower():
|
||||
TF_CHECKPOINT = sys.argv[2]
|
||||
TF_DATASET_FILE = ""
|
||||
else:
|
||||
TF_DATASET_FILE = sys.argv[2]
|
||||
TF_CHECKPOINT = ""
|
||||
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
||||
if len(sys.argv) == 5:
|
||||
TF_CONFIG = sys.argv[4]
|
||||
else:
|
||||
TF_CONFIG = ""
|
||||
convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
|
||||
else:
|
||||
try:
|
||||
from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
|
||||
except ImportError:
|
||||
print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
|
||||
"In that case, it requires TensorFlow to be installed. Please see "
|
||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||
raise
|
||||
|
||||
TF_CHECKPOINT = sys.argv[2]
|
||||
PYTORCH_DUMP_OUTPUT = sys.argv[3]
|
||||
if len(sys.argv) == 5:
|
||||
TF_CONFIG = sys.argv[4]
|
||||
else:
|
||||
TF_CONFIG = ""
|
||||
convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
72
pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py
Executable file
@@ -0,0 +1,72 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The HugginFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Convert OpenAI GPT checkpoint."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
from io import open
|
||||
|
||||
import torch
|
||||
|
||||
from pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
|
||||
GPT2Config,
|
||||
GPT2Model,
|
||||
load_tf_weights_in_gpt2)
|
||||
|
||||
|
||||
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
|
||||
# Construct model
|
||||
if gpt2_config_file == "":
|
||||
config = GPT2Config()
|
||||
else:
|
||||
config = GPT2Config(gpt2_config_file)
|
||||
model = GPT2Model(config)
|
||||
|
||||
# Load weights from numpy
|
||||
load_tf_weights_in_gpt2(model, gpt2_checkpoint_path)
|
||||
|
||||
# Save pytorch-model
|
||||
pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
|
||||
pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
|
||||
print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
|
||||
torch.save(model.state_dict(), pytorch_weights_dump_path)
|
||||
print("Save configuration file to {}".format(pytorch_config_dump_path))
|
||||
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
|
||||
f.write(config.to_json_string())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
## Required parameters
|
||||
parser.add_argument("--gpt2_checkpoint_path",
|
||||
default = None,
|
||||
type = str,
|
||||
required = True,
|
||||
help = "Path the TensorFlow checkpoint path.")
|
||||
parser.add_argument("--pytorch_dump_folder_path",
|
||||
default = None,
|
||||
type = str,
|
||||
required = True,
|
||||
help = "Path to the output PyTorch model.")
|
||||
parser.add_argument("--gpt2_config_file",
|
||||
default = "",
|
||||
type = str,
|
||||
help = "An optional config json file corresponding to the pre-trained OpenAI model. \n"
|
||||
"This specifies the model architecture.")
|
||||
args = parser.parse_args()
|
||||
convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path,
|
||||
args.gpt2_config_file,
|
||||
args.pytorch_dump_folder_path)
|
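The converter can also be called without the argparse wrapper; a sketch with hypothetical paths (the output folder must already exist, since the function only writes files into it):

from pytorch_pretrained_bert.convert_gpt2_checkpoint_to_pytorch import (
    convert_gpt2_checkpoint_to_pytorch)

convert_gpt2_checkpoint_to_pytorch(
    gpt2_checkpoint_path='/tmp/gpt2-tf/model.ckpt',   # hypothetical TF checkpoint
    gpt2_config_file='',                              # '' falls back to the default GPT2Config()
    pytorch_dump_folder_path='/tmp/gpt2-pytorch')     # hypothetical output folder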
72
pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
Executable file
@@ -0,0 +1,72 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The HugginFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Convert OpenAI GPT checkpoint."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
from io import open
|
||||
|
||||
import torch
|
||||
|
||||
from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
|
||||
OpenAIGPTConfig,
|
||||
OpenAIGPTModel,
|
||||
load_tf_weights_in_openai_gpt)
|
||||
|
||||
|
||||
def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
|
||||
# Construct model
|
||||
if openai_config_file == "":
|
||||
config = OpenAIGPTConfig()
|
||||
else:
|
||||
config = OpenAIGPTConfig(openai_config_file)
|
||||
model = OpenAIGPTModel(config)
|
||||
|
||||
# Load weights from numpy
|
||||
load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path)
|
||||
|
||||
# Save pytorch-model
|
||||
pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
|
||||
pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
|
||||
print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
|
||||
torch.save(model.state_dict(), pytorch_weights_dump_path)
|
||||
print("Save configuration file to {}".format(pytorch_config_dump_path))
|
||||
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
|
||||
f.write(config.to_json_string())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
## Required parameters
|
||||
parser.add_argument("--openai_checkpoint_folder_path",
|
||||
default = None,
|
||||
type = str,
|
||||
required = True,
|
||||
help = "Path the TensorFlow checkpoint path.")
|
||||
parser.add_argument("--pytorch_dump_folder_path",
|
||||
default = None,
|
||||
type = str,
|
||||
required = True,
|
||||
help = "Path to the output PyTorch model.")
|
||||
parser.add_argument("--openai_config_file",
|
||||
default = "",
|
||||
type = str,
|
||||
help = "An optional config json file corresponding to the pre-trained OpenAI model. \n"
|
||||
"This specifies the model architecture.")
|
||||
args = parser.parse_args()
|
||||
convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path,
|
||||
args.openai_config_file,
|
||||
args.pytorch_dump_folder_path)
|
@@ -25,62 +25,16 @@ import tensorflow as tf
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from .modeling import BertConfig, BertForPreTraining
|
||||
from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert
|
||||
|
||||
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
|
||||
config_path = os.path.abspath(bert_config_file)
|
||||
tf_path = os.path.abspath(tf_checkpoint_path)
|
||||
print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path))
|
||||
# Load weights from TF model
|
||||
init_vars = tf.train.list_variables(tf_path)
|
||||
names = []
|
||||
arrays = []
|
||||
for name, shape in init_vars:
|
||||
print("Loading TF weight {} with shape {}".format(name, shape))
|
||||
array = tf.train.load_variable(tf_path, name)
|
||||
names.append(name)
|
||||
arrays.append(array)
|
||||
|
||||
# Initialise PyTorch model
|
||||
config = BertConfig.from_json_file(bert_config_file)
|
||||
print("Building PyTorch model from configuration: {}".format(str(config)))
|
||||
model = BertForPreTraining(config)
|
||||
|
||||
for name, array in zip(names, arrays):
|
||||
name = name.split('/')
|
||||
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
|
||||
# which are not required for using pretrained model
|
||||
if name[-1] in ["adam_v", "adam_m"]:
|
||||
print("Skipping {}".format("/".join(name)))
|
||||
continue
|
||||
pointer = model
|
||||
for m_name in name:
|
||||
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
|
||||
l = re.split(r'_(\d+)', m_name)
|
||||
else:
|
||||
l = [m_name]
|
||||
if l[0] == 'kernel':
|
||||
pointer = getattr(pointer, 'weight')
|
||||
elif l[0] == 'output_bias':
|
||||
pointer = getattr(pointer, 'bias')
|
||||
elif l[0] == 'output_weights':
|
||||
pointer = getattr(pointer, 'weight')
|
||||
else:
|
||||
pointer = getattr(pointer, l[0])
|
||||
if len(l) >= 2:
|
||||
num = int(l[1])
|
||||
pointer = pointer[num]
|
||||
if m_name[-11:] == '_embeddings':
|
||||
pointer = getattr(pointer, 'weight')
|
||||
elif m_name == 'kernel':
|
||||
array = np.transpose(array)
|
||||
try:
|
||||
assert pointer.shape == array.shape
|
||||
except AssertionError as e:
|
||||
e.args += (pointer.shape, array.shape)
|
||||
raise
|
||||
print("Initialize PyTorch weight {}".format(name))
|
||||
pointer.data = torch.from_numpy(array)
|
||||
# Load weights from tf checkpoint
|
||||
load_tf_weights_in_bert(model, tf_checkpoint_path)
|
||||
|
||||
# Save pytorch-model
|
||||
print("Save PyTorch model to {}".format(pytorch_dump_path))
|
||||
|
116
pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
Executable file
@@ -0,0 +1,116 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The HugginFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Convert Transformer XL checkpoint and datasets."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import torch
|
||||
|
||||
import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils
|
||||
from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME,
|
||||
WEIGHTS_NAME,
|
||||
TransfoXLConfig,
|
||||
TransfoXLLMHeadModel,
|
||||
load_tf_weights_in_transfo_xl)
|
||||
from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME,
|
||||
VOCAB_NAME)
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
import cPickle as pickle
|
||||
else:
|
||||
import pickle
|
||||
|
||||
# We do this to be able to load python 2 datasets pickles
|
||||
# See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918
|
||||
data_utils.Vocab = data_utils.TransfoXLTokenizer
|
||||
data_utils.Corpus = data_utils.TransfoXLCorpus
|
||||
sys.modules['data_utils'] = data_utils
|
||||
sys.modules['vocabulary'] = data_utils

def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
                                             transfo_xl_config_file,
                                             pytorch_dump_folder_path,
                                             transfo_xl_dataset_file):
    if transfo_xl_dataset_file:
        # Convert a pre-processed corpus (see original TensorFlow repo)
        with open(transfo_xl_dataset_file, "rb") as fp:
            corpus = pickle.load(fp, encoding="latin1")
        # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
        pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME
        print("Save vocabulary to {}".format(pytorch_vocab_dump_path))
        corpus_vocab_dict = corpus.vocab.__dict__
        torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)

        corpus_dict_no_vocab = corpus.__dict__
        corpus_dict_no_vocab.pop('vocab', None)
        pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME
        print("Save dataset to {}".format(pytorch_dataset_dump_path))
        torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)

    if tf_checkpoint_path:
        # Convert a pre-trained TensorFlow model
        config_path = os.path.abspath(transfo_xl_config_file)
        tf_path = os.path.abspath(tf_checkpoint_path)

        print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))
        # Initialise PyTorch model
        if transfo_xl_config_file == "":
            config = TransfoXLConfig()
        else:
            config = TransfoXLConfig(transfo_xl_config_file)
        print("Building PyTorch model from configuration: {}".format(str(config)))
        model = TransfoXLLMHeadModel(config)

        model = load_tf_weights_in_transfo_xl(model, config, tf_path)
        # Save pytorch-model
        pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
        pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
        print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
        torch.save(model.state_dict(), pytorch_weights_dump_path)
        print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
        with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
            f.write(config.to_json_string())


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--pytorch_dump_folder_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the folder to store the PyTorch model or dataset/vocab.")
    parser.add_argument("--tf_checkpoint_path",
                        default="",
                        type=str,
                        help="An optional path to a TensorFlow checkpoint path to be converted.")
    parser.add_argument("--transfo_xl_config_file",
                        default="",
                        type=str,
                        help="An optional config json file corresponding to the pre-trained Transformer XL model. \n"
                             "This specifies the model architecture.")
    parser.add_argument("--transfo_xl_dataset_file",
                        default="",
                        type=str,
                        help="An optional dataset file to be converted into a vocabulary.")
    args = parser.parse_args()
    convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                             args.transfo_xl_config_file,
                                             args.pytorch_dump_folder_path,
                                             args.transfo_xl_dataset_file)
@ -3,31 +3,40 @@ Utilities for working with the local dataset cache.
|
||||
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
|
||||
Copyright by the AllenNLP authors.
|
||||
"""
|
||||
from __future__ import (absolute_import, division, print_function, unicode_literals)
|
||||
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import json
|
||||
from urllib.parse import urlparse
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple, Union, IO, Callable, Set
|
||||
from hashlib import sha256
|
||||
from functools import wraps
|
||||
|
||||
from tqdm import tqdm
|
||||
from hashlib import sha256
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
import requests
|
||||
from botocore.exceptions import ClientError
|
||||
from tqdm import tqdm
|
||||
|
||||
try:
|
||||
from urllib.parse import urlparse
|
||||
except ImportError:
|
||||
from urlparse import urlparse
|
||||
|
||||
try:
|
||||
from pathlib import Path
|
||||
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
|
||||
Path.home() / '.pytorch_pretrained_bert'))
|
||||
except AttributeError:
|
||||
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
|
||||
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
|
||||
|
||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
|
||||
Path.home() / '.pytorch_pretrained_bert'))
|
||||
|
||||
|
||||
def url_to_filename(url: str, etag: str = None) -> str:
|
||||
def url_to_filename(url, etag=None):
|
||||
"""
|
||||
Convert `url` into a hashed filename in a repeatable way.
|
||||
If `etag` is specified, append its hash to the url's, delimited
|
||||
@ -45,23 +54,25 @@ def url_to_filename(url: str, etag: str = None) -> str:
|
||||
return filename
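The body of `url_to_filename` is elided by the hunk above; the following is only a sketch of the hashing scheme the docstring describes (the delimiter and exact layout are assumptions, not copied from the implementation):

```python
from hashlib import sha256

def url_to_filename_sketch(url, etag=None):
    # hash the URL; if an ETag is given, append its hash as well
    filename = sha256(url.encode('utf-8')).hexdigest()
    if etag:
        filename += '.' + sha256(etag.encode('utf-8')).hexdigest()
    return filename

print(url_to_filename_sketch("https://example.com/model.bin", etag='"abc123"'))
```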
|
||||
|
||||
|
||||
def filename_to_url(filename: str, cache_dir: str = None) -> Tuple[str, str]:
|
||||
def filename_to_url(filename, cache_dir=None):
|
||||
"""
|
||||
Return the url and etag (which may be ``None``) stored for `filename`.
|
||||
Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist.
|
||||
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
|
||||
"""
|
||||
if cache_dir is None:
|
||||
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
|
||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||
cache_dir = str(cache_dir)
|
||||
|
||||
cache_path = os.path.join(cache_dir, filename)
|
||||
if not os.path.exists(cache_path):
|
||||
raise FileNotFoundError("file {} not found".format(cache_path))
|
||||
raise EnvironmentError("file {} not found".format(cache_path))
|
||||
|
||||
meta_path = cache_path + '.json'
|
||||
if not os.path.exists(meta_path):
|
||||
raise FileNotFoundError("file {} not found".format(meta_path))
|
||||
raise EnvironmentError("file {} not found".format(meta_path))
|
||||
|
||||
with open(meta_path) as meta_file:
|
||||
with open(meta_path, encoding="utf-8") as meta_file:
|
||||
metadata = json.load(meta_file)
|
||||
url = metadata['url']
|
||||
etag = metadata['etag']
|
||||
@ -69,7 +80,7 @@ def filename_to_url(filename: str, cache_dir: str = None) -> Tuple[str, str]:
|
||||
return url, etag
|
||||
|
||||
|
||||
def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str:
|
||||
def cached_path(url_or_filename, cache_dir=None):
|
||||
"""
|
||||
Given something that might be a URL (or might be a local path),
|
||||
determine which. If it's a URL, download the file and cache it, and
|
||||
@ -78,8 +89,10 @@ def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str
|
||||
"""
|
||||
if cache_dir is None:
|
||||
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
|
||||
if isinstance(url_or_filename, Path):
|
||||
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
|
||||
url_or_filename = str(url_or_filename)
|
||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||
cache_dir = str(cache_dir)
|
||||
|
||||
parsed = urlparse(url_or_filename)
|
||||
|
||||
@ -91,13 +104,13 @@ def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str
|
||||
return url_or_filename
|
||||
elif parsed.scheme == '':
|
||||
# File, but it doesn't exist.
|
||||
raise FileNotFoundError("file {} not found".format(url_or_filename))
|
||||
raise EnvironmentError("file {} not found".format(url_or_filename))
|
||||
else:
|
||||
# Something unknown
|
||||
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
|
||||
|
||||
|
||||
def split_s3_path(url: str) -> Tuple[str, str]:
|
||||
def split_s3_path(url):
|
||||
"""Split a full s3 path into the bucket name and path."""
|
||||
parsed = urlparse(url)
|
||||
if not parsed.netloc or not parsed.path:
|
||||
@ -110,19 +123,19 @@ def split_s3_path(url: str) -> Tuple[str, str]:
|
||||
return bucket_name, s3_path
|
||||
|
||||
|
||||
def s3_request(func: Callable):
|
||||
def s3_request(func):
|
||||
"""
|
||||
Wrapper function for s3 requests in order to create more helpful error
|
||||
messages.
|
||||
"""
|
||||
|
||||
@wraps(func)
|
||||
def wrapper(url: str, *args, **kwargs):
|
||||
def wrapper(url, *args, **kwargs):
|
||||
try:
|
||||
return func(url, *args, **kwargs)
|
||||
except ClientError as exc:
|
||||
if int(exc.response["Error"]["Code"]) == 404:
|
||||
raise FileNotFoundError("file {} not found".format(url))
|
||||
raise EnvironmentError("file {} not found".format(url))
|
||||
else:
|
||||
raise
|
||||
|
||||
@ -130,7 +143,7 @@ def s3_request(func: Callable):
|
||||
|
||||
|
||||
@s3_request
|
||||
def s3_etag(url: str) -> Optional[str]:
|
||||
def s3_etag(url):
|
||||
"""Check ETag on S3 object."""
|
||||
s3_resource = boto3.resource("s3")
|
||||
bucket_name, s3_path = split_s3_path(url)
|
||||
@ -139,14 +152,14 @@ def s3_etag(url: str) -> Optional[str]:
|
||||
|
||||
|
||||
@s3_request
|
||||
def s3_get(url: str, temp_file: IO) -> None:
|
||||
def s3_get(url, temp_file):
|
||||
"""Pull a file directly from S3."""
|
||||
s3_resource = boto3.resource("s3")
|
||||
bucket_name, s3_path = split_s3_path(url)
|
||||
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
|
||||
|
||||
|
||||
def http_get(url: str, temp_file: IO) -> None:
|
||||
def http_get(url, temp_file):
|
||||
req = requests.get(url, stream=True)
|
||||
content_length = req.headers.get('Content-Length')
|
||||
total = int(content_length) if content_length is not None else None
|
||||
@ -158,15 +171,18 @@ def http_get(url: str, temp_file: IO) -> None:
|
||||
progress.close()
|
||||
|
||||
|
||||
def get_from_cache(url: str, cache_dir: str = None) -> str:
|
||||
def get_from_cache(url, cache_dir=None):
|
||||
"""
|
||||
Given a URL, look for the corresponding dataset in the local cache.
|
||||
If it's not there, download it. Then return the path to the cached file.
|
||||
"""
|
||||
if cache_dir is None:
|
||||
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
|
||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||
cache_dir = str(cache_dir)
|
||||
|
||||
os.makedirs(cache_dir, exist_ok=True)
|
||||
if not os.path.exists(cache_dir):
|
||||
os.makedirs(cache_dir)
|
||||
|
||||
# Get eTag to add to filename, if it exists.
|
||||
if url.startswith("s3://"):
|
||||
@ -207,7 +223,7 @@ def get_from_cache(url: str, cache_dir: str = None) -> str:
|
||||
logger.info("creating metadata file for %s", cache_path)
|
||||
meta = {'url': url, 'etag': etag}
|
||||
meta_path = cache_path + '.json'
|
||||
with open(meta_path, 'w') as meta_file:
|
||||
with open(meta_path, 'w', encoding="utf-8") as meta_file:
|
||||
json.dump(meta, meta_file)
|
||||
|
||||
logger.info("removing temp file %s", temp_file.name)
|
||||
@ -215,19 +231,19 @@ def get_from_cache(url: str, cache_dir: str = None) -> str:
|
||||
return cache_path
|
||||
|
||||
|
||||
def read_set_from_file(filename: str) -> Set[str]:
|
||||
def read_set_from_file(filename):
|
||||
'''
|
||||
Extract a de-duped collection (set) of text from a file.
|
||||
Expected file format is one item per line.
|
||||
'''
|
||||
collection = set()
|
||||
with open(filename, 'r') as file_:
|
||||
with open(filename, 'r', encoding='utf-8') as file_:
|
||||
for line in file_:
|
||||
collection.add(line.rstrip())
|
||||
return collection
|
||||
|
||||
|
||||
def get_file_extension(path: str, dot=True, lower: bool = True):
|
||||
def get_file_extension(path, dot=True, lower=True):
|
||||
ext = os.path.splitext(path)[1]
|
||||
ext = ext if dot else ext[1:]
|
||||
return ext.lower() if lower else ext
|
||||
|
@ -1,5 +1,6 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@ -14,18 +15,18 @@
|
||||
# limitations under the License.
|
||||
"""PyTorch BERT model."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import os
|
||||
import copy
|
||||
import json
|
||||
import math
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import shutil
|
||||
import tarfile
|
||||
import tempfile
|
||||
import shutil
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -33,25 +34,87 @@ from torch.nn import CrossEntropyLoss
|
||||
|
||||
from .file_utils import cached_path
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
|
||||
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
|
||||
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
|
||||
'bert-base-multilingual': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual.tar.gz",
|
||||
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
|
||||
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
|
||||
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
|
||||
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
|
||||
}
|
||||
CONFIG_NAME = 'bert_config.json'
|
||||
WEIGHTS_NAME = 'pytorch_model.bin'
|
||||
TF_WEIGHTS_NAME = 'model.ckpt'
|
||||
|
||||
def load_tf_weights_in_bert(model, tf_checkpoint_path):
|
||||
""" Load tf checkpoints in a pytorch model
|
||||
"""
|
||||
try:
|
||||
import re
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
except ImportError:
|
||||
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
|
||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||
raise
|
||||
tf_path = os.path.abspath(tf_checkpoint_path)
|
||||
print("Converting TensorFlow checkpoint from {}".format(tf_path))
|
||||
# Load weights from TF model
|
||||
init_vars = tf.train.list_variables(tf_path)
|
||||
names = []
|
||||
arrays = []
|
||||
for name, shape in init_vars:
|
||||
print("Loading TF weight {} with shape {}".format(name, shape))
|
||||
array = tf.train.load_variable(tf_path, name)
|
||||
names.append(name)
|
||||
arrays.append(array)
|
||||
|
||||
for name, array in zip(names, arrays):
|
||||
name = name.split('/')
|
||||
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
|
||||
# which are not required for using pretrained model
|
||||
if any(n in ["adam_v", "adam_m"] for n in name):
|
||||
print("Skipping {}".format("/".join(name)))
|
||||
continue
|
||||
pointer = model
|
||||
for m_name in name:
|
||||
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
|
||||
l = re.split(r'_(\d+)', m_name)
|
||||
else:
|
||||
l = [m_name]
|
||||
if l[0] == 'kernel' or l[0] == 'gamma':
|
||||
pointer = getattr(pointer, 'weight')
|
||||
elif l[0] == 'output_bias' or l[0] == 'beta':
|
||||
pointer = getattr(pointer, 'bias')
|
||||
elif l[0] == 'output_weights':
|
||||
pointer = getattr(pointer, 'weight')
|
||||
else:
|
||||
pointer = getattr(pointer, l[0])
|
||||
if len(l) >= 2:
|
||||
num = int(l[1])
|
||||
pointer = pointer[num]
|
||||
if m_name[-11:] == '_embeddings':
|
||||
pointer = getattr(pointer, 'weight')
|
||||
elif m_name == 'kernel':
|
||||
array = np.transpose(array)
|
||||
try:
|
||||
assert pointer.shape == array.shape
|
||||
except AssertionError as e:
|
||||
e.args += (pointer.shape, array.shape)
|
||||
raise
|
||||
print("Initialize PyTorch weight {}".format(name))
|
||||
pointer.data = torch.from_numpy(array)
|
||||
return model
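A hypothetical usage sketch for the loader above, following the pattern of the conversion scripts; it requires TensorFlow, and the checkpoint path is a placeholder:

```python
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForPreTraining(config)
model = load_tf_weights_in_bert(model, "/path/to/bert_model.ckpt")
```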
|
||||
|
||||
|
||||
def gelu(x):
|
||||
"""Implementation of the gelu activation function.
|
||||
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
||||
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
Also see https://arxiv.org/abs/1606.08415
|
||||
"""
|
||||
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
||||
|
||||
@ -102,8 +165,9 @@ class BertConfig(object):
|
||||
initializer_range: The stddev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
"""
|
||||
if isinstance(vocab_size_or_config_json_file, str):
|
||||
with open(vocab_size_or_config_json_file, "r") as reader:
|
||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
@ -134,7 +198,7 @@ class BertConfig(object):
|
||||
@classmethod
|
||||
def from_json_file(cls, json_file):
|
||||
"""Constructs a `BertConfig` from a json file of parameters."""
|
||||
with open(json_file, "r") as reader:
|
||||
with open(json_file, "r", encoding='utf-8') as reader:
|
||||
text = reader.read()
|
||||
return cls.from_dict(json.loads(text))
|
||||
|
||||
@ -150,22 +214,24 @@ class BertConfig(object):
|
||||
"""Serializes this instance to a JSON string."""
|
||||
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
||||
|
||||
try:
|
||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
|
||||
except ImportError:
|
||||
print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")
|
||||
class BertLayerNorm(nn.Module):
|
||||
def __init__(self, hidden_size, eps=1e-12):
|
||||
"""Construct a layernorm module in the TF style (epsilon inside the square root).
|
||||
"""
|
||||
super(BertLayerNorm, self).__init__()
|
||||
self.weight = nn.Parameter(torch.ones(hidden_size))
|
||||
self.bias = nn.Parameter(torch.zeros(hidden_size))
|
||||
self.variance_epsilon = eps
|
||||
|
||||
class BertLayerNorm(nn.Module):
|
||||
def __init__(self, config, variance_epsilon=1e-12):
|
||||
"""Construct a layernorm module in the TF style (epsilon inside the square root).
|
||||
"""
|
||||
super(BertLayerNorm, self).__init__()
|
||||
self.gamma = nn.Parameter(torch.ones(config.hidden_size))
|
||||
self.beta = nn.Parameter(torch.zeros(config.hidden_size))
|
||||
self.variance_epsilon = variance_epsilon
|
||||
|
||||
def forward(self, x):
|
||||
u = x.mean(-1, keepdim=True)
|
||||
s = (x - u).pow(2).mean(-1, keepdim=True)
|
||||
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
|
||||
return self.gamma * x + self.beta
|
||||
|
||||
def forward(self, x):
|
||||
u = x.mean(-1, keepdim=True)
|
||||
s = (x - u).pow(2).mean(-1, keepdim=True)
|
||||
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
|
||||
return self.weight * x + self.bias
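For reference, the fallback `BertLayerNorm` above (epsilon inside the square root, biased variance) computes the same normalization as `torch.nn.LayerNorm` with the same epsilon. A minimal check, assuming the non-apex class defined above is in scope:

```python
import torch
import torch.nn as nn

hidden_size = 8
bert_ln = BertLayerNorm(hidden_size, eps=1e-12)   # fallback class defined above
torch_ln = nn.LayerNorm(hidden_size, eps=1e-12)   # PyTorch's built-in layer norm

x = torch.randn(2, 3, hidden_size)
# With default affine parameters (weight=1, bias=0) both produce the same output
print(torch.allclose(bert_ln(x), torch_ln(x), atol=1e-6))  # True
```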
|
||||
|
||||
class BertEmbeddings(nn.Module):
|
||||
"""Construct the embeddings from word, position and token_type embeddings.
|
||||
@ -178,7 +244,7 @@ class BertEmbeddings(nn.Module):
|
||||
|
||||
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
|
||||
# any TensorFlow checkpoint file
|
||||
self.LayerNorm = BertLayerNorm(config)
|
||||
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None):
|
||||
@ -253,7 +319,7 @@ class BertSelfOutput(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(BertSelfOutput, self).__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
self.LayerNorm = BertLayerNorm(config)
|
||||
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
def forward(self, hidden_states, input_tensor):
|
||||
@ -279,8 +345,10 @@ class BertIntermediate(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(BertIntermediate, self).__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
||||
self.intermediate_act_fn = ACT2FN[config.hidden_act] \
|
||||
if isinstance(config.hidden_act, str) else config.hidden_act
|
||||
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
|
||||
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.intermediate_act_fn = config.hidden_act
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
@ -292,7 +360,7 @@ class BertOutput(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(BertOutput, self).__init__()
|
||||
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
||||
self.LayerNorm = BertLayerNorm(config)
|
||||
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
def forward(self, hidden_states, input_tensor):
|
||||
@ -320,7 +388,7 @@ class BertEncoder(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(BertEncoder, self).__init__()
|
||||
layer = BertLayer(config)
|
||||
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
|
||||
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
|
||||
|
||||
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
|
||||
all_encoder_layers = []
|
||||
@ -352,9 +420,11 @@ class BertPredictionHeadTransform(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(BertPredictionHeadTransform, self).__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
self.transform_act_fn = ACT2FN[config.hidden_act] \
|
||||
if isinstance(config.hidden_act, str) else config.hidden_act
|
||||
self.LayerNorm = BertLayerNorm(config)
|
||||
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
|
||||
self.transform_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.transform_act_fn = config.hidden_act
|
||||
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
@ -414,12 +484,12 @@ class BertPreTrainingHeads(nn.Module):
|
||||
return prediction_scores, seq_relationship_score
|
||||
|
||||
|
||||
class PreTrainedBertModel(nn.Module):
|
||||
class BertPreTrainedModel(nn.Module):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super(PreTrainedBertModel, self).__init__()
|
||||
super(BertPreTrainedModel, self).__init__()
|
||||
if not isinstance(config, BertConfig):
|
||||
raise ValueError(
|
||||
"Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
|
||||
@ -437,46 +507,55 @@ class PreTrainedBertModel(nn.Module):
|
||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||
elif isinstance(module, BertLayerNorm):
|
||||
module.beta.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||
module.gamma.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||
module.bias.data.zero_()
|
||||
module.weight.data.fill_(1.0)
|
||||
if isinstance(module, nn.Linear) and module.bias is not None:
|
||||
module.bias.data.zero_()
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None,
|
||||
from_tf=False, *inputs, **kwargs):
|
||||
"""
|
||||
Instantiate a PreTrainedBertModel from a pre-trained model file.
|
||||
Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
|
||||
Download and cache the pre-trained model file if needed.
|
||||
|
||||
|
||||
Params:
|
||||
pretrained_model_name: either:
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `bert-base-uncased`
|
||||
. `bert-large-uncased`
|
||||
. `bert-base-cased`
|
||||
. `bert-base-multilingual`
|
||||
. `bert-large-cased`
|
||||
. `bert-base-multilingual-uncased`
|
||||
. `bert-base-multilingual-cased`
|
||||
. `bert-base-chinese`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `bert_config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `bert_config.json` a configuration file for the model
|
||||
. `model.chkpt` a TensorFlow checkpoint
|
||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
||||
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
|
||||
*inputs, **kwargs: additional input for the specific Bert class
|
||||
(ex: num_labels for BertForSequenceClassification)
|
||||
"""
|
||||
if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
|
||||
archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
|
||||
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
|
||||
archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||
else:
|
||||
archive_file = pretrained_model_name
|
||||
archive_file = pretrained_model_name_or_path
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
|
||||
except FileNotFoundError:
|
||||
except EnvironmentError:
|
||||
logger.error(
|
||||
"Model name '{}' was not found in model name list ({}). "
|
||||
"We assumed '{}' was a path or url but couldn't find any file "
|
||||
"associated to this path or url.".format(
|
||||
pretrained_model_name,
|
||||
pretrained_model_name_or_path,
|
||||
', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
|
||||
pretrained_model_name))
|
||||
archive_file))
|
||||
return None
|
||||
if resolved_archive_file == archive_file:
|
||||
logger.info("loading archive file {}".format(archive_file))
|
||||
@ -484,7 +563,7 @@ class PreTrainedBertModel(nn.Module):
|
||||
logger.info("loading archive file {} from cache at {}".format(
|
||||
archive_file, resolved_archive_file))
|
||||
tempdir = None
|
||||
if os.path.isdir(resolved_archive_file):
|
||||
if os.path.isdir(resolved_archive_file) or from_tf:
|
||||
serialization_dir = resolved_archive_file
|
||||
else:
|
||||
# Extract archive to temp dir
|
||||
@ -500,8 +579,30 @@ class PreTrainedBertModel(nn.Module):
|
||||
logger.info("Model config {}".format(config))
|
||||
# Instantiate model.
|
||||
model = cls(config, *inputs, **kwargs)
|
||||
weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
|
||||
state_dict = torch.load(weights_path)
|
||||
if state_dict is None and not from_tf:
|
||||
weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
|
||||
state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None)
|
||||
if tempdir:
|
||||
# Clean up temp dir
|
||||
shutil.rmtree(tempdir)
|
||||
if from_tf:
|
||||
# Directly load from a TensorFlow checkpoint
|
||||
weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
|
||||
return load_tf_weights_in_bert(model, weights_path)
|
||||
# Load from a PyTorch state_dict
|
||||
old_keys = []
|
||||
new_keys = []
|
||||
for key in state_dict.keys():
|
||||
new_key = None
|
||||
if 'gamma' in key:
|
||||
new_key = key.replace('gamma', 'weight')
|
||||
if 'beta' in key:
|
||||
new_key = key.replace('beta', 'bias')
|
||||
if new_key:
|
||||
old_keys.append(key)
|
||||
new_keys.append(new_key)
|
||||
for old_key, new_key in zip(old_keys, new_keys):
|
||||
state_dict[new_key] = state_dict.pop(old_key)
|
||||
|
||||
missing_keys = []
|
||||
unexpected_keys = []
|
||||
@ -519,20 +620,23 @@ class PreTrainedBertModel(nn.Module):
|
||||
for name, child in module._modules.items():
|
||||
if child is not None:
|
||||
load(child, prefix + name + '.')
|
||||
load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
|
||||
start_prefix = ''
|
||||
if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()):
|
||||
start_prefix = 'bert.'
|
||||
load(model, prefix=start_prefix)
|
||||
if len(missing_keys) > 0:
|
||||
logger.info("Weights of {} not initialized from pretrained model: {}".format(
|
||||
model.__class__.__name__, missing_keys))
|
||||
if len(unexpected_keys) > 0:
|
||||
logger.info("Weights from pretrained model not used in {}: {}".format(
|
||||
model.__class__.__name__, unexpected_keys))
|
||||
if tempdir:
|
||||
# Clean up temp dir
|
||||
shutil.rmtree(tempdir)
|
||||
if len(error_msgs) > 0:
|
||||
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
|
||||
model.__class__.__name__, "\n\t".join(error_msgs)))
|
||||
return model
|
||||
|
||||
|
||||
class BertModel(PreTrainedBertModel):
|
||||
class BertModel(BertPreTrainedModel):
|
||||
"""BERT model ("Bidirectional Embedding Representations from a Transformer").
|
||||
|
||||
Params:
|
||||
@ -557,20 +661,20 @@ class BertModel(PreTrainedBertModel):
|
||||
of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
|
||||
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
|
||||
- `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
|
||||
to the last attention block,
|
||||
to the last attention block of shape [batch_size, sequence_length, hidden_size],
|
||||
`pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
|
||||
classifier pretrained on top of the hidden state associated to the first character of the
|
||||
input (`CLF`) to train on the Next-Sentence task (see BERT's paper).
|
||||
input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into WordPiece token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
|
||||
|
||||
config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
|
||||
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
|
||||
config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
|
||||
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
|
||||
|
||||
model = modeling.BertModel(config=config)
|
||||
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
|
||||
@ -615,7 +719,7 @@ class BertModel(PreTrainedBertModel):
|
||||
return encoded_layers, pooled_output
|
||||
|
||||
|
||||
class BertForPreTraining(PreTrainedBertModel):
|
||||
class BertForPreTraining(BertPreTrainedModel):
|
||||
"""BERT model with pre-training heads.
|
||||
This module comprises the BERT model followed by the two pre-training heads:
|
||||
- the masked language modeling head, and
|
||||
@ -635,10 +739,10 @@ class BertForPreTraining(PreTrainedBertModel):
|
||||
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
|
||||
input sequence length in the current batch. It's the mask that we typically use for attention when
|
||||
a batch has varying length sentences.
|
||||
`masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
|
||||
`masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
|
||||
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
|
||||
is only computed for the labels set in [0, ..., vocab_size]
|
||||
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
|
||||
`next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
|
||||
with indices selected in [0, 1].
|
||||
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
|
||||
|
||||
@ -648,18 +752,18 @@ class BertForPreTraining(PreTrainedBertModel):
|
||||
sentence classification loss.
|
||||
if `masked_lm_labels` or `next_sentence_label` is `None`:
|
||||
Outputs a tuple comprising
|
||||
- the masked language modeling logits, and
|
||||
- the next sentence classification logits.
|
||||
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
|
||||
- the next sentence classification logits of shape [batch_size, 2].
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into WordPiece token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
|
||||
|
||||
config = BertConfig(vocab_size=32000, hidden_size=512,
|
||||
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
|
||||
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
|
||||
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
|
||||
|
||||
model = BertForPreTraining(config)
|
||||
masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
|
||||
@ -678,7 +782,7 @@ class BertForPreTraining(PreTrainedBertModel):
|
||||
|
||||
if masked_lm_labels is not None and next_sentence_label is not None:
|
||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels(-1))
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
|
||||
total_loss = masked_lm_loss + next_sentence_loss
|
||||
return total_loss
|
||||
@ -686,7 +790,7 @@ class BertForPreTraining(PreTrainedBertModel):
|
||||
return prediction_scores, seq_relationship_score
|
||||
|
||||
|
||||
class BertForMaskedLM(PreTrainedBertModel):
|
||||
class BertForMaskedLM(BertPreTrainedModel):
|
||||
"""BERT model with the masked language modeling head.
|
||||
This module comprises the BERT model followed by the masked language modeling head.
|
||||
|
||||
@ -709,20 +813,20 @@ class BertForMaskedLM(PreTrainedBertModel):
|
||||
is only computed for the labels set in [0, ..., vocab_size]
|
||||
|
||||
Outputs:
|
||||
if `masked_lm_labels` is `None`:
|
||||
if `masked_lm_labels` is not `None`:
|
||||
Outputs the masked language modeling loss.
|
||||
if `masked_lm_labels` is `None`:
|
||||
Outputs the masked language modeling logits.
|
||||
Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into WordPiece token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
|
||||
|
||||
config = BertConfig(vocab_size=32000, hidden_size=512,
|
||||
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
|
||||
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
|
||||
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
|
||||
|
||||
model = BertForMaskedLM(config)
|
||||
masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
|
||||
@ -747,7 +851,7 @@ class BertForMaskedLM(PreTrainedBertModel):
|
||||
return prediction_scores
|
||||
|
||||
|
||||
class BertForNextSentencePrediction(PreTrainedBertModel):
|
||||
class BertForNextSentencePrediction(BertPreTrainedModel):
|
||||
"""BERT model with next sentence prediction head.
|
||||
This module comprises the BERT model followed by the next sentence classification head.
|
||||
|
||||
@ -774,7 +878,7 @@ class BertForNextSentencePrediction(PreTrainedBertModel):
|
||||
Outputs the total_loss which is the sum of the masked language modeling loss and the next
|
||||
sentence classification loss.
|
||||
if `next_sentence_label` is `None`:
|
||||
Outputs the next sentence classification logits.
|
||||
Outputs the next sentence classification logits of shape [batch_size, 2].
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
@ -783,8 +887,8 @@ class BertForNextSentencePrediction(PreTrainedBertModel):
|
||||
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
|
||||
|
||||
config = BertConfig(vocab_size=32000, hidden_size=512,
|
||||
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
|
||||
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
|
||||
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
|
||||
|
||||
model = BertForNextSentencePrediction(config)
|
||||
seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
|
||||
@ -809,7 +913,7 @@ class BertForNextSentencePrediction(PreTrainedBertModel):
|
||||
return seq_relationship_score
|
||||
|
||||
|
||||
class BertForSequenceClassification(PreTrainedBertModel):
|
||||
class BertForSequenceClassification(BertPreTrainedModel):
|
||||
"""BERT model for classification.
|
||||
This module is composed of the BERT model with a linear layer on top of
|
||||
the pooled output.
|
||||
@ -836,17 +940,17 @@ class BertForSequenceClassification(PreTrainedBertModel):
|
||||
if `labels` is not `None`:
|
||||
Outputs the CrossEntropy classification loss of the output with the labels.
|
||||
if `labels` is `None`:
|
||||
Outputs the classification logits.
|
||||
Outputs the classification logits of shape [batch_size, num_labels].
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into WordPiece token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
|
||||
|
||||
config = BertConfig(vocab_size=32000, hidden_size=512,
|
||||
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
|
||||
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
|
||||
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
|
||||
|
||||
num_labels = 2
|
||||
|
||||
@ -854,7 +958,7 @@ class BertForSequenceClassification(PreTrainedBertModel):
|
||||
logits = model(input_ids, token_type_ids, input_mask)
|
||||
```
|
||||
"""
|
||||
def __init__(self, config, num_labels=2):
|
||||
def __init__(self, config, num_labels):
|
||||
super(BertForSequenceClassification, self).__init__(config)
|
||||
self.num_labels = num_labels
|
||||
self.bert = BertModel(config)
|
||||
@ -870,26 +974,160 @@ class BertForSequenceClassification(PreTrainedBertModel):
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
return loss, logits
|
||||
return loss
|
||||
else:
|
||||
return logits
|
||||
|
||||
|
||||
class BertForQuestionAnswering(PreTrainedBertModel):
|
||||
class BertForMultipleChoice(BertPreTrainedModel):
|
||||
"""BERT model for multiple choice tasks.
|
||||
This module is composed of the BERT model with a linear layer on top of
|
||||
the pooled output.
|
||||
|
||||
Params:
|
||||
`config`: a BertConfig class instance with the configuration to build a new model.
|
||||
`num_choices`: the number of classes for the classifier. Default = 2.
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
|
||||
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
|
||||
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
|
||||
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
|
||||
with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
|
||||
and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
|
||||
`attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
|
||||
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
|
||||
input sequence length in the current batch. It's the mask that we typically use for attention when
|
||||
a batch has varying length sentences.
|
||||
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
|
||||
with indices selected in [0, ..., num_choices].
|
||||
|
||||
Outputs:
|
||||
if `labels` is not `None`:
|
||||
Outputs the CrossEntropy classification loss of the output with the labels.
|
||||
if `labels` is `None`:
|
||||
Outputs the classification logits of shape [batch_size, num_choices].
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into WordPiece token ids
|
||||
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
|
||||
input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
|
||||
token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
|
||||
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
|
||||
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
|
||||
|
||||
num_choices = 2
|
||||
|
||||
model = BertForMultipleChoice(config, num_choices)
|
||||
logits = model(input_ids, token_type_ids, input_mask)
|
||||
```
|
||||
"""
|
||||
def __init__(self, config, num_choices):
|
||||
super(BertForMultipleChoice, self).__init__(config)
|
||||
self.num_choices = num_choices
|
||||
self.bert = BertModel(config)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
self.classifier = nn.Linear(config.hidden_size, 1)
|
||||
self.apply(self.init_bert_weights)
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
|
||||
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
|
||||
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
|
||||
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
|
||||
_, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False)
|
||||
pooled_output = self.dropout(pooled_output)
|
||||
logits = self.classifier(pooled_output)
|
||||
reshaped_logits = logits.view(-1, self.num_choices)
|
||||
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(reshaped_logits, labels)
|
||||
return loss
|
||||
else:
|
||||
return reshaped_logits
|
||||
|
||||
|
||||
class BertForTokenClassification(BertPreTrainedModel):
|
||||
"""BERT model for token-level classification.
|
||||
This module is composed of the BERT model with a linear layer on top of
|
||||
the full hidden state of the last layer.
|
||||
|
||||
Params:
|
||||
`config`: a BertConfig class instance with the configuration to build a new model.
|
||||
`num_labels`: the number of classes for the classifier. Default = 2.
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
|
||||
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
|
||||
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
|
||||
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
|
||||
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
|
||||
a `sentence B` token (see BERT paper for more details).
|
||||
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
|
||||
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
|
||||
input sequence length in the current batch. It's the mask that we typically use for attention when
|
||||
a batch has varying length sentences.
|
||||
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
|
||||
with indices selected in [0, ..., num_labels].
|
||||
|
||||
Outputs:
|
||||
if `labels` is not `None`:
|
||||
Outputs the CrossEntropy classification loss of the output with the labels.
|
||||
if `labels` is `None`:
|
||||
Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into WordPiece token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
|
||||
|
||||
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
|
||||
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
|
||||
|
||||
num_labels = 2
|
||||
|
||||
model = BertForTokenClassification(config, num_labels)
|
||||
logits = model(input_ids, token_type_ids, input_mask)
|
||||
```
|
||||
"""
|
||||
def __init__(self, config, num_labels):
|
||||
super(BertForTokenClassification, self).__init__(config)
|
||||
self.num_labels = num_labels
|
||||
self.bert = BertModel(config)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
self.classifier = nn.Linear(config.hidden_size, num_labels)
|
||||
self.apply(self.init_bert_weights)
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
|
||||
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
|
||||
sequence_output = self.dropout(sequence_output)
|
||||
logits = self.classifier(sequence_output)
|
||||
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
# Only keep active parts of the loss
|
||||
if attention_mask is not None:
|
||||
active_loss = attention_mask.view(-1) == 1
|
||||
active_logits = logits.view(-1, self.num_labels)[active_loss]
|
||||
active_labels = labels.view(-1)[active_loss]
|
||||
loss = loss_fct(active_logits, active_labels)
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
return loss
|
||||
else:
|
||||
return logits
|
||||
|
||||
|
||||
class BertForQuestionAnswering(BertPreTrainedModel):
|
||||
"""BERT model for Question Answering (span extraction).
|
||||
This module is composed of the BERT model with a linear layer on top of
|
||||
the sequence output that computes start_logits and end_logits
|
||||
|
||||
Params:
|
||||
`config`: either
|
||||
- a BertConfig class instance with the configuration to build a new model, or
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `bert-base-uncased`
|
||||
. `bert-large-uncased`
|
||||
. `bert-base-cased`
|
||||
. `bert-base-multilingual`
|
||||
. `bert-base-chinese`
|
||||
The pre-trained model will be downloaded and cached if needed.
|
||||
`config`: a BertConfig class instance with the configuration to build a new model.
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
|
||||
@ -914,17 +1152,17 @@ class BertForQuestionAnswering(PreTrainedBertModel):
|
||||
Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
|
||||
if `start_positions` or `end_positions` is `None`:
|
||||
Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
|
||||
position tokens.
|
||||
position tokens of shape [batch_size, sequence_length].
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into WordPiece token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
|
||||
|
||||
config = BertConfig(vocab_size=32000, hidden_size=512,
|
||||
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
|
||||
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
|
||||
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
|
||||
|
||||
model = BertForQuestionAnswering(config)
|
||||
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
|
||||
|
684 pytorch_pretrained_bert/modeling_gpt2.py Normal file
@ -0,0 +1,684 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""PyTorch OpenAI GPT-2 model."""
|
||||
|
||||
import collections
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import shutil
|
||||
import tarfile
|
||||
import tempfile
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from .file_utils import cached_path
|
||||
from .modeling import BertLayerNorm as LayerNorm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin"}
|
||||
PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json"}
|
||||
|
||||
CONFIG_NAME = "config.json"
|
||||
WEIGHTS_NAME = "pytorch_model.bin"
|
||||
|
||||
def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
|
||||
""" Load tf checkpoints in a pytorch model
|
||||
"""
|
||||
try:
|
||||
import re
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
except ImportError:
|
||||
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
|
||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||
raise
|
||||
tf_path = os.path.abspath(gpt2_checkpoint_path)
|
||||
print("Converting TensorFlow checkpoint from {}".format(tf_path))
|
||||
# Load weights from TF model
|
||||
init_vars = tf.train.list_variables(tf_path)
|
||||
names = []
|
||||
arrays = []
|
||||
for name, shape in init_vars:
|
||||
print("Loading TF weight {} with shape {}".format(name, shape))
|
||||
array = tf.train.load_variable(tf_path, name)
|
||||
names.append(name)
|
||||
arrays.append(array.squeeze())
|
||||
|
||||
for name, array in zip(names, arrays):
|
||||
name = name[6:] # skip "model/"
|
||||
name = name.split('/')
|
||||
pointer = model
|
||||
for m_name in name:
|
||||
if re.fullmatch(r'[A-Za-z]+\d+', m_name):
|
||||
l = re.split(r'(\d+)', m_name)
|
||||
else:
|
||||
l = [m_name]
|
||||
if l[0] == 'w' or l[0] == 'g':
|
||||
pointer = getattr(pointer, 'weight')
|
||||
elif l[0] == 'b':
|
||||
pointer = getattr(pointer, 'bias')
|
||||
elif l[0] == 'wpe' or l[0] == 'wte':
|
||||
pointer = getattr(pointer, l[0])
|
||||
pointer = getattr(pointer, 'weight')
|
||||
else:
|
||||
pointer = getattr(pointer, l[0])
|
||||
if len(l) >= 2:
|
||||
num = int(l[1])
|
||||
pointer = pointer[num]
|
||||
try:
|
||||
assert pointer.shape == array.shape
|
||||
except AssertionError as e:
|
||||
e.args += (pointer.shape, array.shape)
|
||||
raise
|
||||
print("Initialize PyTorch weight {}".format(name))
|
||||
pointer.data = torch.from_numpy(array)
|
||||
return model
|
||||
|
||||
|
||||
def gelu(x):
|
||||
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
|
||||
|
||||
class GPT2Config(object):
|
||||
"""Configuration class to store the configuration of a `GPT2Model`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size_or_config_json_file=50257,
|
||||
n_positions=1024,
|
||||
n_ctx=1024,
|
||||
n_embd=768,
|
||||
n_layer=12,
|
||||
n_head=12,
|
||||
layer_norm_epsilon=1e-5,
|
||||
initializer_range=0.02,
|
||||
):
|
||||
"""Constructs GPT2Config.
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
||||
initializer_range: The stddev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
"""
|
||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.vocab_size = vocab_size_or_config_json_file
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
else:
|
||||
raise ValueError(
|
||||
"First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)"
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, json_object):
|
||||
"""Constructs a `GPT2Config` from a Python dictionary of parameters."""
|
||||
config = GPT2Config(vocab_size_or_config_json_file=-1)
|
||||
for key, value in json_object.items():
|
||||
config.__dict__[key] = value
|
||||
return config
|
||||
|
||||
@classmethod
|
||||
def from_json_file(cls, json_file):
|
||||
"""Constructs a `GPT2Config` from a json file of parameters."""
|
||||
with open(json_file, "r", encoding="utf-8") as reader:
|
||||
text = reader.read()
|
||||
return cls.from_dict(json.loads(text))
|
||||
|
||||
def __repr__(self):
|
||||
return str(self.to_json_string())
|
||||
|
||||
def to_dict(self):
|
||||
"""Serializes this instance to a Python dictionary."""
|
||||
output = copy.deepcopy(self.__dict__)
|
||||
return output
|
||||
|
||||
def to_json_string(self):
|
||||
"""Serializes this instance to a JSON string."""
|
||||
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
||||
|
||||
|
||||
class Conv1D(nn.Module):
|
||||
def __init__(self, nf, nx):
|
||||
super(Conv1D, self).__init__()
|
||||
self.nf = nf
|
||||
w = torch.empty(nx, nf)
|
||||
nn.init.normal_(w, std=0.02)
|
||||
self.weight = Parameter(w)
|
||||
self.bias = Parameter(torch.zeros(nf))
|
||||
|
||||
def forward(self, x):
|
||||
size_out = x.size()[:-1] + (self.nf,)
|
||||
x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
|
||||
x = x.view(*size_out)
|
||||
return x
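`Conv1D(nf, nx)` is effectively a linear layer whose weight is stored transposed, which matches the `(nx, nf)` layout of the OpenAI TensorFlow checkpoints. A minimal equivalence check (illustrative, assuming the `Conv1D` class above is in scope):

```python
import torch
import torch.nn as nn

nx, nf = 4, 6
conv = Conv1D(nf, nx)                 # class defined above; weight has shape (nx, nf)
linear = nn.Linear(nx, nf)            # standard layer; weight has shape (nf, nx)
with torch.no_grad():
    linear.weight.copy_(conv.weight.t())
    linear.bias.copy_(conv.bias)

x = torch.randn(2, 3, nx)
print(torch.allclose(conv(x), linear(x), atol=1e-6))  # True: same affine map
```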
|
||||
|
||||
|
||||
class Attention(nn.Module):
|
||||
def __init__(self, nx, n_ctx, config, scale=False):
|
||||
super(Attention, self).__init__()
|
||||
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
||||
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
|
||||
assert n_state % config.n_head == 0
|
||||
self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
|
||||
self.n_head = config.n_head
|
||||
self.split_size = n_state
|
||||
self.scale = scale
|
||||
self.c_attn = Conv1D(n_state * 3, nx)
|
||||
self.c_proj = Conv1D(n_state, nx)
|
||||
|
||||
def _attn(self, q, k, v):
|
||||
w = torch.matmul(q, k)
|
||||
if self.scale:
|
||||
w = w / math.sqrt(v.size(-1))
|
||||
nd, ns = w.size(-2), w.size(-1)
|
||||
b = self.bias[:, :, ns-nd:ns, :ns]
|
||||
w = w * b - 1e10 * (1 - b)
|
||||
|
||||
w = nn.Softmax(dim=-1)(w)
|
||||
return torch.matmul(w, v)
|
||||
|
||||
def merge_heads(self, x):
|
||||
x = x.permute(0, 2, 1, 3).contiguous()
|
||||
new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
|
||||
return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
|
||||
|
||||
def split_heads(self, x, k=False):
|
||||
new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
|
||||
x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
|
||||
if k:
|
||||
return x.permute(0, 2, 3, 1) # (batch, head, head_features, seq_length)
|
||||
else:
|
||||
return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
|
||||
|
||||
def forward(self, x, layer_past=None):
|
||||
x = self.c_attn(x)
|
||||
query, key, value = x.split(self.split_size, dim=2)
|
||||
query = self.split_heads(query)
|
||||
key = self.split_heads(key, k=True)
|
||||
value = self.split_heads(value)
|
||||
if layer_past is not None:
|
||||
past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1] # transpose back cf below
|
||||
key = torch.cat((past_key, key), dim=-1)
|
||||
value = torch.cat((past_value, value), dim=-2)
|
||||
present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking
|
||||
a = self._attn(query, key, value)
|
||||
a = self.merge_heads(a)
|
||||
a = self.c_proj(a)
|
||||
return a, present
|
||||
|
||||
|
||||
class MLP(nn.Module):
|
||||
def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
|
||||
super(MLP, self).__init__()
|
||||
nx = config.n_embd
|
||||
self.c_fc = Conv1D(n_state, nx)
|
||||
self.c_proj = Conv1D(nx, n_state)
|
||||
self.act = gelu
|
||||
|
||||
def forward(self, x):
|
||||
h = self.act(self.c_fc(x))
|
||||
h2 = self.c_proj(h)
|
||||
return h2
|
||||
|
||||
|
||||
class Block(nn.Module):
|
||||
def __init__(self, n_ctx, config, scale=False):
|
||||
super(Block, self).__init__()
|
||||
nx = config.n_embd
|
||||
self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||
self.attn = Attention(nx, n_ctx, config, scale)
|
||||
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||
self.mlp = MLP(4 * nx, config)
|
||||
|
||||
def forward(self, x, layer_past=None):
|
||||
a, present = self.attn(self.ln_1(x), layer_past=layer_past)
|
||||
x = x + a
|
||||
m = self.mlp(self.ln_2(x))
|
||||
x = x + m
|
||||
return x, present
|
||||
|
||||
|
||||
class GPT2LMHead(nn.Module):
|
||||
""" Language Model Head for the transformer """
|
||||
|
||||
def __init__(self, model_embeddings_weights, config):
|
||||
super(GPT2LMHead, self).__init__()
|
||||
self.n_embd = config.n_embd
|
||||
self.set_embeddings_weights(model_embeddings_weights)
|
||||
|
||||
def set_embeddings_weights(self, model_embeddings_weights):
|
||||
embed_shape = model_embeddings_weights.shape
|
||||
self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
|
||||
self.decoder.weight = model_embeddings_weights # Tied weights
|
||||
|
||||
def forward(self, hidden_state):
|
||||
# Truncated Language modeling logits (we remove the last token)
|
||||
# h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
|
||||
lm_logits = self.decoder(hidden_state)
|
||||
return lm_logits
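The head above does not allocate its own projection: the decoder reuses the embedding Parameter directly. A small illustration (hypothetical embedding; default config values assumed):

```python
import torch.nn as nn

config = GPT2Config()                          # n_embd defaults assumed
wte = nn.Embedding(50257, config.n_embd)
lm_head = GPT2LMHead(wte.weight, config)
assert lm_head.decoder.weight is wte.weight    # same Parameter object, no copy
```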
|
||||
|
||||
|
||||
class GPT2MultipleChoiceHead(nn.Module):
|
||||
""" Classifier Head for the transformer """
|
||||
|
||||
def __init__(self, config):
|
||||
super(GPT2MultipleChoiceHead, self).__init__()
|
||||
self.n_embd = config.n_embd
|
||||
self.linear = nn.Linear(config.n_embd, 1)
|
||||
|
||||
nn.init.normal_(self.linear.weight, std=0.02)
|
||||
nn.init.normal_(self.linear.bias, 0)
|
||||
|
||||
def forward(self, hidden_states, mc_token_ids):
|
||||
# Classification logits
|
||||
# hidden_state (bsz, num_choices, seq_length, hidden_size)
|
||||
# mc_token_ids (bsz, num_choices)
|
||||
mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
|
||||
# (bsz, num_choices, 1, hidden_size)
|
||||
multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
|
||||
# (bsz, num_choices, hidden_size)
|
||||
multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
|
||||
# (bsz, num_choices)
|
||||
return multiple_choice_logits
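The `gather` above is the only non-obvious step, so here is the same indexing spelled out on toy tensors (all names and sizes below are illustrative):

```python
import torch

bsz, num_choices, seq_len, hidden = 2, 3, 5, 8
hidden_states = torch.randn(bsz, num_choices, seq_len, hidden)
mc_token_ids = torch.LongTensor([[4, 2, 0], [1, 3, 4]])    # one time-step index per choice
idx = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden)
picked = hidden_states.gather(2, idx).squeeze(2)           # (bsz, num_choices, hidden)
assert picked.shape == (bsz, num_choices, hidden)
assert torch.equal(picked[0, 1], hidden_states[0, 1, 2])   # choice 1 keeps token 2
```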
|
||||
|
||||
|
||||
class GPT2PreTrainedModel(nn.Module):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super(GPT2PreTrainedModel, self).__init__()
|
||||
if not isinstance(config, GPT2Config):
|
||||
raise ValueError(
|
||||
"Parameter config in `{}(config)` should be an instance of class `GPT2Config`. "
|
||||
"To create a model from a pretrained model use "
|
||||
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
|
||||
self.__class__.__name__, self.__class__.__name__
|
||||
)
|
||||
)
|
||||
self.config = config
|
||||
|
||||
def set_tied(self):
|
||||
pass
|
||||
|
||||
def init_weights(self, module):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
if isinstance(module, (nn.Linear, nn.Embedding)):
|
||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||
elif isinstance(module, LayerNorm):
|
||||
module.bias.data.zero_()
|
||||
module.weight.data.fill_(1.0)
|
||||
if isinstance(module, nn.Linear) and module.bias is not None:
|
||||
module.bias.data.zero_()
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs
|
||||
):
|
||||
"""
|
||||
Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict.
|
||||
Download and cache the pre-trained model file if needed.
|
||||
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `gpt2`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `gpt2_config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `gpt2_config.json` a configuration file for the model
|
||||
. a TensorFlow checkpoint with trained weights
|
||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
||||
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of the pre-trained model weights
|
||||
*inputs, **kwargs: additional inputs for the specific GPT-2 model class
|
||||
(ex: num_labels for BertForSequenceClassification)
|
||||
"""
|
||||
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
|
||||
archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||
config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||
else:
|
||||
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
|
||||
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
|
||||
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
|
||||
except EnvironmentError:
|
||||
logger.error(
|
||||
"Model name '{}' was not found in model name list ({}). "
|
||||
"We assumed '{}' was a path or url but couldn't find files {} and {} "
|
||||
"at this path or url.".format(
|
||||
pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
|
||||
archive_file, config_file
|
||||
)
|
||||
)
|
||||
return None
|
||||
if resolved_archive_file == archive_file and resolved_config_file == config_file:
|
||||
logger.info("loading weights file {}".format(archive_file))
|
||||
logger.info("loading configuration file {}".format(config_file))
|
||||
else:
|
||||
logger.info("loading weights file {} from cache at {}".format(
|
||||
archive_file, resolved_archive_file))
|
||||
logger.info("loading configuration file {} from cache at {}".format(
|
||||
config_file, resolved_config_file))
|
||||
# Load config
|
||||
config = GPT2Config.from_json_file(resolved_config_file)
|
||||
logger.info("Model config {}".format(config))
|
||||
# Instantiate model.
|
||||
model = cls(config, *inputs, **kwargs)
|
||||
if state_dict is None and not from_tf:
|
||||
state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None)
|
||||
if from_tf:
|
||||
# Directly load from a TensorFlow checkpoint (stored as NumPy array)
|
||||
return load_tf_weights_in_gpt2(model, resolved_archive_file)
|
||||
|
||||
old_keys = []
|
||||
new_keys = []
|
||||
for key in state_dict.keys():
|
||||
new_key = None
|
||||
if key.endswith(".g"):
|
||||
new_key = key[:-2] + ".weight"
|
||||
elif key.endswith(".b"):
|
||||
new_key = key[:-2] + ".bias"
|
||||
elif key.endswith(".w"):
|
||||
new_key = key[:-2] + ".weight"
|
||||
if new_key:
|
||||
old_keys.append(key)
|
||||
new_keys.append(new_key)
|
||||
for old_key, new_key in zip(old_keys, new_keys):
|
||||
state_dict[new_key] = state_dict.pop(old_key)
|
||||
|
||||
missing_keys = []
|
||||
unexpected_keys = []
|
||||
error_msgs = []
|
||||
# copy state_dict so _load_from_state_dict can modify it
|
||||
metadata = getattr(state_dict, "_metadata", None)
|
||||
state_dict = state_dict.copy()
|
||||
if metadata is not None:
|
||||
state_dict._metadata = metadata
|
||||
|
||||
def load(module, prefix=""):
|
||||
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
|
||||
module._load_from_state_dict(
|
||||
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
|
||||
)
|
||||
for name, child in module._modules.items():
|
||||
if child is not None:
|
||||
load(child, prefix + name + ".")
|
||||
|
||||
start_model = model
|
||||
if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()):
|
||||
start_model = model.transformer
|
||||
load(start_model, prefix="")
|
||||
|
||||
if len(missing_keys) > 0:
|
||||
logger.info(
|
||||
"Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
|
||||
)
|
||||
if len(unexpected_keys) > 0:
|
||||
logger.info(
|
||||
"Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
|
||||
)
|
||||
if len(error_msgs) > 0:
|
||||
raise RuntimeError(
|
||||
"Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
|
||||
)
|
||||
|
||||
# Make sure we are still sharing the output and input embeddings after loading weights
|
||||
model.set_tied()
|
||||
return model
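Typical call sites for this loader, as a hedged sketch: the `"gpt2"` shortcut assumes the corresponding entry in this file's PRETRAINED_MODEL_ARCHIVE_MAP, and the local path is illustrative.

```python
# Download-and-cache path, using the hosted shortcut name:
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Local path, expected to contain the configuration and weights files
# under the names this module looks for (CONFIG_NAME / WEIGHTS_NAME):
model = GPT2LMHeadModel.from_pretrained("/path/to/gpt2-checkpoint")
```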
|
||||
|
||||
|
||||
class GPT2Model(GPT2PreTrainedModel):
|
||||
"""OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").
|
||||
|
||||
Params:
|
||||
config: a GPT2Config class instance with the configuration to build a new model
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
|
||||
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
|
||||
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
with the position indices (selected in the range [0, config.n_positions - 1[.
|
||||
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
You can use it to add a third type of embedding to each input token in the sequence
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
|
||||
Outputs:
|
||||
`hidden_states`: the encoded-hidden-states at the top of the model
|
||||
as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
|
||||
(or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into BPE token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
|
||||
config = modeling_gpt2.GPT2Config()
|
||||
|
||||
model = modeling_gpt2.GPT2Model(config)
|
||||
hidden_states = model(input_ids)
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
super(GPT2Model, self).__init__(config)
|
||||
self.wte = nn.Embedding(config.vocab_size, config.n_embd)
|
||||
self.wpe = nn.Embedding(config.n_positions, config.n_embd)
|
||||
block = Block(config.n_ctx, config, scale=True)
|
||||
self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
|
||||
self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
|
||||
if past is None:
|
||||
past_length = 0
|
||||
past = [None] * len(self.h)
|
||||
else:
|
||||
past_length = past[0][0].size(-2)
|
||||
if position_ids is None:
|
||||
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
|
||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
||||
|
||||
input_shape = input_ids.size()
|
||||
input_ids = input_ids.view(-1, input_ids.size(-1))
|
||||
position_ids = position_ids.view(-1, position_ids.size(-1))
|
||||
|
||||
inputs_embeds = self.wte(input_ids)
|
||||
position_embeds = self.wpe(position_ids)
|
||||
if token_type_ids is not None:
|
||||
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
|
||||
token_type_embeds = self.wte(token_type_ids)
|
||||
else:
|
||||
token_type_embeds = 0
|
||||
hidden_states = inputs_embeds + position_embeds + token_type_embeds
|
||||
presents = []
|
||||
for block, layer_past in zip(self.h, past):
|
||||
hidden_states, present = block(hidden_states, layer_past)
|
||||
presents.append(present)
|
||||
hidden_states = self.ln_f(hidden_states)
|
||||
output_shape = input_shape + (hidden_states.size(-1),)
|
||||
return hidden_states.view(*output_shape), presents
|
||||
|
||||
|
||||
class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
"""OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners").
|
||||
|
||||
Params:
|
||||
config: a GPT2Config class instance with the configuration to build a new model
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
|
||||
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
|
||||
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
with the position indices (selected in the range [0, config.n_positions - 1[.
|
||||
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
You can use it to add a third type of embedding to each input token in the sequence
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
|
||||
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
|
||||
is only computed for the labels set in [0, ..., vocab_size]
|
||||
|
||||
Outputs:
|
||||
if `lm_labels` is not `None`:
|
||||
Outputs the language modeling loss.
|
||||
else:
|
||||
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, config.vocab_size]
|
||||
(or more generally [d_1, ..., d_n, config.vocab_size] where d_1 ... d_n are the dimensions of input_ids)
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into BPE token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
|
||||
config = modeling_gpt2.GPT2Config()
|
||||
|
||||
model = modeling_gpt2.GPT2LMHeadModel(config)
|
||||
lm_logits = model(input_ids)
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
super(GPT2LMHeadModel, self).__init__(config)
|
||||
self.transformer = GPT2Model(config)
|
||||
self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def set_tied(self):
|
||||
""" Make sure we are sharing the embeddings
|
||||
"""
|
||||
self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
|
||||
|
||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):
|
||||
hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
|
||||
lm_logits = self.lm_head(hidden_states)
|
||||
if lm_labels is not None:
|
||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
|
||||
return loss
|
||||
return lm_logits, presents
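Since the forward pass returns the per-layer `presents`, they can feed the next step's `past` so attention over the prefix is not recomputed. A minimal greedy two-step sketch on a freshly initialized (untrained) model, with illustrative token ids:

```python
import torch

config = GPT2Config()
model = GPT2LMHeadModel(config)
model.eval()

input_ids = torch.LongTensor([[31, 51, 99]])        # already BPE-encoded
with torch.no_grad():
    lm_logits, presents = model(input_ids)                      # run the full prefix once
    next_token = lm_logits[:, -1].argmax(dim=-1, keepdim=True)  # (1, 1)
    lm_logits, presents = model(next_token, past=presents)      # only the new token is fed
```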
|
||||
|
||||
|
||||
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
"""OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners").
|
||||
|
||||
Params:
|
||||
config: a GPT2Config class instance with the configuration to build a new model
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
|
||||
indices selected in the range [0, config.vocab_size[
|
||||
`mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
|
||||
which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
|
||||
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
with the position indices (selected in the range [0, config.n_positions - 1[.
|
||||
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
You can use it to add a third type of embedding to each input token in the sequence
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
|
||||
with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss
|
||||
is only computed for the labels set in [0, ..., config.vocab_size]
|
||||
`multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
|
||||
with indices selected in [0, ..., num_choices].
|
||||
|
||||
Outputs:
|
||||
if `lm_labels` and `multiple_choice_labels` are not `None`:
|
||||
Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
|
||||
else: a tuple with
|
||||
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, config.vocab_size]
|
||||
`multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into BPE token ids
|
||||
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]]) # (bsz, number of choice, seq length)
|
||||
mc_token_ids = torch.LongTensor([[2, 1]])  # (bsz, number of choices)
|
||||
|
||||
config = modeling_gpt2.GPT2Config()
|
||||
|
||||
model = modeling_gpt2.GPT2DoubleHeadsModel(config)
|
||||
lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
super(GPT2DoubleHeadsModel, self).__init__(config)
|
||||
self.transformer = GPT2Model(config)
|
||||
self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
|
||||
self.multiple_choice_head = GPT2MultipleChoiceHead(config)
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def set_tied(self):
|
||||
""" Make sure we are sharing the embeddings
|
||||
"""
|
||||
self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
|
||||
|
||||
def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None):
|
||||
hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
|
||||
lm_logits = self.lm_head(hidden_states)
|
||||
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
|
||||
losses = []
|
||||
if lm_labels is not None:
|
||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||
losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
|
||||
if mc_labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
|
||||
if losses:
|
||||
return losses
|
||||
return lm_logits, mc_logits, presents
|
pytorch_pretrained_bert/modeling_openai.py (new file, 810 lines)
@@ -0,0 +1,810 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""PyTorch OpenAI GPT model."""
|
||||
|
||||
import collections
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import shutil
|
||||
import tarfile
|
||||
import tempfile
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from .file_utils import cached_path
|
||||
from .modeling import BertLayerNorm as LayerNorm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
|
||||
PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
|
||||
|
||||
CONFIG_NAME = "config.json"
|
||||
WEIGHTS_NAME = "pytorch_model.bin"
|
||||
|
||||
def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
|
||||
""" Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
|
||||
"""
|
||||
import re
|
||||
import numpy as np
|
||||
print("Loading weights...")
|
||||
names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
|
||||
shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
|
||||
offsets = np.cumsum([np.prod(shape) for shape in shapes])
|
||||
init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
|
||||
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
|
||||
init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
|
||||
|
||||
# This was used when we had a single embedding matrix for positions and tokens
|
||||
# init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
|
||||
# del init_params[1]
|
||||
init_params = [arr.squeeze() for arr in init_params]
|
||||
|
||||
try:
|
||||
assert model.tokens_embed.weight.shape == init_params[1].shape
|
||||
assert model.positions_embed.weight.shape == init_params[0].shape
|
||||
except AssertionError as e:
|
||||
e.args += (model.tokens_embed.weight.shape, init_params[1].shape)
|
||||
e.args += (model.positions_embed.weight.shape, init_params[0].shape)
|
||||
raise
|
||||
|
||||
model.tokens_embed.weight.data = torch.from_numpy(init_params[1])
|
||||
model.positions_embed.weight.data = torch.from_numpy(init_params[0])
|
||||
names.pop(0)
|
||||
# Pop position and token embedding arrays
|
||||
init_params.pop(0)
|
||||
init_params.pop(0)
|
||||
|
||||
for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]):
|
||||
name = name[6:] # skip "model/"
|
||||
assert name[-2:] == ":0"
|
||||
name = name[:-2]
|
||||
name = name.split('/')
|
||||
pointer = model
|
||||
for m_name in name:
|
||||
if re.fullmatch(r'[A-Za-z]+\d+', m_name):
|
||||
l = re.split(r'(\d+)', m_name)
|
||||
else:
|
||||
l = [m_name]
|
||||
if l[0] == 'g':
|
||||
pointer = getattr(pointer, 'weight')
|
||||
elif l[0] == 'b':
|
||||
pointer = getattr(pointer, 'bias')
|
||||
elif l[0] == 'w':
|
||||
pointer = getattr(pointer, 'weight')
|
||||
else:
|
||||
pointer = getattr(pointer, l[0])
|
||||
if len(l) >= 2:
|
||||
num = int(l[1])
|
||||
pointer = pointer[num]
|
||||
try:
|
||||
assert pointer.shape == array.shape
|
||||
except AssertionError as e:
|
||||
e.args += (pointer.shape, array.shape)
|
||||
raise
|
||||
print("Initialize PyTorch weight {}".format(name))
|
||||
pointer.data = torch.from_numpy(array)
|
||||
return model
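A hedged usage sketch for the converter above; it expects the folder layout of the original OpenAI release (parameters_names.json, params_shapes.json and the params_0.npy ... params_9.npy shards), and the path below is made up:

```python
config = OpenAIGPTConfig()
model = OpenAIGPTModel(config)
model = load_tf_weights_in_openai_gpt(model, "/path/to/openai-gpt-checkpoint")
```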
|
||||
|
||||
|
||||
def gelu(x):
|
||||
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
|
||||
|
||||
def swish(x):
|
||||
return x * torch.sigmoid(x)
|
||||
|
||||
|
||||
ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
|
||||
|
||||
|
||||
class OpenAIGPTConfig(object):
|
||||
"""Configuration class to store the configuration of a `OpenAIGPTModel`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size_or_config_json_file=40478,
|
||||
n_special=0,
|
||||
n_positions=512,
|
||||
n_ctx=512,
|
||||
n_embd=768,
|
||||
n_layer=12,
|
||||
n_head=12,
|
||||
afn="gelu",
|
||||
resid_pdrop=0.1,
|
||||
embd_pdrop=0.1,
|
||||
attn_pdrop=0.1,
|
||||
layer_norm_epsilon=1e-5,
|
||||
initializer_range=0.02,
|
||||
):
|
||||
"""Constructs OpenAIGPTConfig.
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
|
||||
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLS]', ...)
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
afn: The non-linear activation function (function or string) in the
|
||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
|
||||
resid_pdrop: The dropout probability for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
attn_pdrop: The dropout ratio for the attention
|
||||
probabilities.
|
||||
embd_pdrop: The dropout ratio for the embeddings.
|
||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
||||
initializer_range: The stddev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
"""
|
||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
|
||||
and isinstance(vocab_size_or_config_json_file, unicode)):
|
||||
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.vocab_size = vocab_size_or_config_json_file
|
||||
self.n_special = n_special
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.afn = afn
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.embd_pdrop = embd_pdrop
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
else:
|
||||
raise ValueError(
|
||||
"First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)"
|
||||
)
|
||||
|
||||
@property
|
||||
def total_tokens_embeddings(self):
|
||||
return self.vocab_size + self.n_special
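For instance (illustrative value), reserving two fine-tuning-only tokens grows the token embedding matrix by two rows:

```python
config = OpenAIGPTConfig(n_special=2)
assert config.total_tokens_embeddings == config.vocab_size + 2   # 40478 + 2 = 40480
```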
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, json_object):
|
||||
"""Constructs a `OpenAIGPTConfig` from a Python dictionary of parameters."""
|
||||
config = OpenAIGPTConfig(vocab_size_or_config_json_file=-1)
|
||||
for key, value in json_object.items():
|
||||
config.__dict__[key] = value
|
||||
return config
|
||||
|
||||
@classmethod
|
||||
def from_json_file(cls, json_file):
|
||||
"""Constructs a `OpenAIGPTConfig` from a json file of parameters."""
|
||||
with open(json_file, "r", encoding="utf-8") as reader:
|
||||
text = reader.read()
|
||||
return cls.from_dict(json.loads(text))
|
||||
|
||||
def __repr__(self):
|
||||
return str(self.to_json_string())
|
||||
|
||||
def to_dict(self):
|
||||
"""Serializes this instance to a Python dictionary."""
|
||||
output = copy.deepcopy(self.__dict__)
|
||||
return output
|
||||
|
||||
def to_json_string(self):
|
||||
"""Serializes this instance to a JSON string."""
|
||||
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
||||
|
||||
|
||||
class Conv1D(nn.Module):
|
||||
def __init__(self, nf, rf, nx):
|
||||
super(Conv1D, self).__init__()
|
||||
self.rf = rf
|
||||
self.nf = nf
|
||||
if rf == 1: # faster 1x1 conv
|
||||
w = torch.empty(nx, nf)
|
||||
nn.init.normal_(w, std=0.02)
|
||||
self.weight = Parameter(w)
|
||||
self.bias = Parameter(torch.zeros(nf))
|
||||
else: # was used to train LM
|
||||
raise NotImplementedError
|
||||
|
||||
def forward(self, x):
|
||||
if self.rf == 1:
|
||||
size_out = x.size()[:-1] + (self.nf,)
|
||||
x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
|
||||
x = x.view(*size_out)
|
||||
else:
|
||||
raise NotImplementedError
|
||||
return x
|
||||
|
||||
|
||||
class Attention(nn.Module):
|
||||
def __init__(self, nx, n_ctx, config, scale=False):
|
||||
super(Attention, self).__init__()
|
||||
n_state = nx # in Attention: n_state=768 (nx=n_embd)
|
||||
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
|
||||
assert n_state % config.n_head == 0
|
||||
self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
|
||||
self.n_head = config.n_head
|
||||
self.split_size = n_state
|
||||
self.scale = scale
|
||||
self.c_attn = Conv1D(n_state * 3, 1, nx)
|
||||
self.c_proj = Conv1D(n_state, 1, nx)
|
||||
self.attn_dropout = nn.Dropout(config.attn_pdrop)
|
||||
self.resid_dropout = nn.Dropout(config.resid_pdrop)
|
||||
|
||||
def _attn(self, q, k, v):
|
||||
w = torch.matmul(q, k)
|
||||
if self.scale:
|
||||
w = w / math.sqrt(v.size(-1))
|
||||
# w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights
|
||||
# XD: self.b may be larger than w, so we need to crop it
|
||||
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
|
||||
w = w * b + -1e9 * (1 - b)
|
||||
|
||||
w = nn.Softmax(dim=-1)(w)
|
||||
w = self.attn_dropout(w)
|
||||
return torch.matmul(w, v)
|
||||
|
||||
def merge_heads(self, x):
|
||||
x = x.permute(0, 2, 1, 3).contiguous()
|
||||
new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
|
||||
return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
|
||||
|
||||
def split_heads(self, x, k=False):
|
||||
new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
|
||||
x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
|
||||
if k:
|
||||
return x.permute(0, 2, 3, 1)
|
||||
else:
|
||||
return x.permute(0, 2, 1, 3)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.c_attn(x)
|
||||
query, key, value = x.split(self.split_size, dim=2)
|
||||
query = self.split_heads(query)
|
||||
key = self.split_heads(key, k=True)
|
||||
value = self.split_heads(value)
|
||||
a = self._attn(query, key, value)
|
||||
a = self.merge_heads(a)
|
||||
a = self.c_proj(a)
|
||||
a = self.resid_dropout(a)
|
||||
return a
|
||||
|
||||
|
||||
class MLP(nn.Module):
|
||||
def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
|
||||
super(MLP, self).__init__()
|
||||
nx = config.n_embd
|
||||
self.c_fc = Conv1D(n_state, 1, nx)
|
||||
self.c_proj = Conv1D(nx, 1, n_state)
|
||||
self.act = ACT_FNS[config.afn]
|
||||
self.dropout = nn.Dropout(config.resid_pdrop)
|
||||
|
||||
def forward(self, x):
|
||||
h = self.act(self.c_fc(x))
|
||||
h2 = self.c_proj(h)
|
||||
return self.dropout(h2)
|
||||
|
||||
|
||||
class Block(nn.Module):
|
||||
def __init__(self, n_ctx, config, scale=False):
|
||||
super(Block, self).__init__()
|
||||
nx = config.n_embd
|
||||
self.attn = Attention(nx, n_ctx, config, scale)
|
||||
self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||
self.mlp = MLP(4 * nx, config)
|
||||
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||
|
||||
def forward(self, x):
|
||||
a = self.attn(x)
|
||||
n = self.ln_1(x + a)
|
||||
m = self.mlp(n)
|
||||
h = self.ln_2(n + m)
|
||||
return h
|
||||
|
||||
|
||||
class OpenAIGPTLMHead(nn.Module):
|
||||
""" Language Model Head for the transformer """
|
||||
|
||||
def __init__(self, model_embeddings_weights, config):
|
||||
super(OpenAIGPTLMHead, self).__init__()
|
||||
self.n_embd = config.n_embd
|
||||
self.set_embeddings_weights(model_embeddings_weights)
|
||||
|
||||
def set_embeddings_weights(self, model_embeddings_weights):
|
||||
embed_shape = model_embeddings_weights.shape
|
||||
self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
|
||||
self.decoder.weight = model_embeddings_weights # Tied weights
|
||||
|
||||
def forward(self, hidden_state):
|
||||
# Truncated Language modeling logits (we remove the last token)
|
||||
# h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
|
||||
lm_logits = self.decoder(hidden_state)
|
||||
return lm_logits
|
||||
|
||||
|
||||
class OpenAIGPTMultipleChoiceHead(nn.Module):
|
||||
""" Classifier Head for the transformer """
|
||||
|
||||
def __init__(self, config):
|
||||
super(OpenAIGPTMultipleChoiceHead, self).__init__()
|
||||
self.n_embd = config.n_embd
|
||||
# self.multiple_choice_token = multiple_choice_token
|
||||
self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation
|
||||
self.linear = nn.Linear(config.n_embd, 1)
|
||||
|
||||
nn.init.normal_(self.linear.weight, std=0.02)
|
||||
nn.init.normal_(self.linear.bias, 0)
|
||||
|
||||
def forward(self, hidden_states, mc_token_ids):
|
||||
# Classification logits
|
||||
# hidden_state (bsz, num_choices, seq_length, hidden_size)
|
||||
# mc_token_ids (bsz, num_choices)
|
||||
mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
|
||||
# (bsz, num_choices, 1, hidden_size)
|
||||
multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
|
||||
# (bsz, num_choices, hidden_size)
|
||||
multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
|
||||
# (bsz, num_choices)
|
||||
return multiple_choice_logits
|
||||
|
||||
|
||||
class OpenAIGPTPreTrainedModel(nn.Module):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super(OpenAIGPTPreTrainedModel, self).__init__()
|
||||
if not isinstance(config, OpenAIGPTConfig):
|
||||
raise ValueError(
|
||||
"Parameter config in `{}(config)` should be an instance of class `OpenAIGPTConfig`. "
|
||||
"To create a model from a pretrained model use "
|
||||
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
|
||||
self.__class__.__name__, self.__class__.__name__
|
||||
)
|
||||
)
|
||||
self.config = config
|
||||
|
||||
def init_weights(self, module):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
if isinstance(module, (nn.Linear, nn.Embedding)):
|
||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||
elif isinstance(module, LayerNorm):
|
||||
module.bias.data.zero_()
|
||||
module.weight.data.fill_(1.0)
|
||||
if isinstance(module, nn.Linear) and module.bias is not None:
|
||||
module.bias.data.zero_()
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
cls, pretrained_model_name_or_path, num_special_tokens=None, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs
|
||||
):
|
||||
"""
|
||||
Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
|
||||
Download and cache the pre-trained model file if needed.
|
||||
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `openai-gpt`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `openai_gpt_config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `openai_gpt_config.json` a configuration file for the model
|
||||
. a series of NumPy files containing OpenAI TensorFlow trained weights
|
||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
||||
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of the pre-trained model weights
|
||||
*inputs, **kwargs: additional inputs for the specific OpenAI GPT model class
|
||||
(ex: num_labels for BertForSequenceClassification)
|
||||
"""
|
||||
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
|
||||
archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||
config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||
else:
|
||||
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
|
||||
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
|
||||
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
|
||||
except EnvironmentError:
|
||||
logger.error(
|
||||
"Model name '{}' was not found in model name list ({}). "
|
||||
"We assumed '{}' was a path or url but couldn't find files {} and {} "
|
||||
"at this path or url.".format(
|
||||
pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
|
||||
archive_file, config_file
|
||||
)
|
||||
)
|
||||
return None
|
||||
if resolved_archive_file == archive_file and resolved_config_file == config_file:
|
||||
logger.info("loading weights file {}".format(archive_file))
|
||||
logger.info("loading configuration file {}".format(config_file))
|
||||
else:
|
||||
logger.info("loading weights file {} from cache at {}".format(
|
||||
archive_file, resolved_archive_file))
|
||||
logger.info("loading configuration file {} from cache at {}".format(
|
||||
config_file, resolved_config_file))
|
||||
# Load config
|
||||
config = OpenAIGPTConfig.from_json_file(resolved_config_file)
|
||||
logger.info("Model config {}".format(config))
|
||||
# Instantiate model.
|
||||
model = cls(config, *inputs, **kwargs)
|
||||
if state_dict is None and not from_tf:
|
||||
state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None)
|
||||
if from_tf:
|
||||
# Directly load from a TensorFlow checkpoint (stored as NumPy array)
|
||||
return load_tf_weights_in_openai_gpt(model, resolved_archive_file)
|
||||
|
||||
old_keys = []
|
||||
new_keys = []
|
||||
for key in state_dict.keys():
|
||||
new_key = None
|
||||
if key.endswith(".g"):
|
||||
new_key = key[:-2] + ".weight"
|
||||
elif key.endswith(".b"):
|
||||
new_key = key[:-2] + ".bias"
|
||||
elif key.endswith(".w"):
|
||||
new_key = key[:-2] + ".weight"
|
||||
if new_key:
|
||||
old_keys.append(key)
|
||||
new_keys.append(new_key)
|
||||
for old_key, new_key in zip(old_keys, new_keys):
|
||||
state_dict[new_key] = state_dict.pop(old_key)
|
||||
|
||||
missing_keys = []
|
||||
unexpected_keys = []
|
||||
error_msgs = []
|
||||
# copy state_dict so _load_from_state_dict can modify it
|
||||
metadata = getattr(state_dict, "_metadata", None)
|
||||
state_dict = state_dict.copy()
|
||||
if metadata is not None:
|
||||
state_dict._metadata = metadata
|
||||
|
||||
def load(module, prefix=""):
|
||||
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
|
||||
module._load_from_state_dict(
|
||||
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
|
||||
)
|
||||
for name, child in module._modules.items():
|
||||
if child is not None:
|
||||
load(child, prefix + name + ".")
|
||||
|
||||
start_model = model
|
||||
if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()):
|
||||
start_model = model.transformer
|
||||
load(start_model, prefix="")
|
||||
|
||||
if len(missing_keys) > 0:
|
||||
logger.info(
|
||||
"Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
|
||||
)
|
||||
if len(unexpected_keys) > 0:
|
||||
logger.info(
|
||||
"Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
|
||||
)
|
||||
if len(error_msgs) > 0:
|
||||
raise RuntimeError(
|
||||
"Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
|
||||
)
|
||||
|
||||
# Add additional embeddings for special tokens if needed
|
||||
# This step also makes sure we are still sharing the output and input embeddings after loading weights
|
||||
model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special)
|
||||
return model
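A hedged end-to-end sketch of the loader together with the special-token handling: the `"openai-gpt"` shortcut is the key defined in PRETRAINED_MODEL_ARCHIVE_MAP at the top of this file, and the token count is illustrative.

```python
# Load the pre-trained weights and reserve three fine-tuning-only tokens.
model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt", num_special_tokens=3)
assert model.config.n_special == 3
```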
|
||||
|
||||
|
||||
class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
"""OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
|
||||
|
||||
OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
|
||||
Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
|
||||
Special tokens need to be trained during the fine-tuning if you use them.
|
||||
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
|
||||
|
||||
The embeddings are ordered as follows in the token embeddings matrix:
|
||||
[0, ----------------------
|
||||
... -> word embeddings
|
||||
config.vocab_size - 1, ______________________
|
||||
config.vocab_size,
|
||||
... -> special embeddings
|
||||
config.vocab_size + config.n_special - 1] ______________________
|
||||
|
||||
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
|
||||
total_tokens_embeddings = config.vocab_size + config.n_special
|
||||
You should use the associated indices to index the embeddings.
|
||||
|
||||
Params:
|
||||
config: a OpenAIGPTConfig class instance with the configuration to build a new model
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
|
||||
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
|
||||
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
with the position indices (selected in the range [0, config.n_positions - 1[.
|
||||
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
You can use it to add a third type of embedding to each input token in the sequence
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
|
||||
Outputs:
|
||||
`hidden_states`: the encoded-hidden-states at the top of the model
|
||||
as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
|
||||
(or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into BPE token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
|
||||
config = modeling_openai.OpenAIGPTConfig()
|
||||
|
||||
model = modeling_openai.OpenAIGPTModel(config)
|
||||
hidden_states = model(input_ids)
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
super(OpenAIGPTModel, self).__init__(config)
|
||||
num_tokens = config.vocab_size + config.n_special
|
||||
self.tokens_embed = nn.Embedding(num_tokens, config.n_embd)
|
||||
self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
|
||||
self.drop = nn.Dropout(config.embd_pdrop)
|
||||
block = Block(config.n_ctx, config, scale=True)
|
||||
self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
|
||||
|
||||
self.apply(self.init_weights)
|
||||
# nn.init.normal_(self.embed.weight, std=0.02)
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens):
|
||||
" Update input embeddings with new embedding matrice if needed "
|
||||
if self.config.n_special == num_special_tokens:
|
||||
return
|
||||
# Update config
|
||||
self.config.n_special = num_special_tokens
|
||||
# Build new embeddings and initialize
|
||||
old_embed = self.tokens_embed
|
||||
self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
|
||||
# Initialize all new embeddings (in particular the special tokens)
|
||||
self.init_weights(self.tokens_embed)
|
||||
# Copy word and positional embeddings from the previous weights
|
||||
self.tokens_embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :]
|
||||
self.tokens_embed.weight.data[-self.config.n_positions :, :] = old_embed.weight.data[-self.config.n_positions :, :]
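After this resize, the freshly initialized rows sit at the end of the embedding matrix, so the ids of the new special tokens are offset by the original vocabulary size (a sketch, assuming `model` is an OpenAIGPTModel or a head model wrapping one):

```python
model.set_num_special_tokens(2)               # e.g. a start token and a delimiter token
start_token_id = model.config.vocab_size      # first special embedding row
delim_token_id = model.config.vocab_size + 1  # second special embedding row
```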
|
||||
|
||||
def forward(self, input_ids, position_ids=None, token_type_ids=None):
|
||||
if position_ids is None:
|
||||
# This was used when we had a single embedding matrix for position and token embeddings
|
||||
# start = self.config.vocab_size + self.config.n_special
|
||||
# end = start + input_ids.size(-1)
|
||||
# position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
|
||||
position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
|
||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
||||
|
||||
input_shape = input_ids.size()
|
||||
input_ids = input_ids.view(-1, input_ids.size(-1))
|
||||
position_ids = position_ids.view(-1, position_ids.size(-1))
|
||||
|
||||
inputs_embeds = self.tokens_embed(input_ids)
|
||||
position_embeds = self.positions_embed(position_ids)
|
||||
if token_type_ids is not None:
|
||||
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
|
||||
token_type_embeds = self.tokens_embed(token_type_ids)
|
||||
else:
|
||||
token_type_embeds = 0
|
||||
# Add the position information to the input embeddings
|
||||
# h = e.sum(dim=2)
|
||||
hidden_states = inputs_embeds + position_embeds + token_type_embeds
|
||||
for block in self.h:
|
||||
hidden_states = block(hidden_states)
|
||||
output_shape = input_shape + (hidden_states.size(-1),)
|
||||
return hidden_states.view(*output_shape)
|
||||
|
||||
|
||||
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
||||
"""OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training").
|
||||
|
||||
OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
|
||||
Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
|
||||
Special tokens need to be trained during the fine-tuning if you use them.
|
||||
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
|
||||
|
||||
The embeddings are ordered as follows in the token embeddings matrix:
|
||||
[0, ----------------------
|
||||
... -> word embeddings
|
||||
config.vocab_size - 1, ______________________
|
||||
config.vocab_size,
|
||||
... -> special embeddings
|
||||
config.vocab_size + config.n_special - 1] ______________________
|
||||
|
||||
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
|
||||
total_tokens_embeddings = config.vocab_size + config.n_special
|
||||
You should use the associated indices to index the embeddings.
|
||||
|
||||
Params:
|
||||
config: a OpenAIGPTConfig class instance with the configuration to build a new model
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
|
||||
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
|
||||
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
with the position indices (selected in the range [0, config.n_positions - 1[.
|
||||
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
You can use it to add a third type of embedding to each input token in the sequence
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
|
||||
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
|
||||
is only computed for the labels set in [0, ..., vocab_size]
|
||||
|
||||
Outputs:
|
||||
if `lm_labels` is not `None`:
|
||||
Outputs the language modeling loss.
|
||||
else:
|
||||
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings]
|
||||
(or more generally [d_1, ..., d_n, total_tokens_embeddings] where d_1 ... d_n are the dimensions of input_ids)
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into BPE token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
|
||||
config = modeling_openai.OpenAIGPTConfig()
|
||||
|
||||
model = modeling_openai.OpenAIGPTLMHeadModel(config)
|
||||
lm_logits = model(input_ids)
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
super(OpenAIGPTLMHeadModel, self).__init__(config)
|
||||
self.transformer = OpenAIGPTModel(config)
|
||||
self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens):
|
||||
""" Update input and output embeddings with new embedding matrice
|
||||
Make sure we are sharing the embeddings
|
||||
"""
|
||||
self.transformer.set_num_special_tokens(num_special_tokens)
|
||||
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
|
||||
|
||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None):
|
||||
hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
|
||||
lm_logits = self.lm_head(hidden_states)
|
||||
if lm_labels is not None:
|
||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
|
||||
return loss
|
||||
return lm_logits
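When `lm_labels` is supplied, the forward above returns the scalar loss directly, so a bare-bones training step can look like the following sketch (tensors, learning rate and optimizer are illustrative; the labels are already shifted by one position because this version computes the loss without shifting internally):

```python
import torch

config = OpenAIGPTConfig()
model = OpenAIGPTLMHeadModel(config)
optimizer = torch.optim.Adam(model.parameters(), lr=6.25e-5)

input_ids = torch.LongTensor([[31, 51, 99]])
lm_labels = torch.LongTensor([[51, 99, -1]])   # -1 positions are ignored by the loss
loss = model(input_ids, lm_labels=lm_labels)
loss.backward()
optimizer.step()
optimizer.zero_grad()
```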
|
||||
|
||||
|
||||
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
"""OpenAI GPT model with a Language Modeling and a Multiple Choice head ("Improving Language Understanding by Generative Pre-Training").
|
||||
|
||||
OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
|
||||
Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
|
||||
Special tokens need to be trained during the fine-tuning if you use them.
|
||||
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
|
||||
|
||||
The embeddings are ordered as follows in the token embeddings matrix:
|
||||
[0, ----------------------
|
||||
... -> word embeddings
|
||||
config.vocab_size - 1, ______________________
|
||||
config.vocab_size,
|
||||
... -> special embeddings
|
||||
config.vocab_size + config.n_special - 1] ______________________
|
||||
|
||||
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
|
||||
total_tokens_embeddings = config.vocab_size + config.n_special
|
||||
You should use the associated indices to index the embeddings.
|
||||
|
||||
Params:
|
||||
config: a OpenAIGPTConfig class instance with the configuration to build a new model
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
|
||||
indices selected in the range [0, total_tokens_embeddings[
|
||||
`mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
|
||||
which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
|
||||
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
with the position indices (selected in the range [0, config.n_positions - 1[.
|
||||
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
You can use it to add a third type of embedding to each input token in the sequence
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
|
||||
with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss
|
||||
is only computed for the labels set in [0, ..., total_tokens_embeddings]
|
||||
`multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
|
||||
with indices selected in [0, ..., num_choices].
|
||||
|
||||
Outputs:
|
||||
if `lm_labels` and `multiple_choice_labels` are not `None`:
|
||||
Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
|
||||
else: a tuple with
|
||||
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings]
|
||||
`multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into BPE token ids
|
||||
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]]) # (bsz, number of choice, seq length)
|
||||
mc_token_ids = torch.LongTensor([[2, 1]])  # (bsz, number of choices)
|
||||
|
||||
config = modeling_openai.OpenAIGPTConfig()
|
||||
|
||||
model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)
|
||||
lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
|
||||
self.transformer = OpenAIGPTModel(config)
|
||||
self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
|
||||
self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config)
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens):
|
||||
""" Update input and output embeddings with new embedding matrice
|
||||
Make sure we are sharing the embeddings
|
||||
"""
|
||||
self.transformer.set_num_special_tokens(num_special_tokens)
|
||||
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
|
||||
|
||||
def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None):
|
||||
hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
|
||||
lm_logits = self.lm_head(hidden_states)
|
||||
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
|
||||
losses = []
|
||||
if lm_labels is not None:
|
||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||
losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
|
||||
if mc_labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
|
||||
if losses:
|
||||
return losses
|
||||
return lm_logits, mc_logits
|
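The docstring above explains how rows for special tokens are appended after the word embeddings. A minimal, hypothetical sketch of the resize step before fine-tuning (assuming `modeling_openai` is imported as in the example above; the token count and names are illustrative):

```python
config = modeling_openai.OpenAIGPTConfig()
model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)

# Reserve rows for e.g. '_start_', '_delimiter_' and '_classify_' tokens.
model.set_num_special_tokens(3)

# The new rows occupy ids config.vocab_size .. config.vocab_size + 2,
# so a hypothetical '_start_' token could be assigned:
start_token_id = config.vocab_size
```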
pytorch_pretrained_bert/modeling_transfo_xl.py (new file, 1381 lines)
File diff suppressed because it is too large
pytorch_pretrained_bert/modeling_transfo_xl_utilities.py (new file, 402 lines)
@ -0,0 +1,402 @
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Utilities for PyTorch Transformer XL model.
|
||||
Directly adapted from https://github.com/kimiyoung/transformer-xl.
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
# CUDA_MAJOR = int(torch.version.cuda.split('.')[0])
|
||||
# CUDA_MINOR = int(torch.version.cuda.split('.')[1])
|
||||
|
||||
class ProjectedAdaptiveLogSoftmax(nn.Module):
|
||||
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
|
||||
keep_order=False):
|
||||
super(ProjectedAdaptiveLogSoftmax, self).__init__()
|
||||
|
||||
self.n_token = n_token
|
||||
self.d_embed = d_embed
|
||||
self.d_proj = d_proj
|
||||
|
||||
self.cutoffs = cutoffs + [n_token]
|
||||
self.cutoff_ends = [0] + self.cutoffs
|
||||
self.div_val = div_val
|
||||
|
||||
self.shortlist_size = self.cutoffs[0]
|
||||
self.n_clusters = len(self.cutoffs) - 1
|
||||
self.head_size = self.shortlist_size + self.n_clusters
|
||||
|
||||
if self.n_clusters > 0:
|
||||
self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
|
||||
self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
|
||||
|
||||
self.out_layers = nn.ModuleList()
|
||||
self.out_projs = nn.ParameterList()
|
||||
|
||||
if div_val == 1:
|
||||
for i in range(len(self.cutoffs)):
|
||||
if d_proj != d_embed:
|
||||
self.out_projs.append(
|
||||
nn.Parameter(torch.Tensor(d_proj, d_embed))
|
||||
)
|
||||
else:
|
||||
self.out_projs.append(None)
|
||||
|
||||
self.out_layers.append(nn.Linear(d_embed, n_token))
|
||||
else:
|
||||
for i in range(len(self.cutoffs)):
|
||||
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
|
||||
d_emb_i = d_embed // (div_val ** i)
|
||||
|
||||
self.out_projs.append(
|
||||
nn.Parameter(torch.Tensor(d_proj, d_emb_i))
|
||||
)
|
||||
|
||||
self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx))
|
||||
|
||||
self.keep_order = keep_order
|
||||
|
||||
def _compute_logit(self, hidden, weight, bias, proj):
|
||||
if proj is None:
|
||||
logit = F.linear(hidden, weight, bias=bias)
|
||||
else:
|
||||
# if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1:
|
||||
proj_hid = F.linear(hidden, proj.t().contiguous())
|
||||
logit = F.linear(proj_hid, weight, bias=bias)
|
||||
# else:
|
||||
# logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))
|
||||
# if bias is not None:
|
||||
# logit = logit + bias
|
||||
|
||||
return logit
|
||||
|
||||
def forward(self, hidden, target=None, keep_order=False):
|
||||
'''
|
||||
Params:
|
||||
hidden :: [len*bsz x d_proj]
|
||||
target :: [len*bsz]
|
||||
Return:
|
||||
if target is None:
    out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
else:
    out :: [len*bsz] Negative log likelihood
|
||||
We could replace this implementation with the native PyTorch one
if it had an option to set a bias on all clusters.
|
||||
here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
|
||||
'''
|
||||
|
||||
if target is not None:
|
||||
target = target.view(-1)
|
||||
if hidden.size(0) != target.size(0):
|
||||
raise RuntimeError('Input and target should have the same size '
|
||||
'in the batch dimension.')
|
||||
|
||||
if self.n_clusters == 0:
|
||||
logit = self._compute_logit(hidden, self.out_layers[0].weight,
|
||||
self.out_layers[0].bias, self.out_projs[0])
|
||||
if target is not None:
|
||||
output = -F.log_softmax(logit, dim=-1) \
|
||||
.gather(1, target.unsqueeze(1)).squeeze(1)
|
||||
else:
|
||||
output = F.log_softmax(logit, dim=-1)
|
||||
else:
|
||||
# construct weights and biases
|
||||
weights, biases = [], []
|
||||
for i in range(len(self.cutoffs)):
|
||||
if self.div_val == 1:
|
||||
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
|
||||
weight_i = self.out_layers[0].weight[l_idx:r_idx]
|
||||
bias_i = self.out_layers[0].bias[l_idx:r_idx]
|
||||
else:
|
||||
weight_i = self.out_layers[i].weight
|
||||
bias_i = self.out_layers[i].bias
|
||||
|
||||
if i == 0:
|
||||
weight_i = torch.cat(
|
||||
[weight_i, self.cluster_weight], dim=0)
|
||||
bias_i = torch.cat(
|
||||
[bias_i, self.cluster_bias], dim=0)
|
||||
|
||||
weights.append(weight_i)
|
||||
biases.append(bias_i)
|
||||
|
||||
head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
|
||||
|
||||
head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
|
||||
head_logprob = F.log_softmax(head_logit, dim=1)
|
||||
|
||||
if target is None:
|
||||
out = hidden.new_empty((head_logit.size(0), self.n_token))
|
||||
else:
|
||||
out = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device)
|
||||
|
||||
offset = 0
|
||||
cutoff_values = [0] + self.cutoffs
|
||||
for i in range(len(cutoff_values) - 1):
|
||||
l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]
|
||||
|
||||
if target is not None:
|
||||
mask_i = (target >= l_idx) & (target < r_idx)
|
||||
indices_i = mask_i.nonzero().squeeze()
|
||||
|
||||
if indices_i.numel() == 0:
|
||||
continue
|
||||
|
||||
target_i = target.index_select(0, indices_i) - l_idx
|
||||
head_logprob_i = head_logprob.index_select(0, indices_i)
|
||||
hidden_i = hidden.index_select(0, indices_i)
|
||||
else:
|
||||
hidden_i = hidden
|
||||
|
||||
if i == 0:
|
||||
if target is not None:
|
||||
logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)
|
||||
else:
|
||||
out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
|
||||
else:
|
||||
weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
|
||||
|
||||
tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
|
||||
tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
|
||||
cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster
|
||||
if target is not None:
|
||||
logprob_i = head_logprob_i[:, cluster_prob_idx] \
|
||||
+ tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)
|
||||
else:
|
||||
logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
|
||||
out[:, l_idx:r_idx] = logprob_i
|
||||
|
||||
if target is not None:
|
||||
if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
|
||||
out.index_copy_(0, indices_i, -logprob_i)
|
||||
else:
|
||||
out[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
|
||||
offset += logprob_i.size(0)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def log_prob(self, hidden):
|
||||
r""" Computes log probabilities for all :math:`n\_classes`
|
||||
From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
|
||||
Args:
|
||||
hidden (Tensor): a minibatch of examples
|
||||
Returns:
|
||||
log-probabilities for each class :math:`c`
|
||||
in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a
|
||||
parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor.
|
||||
Shape:
|
||||
- Input: :math:`(N, in\_features)`
|
||||
- Output: :math:`(N, n\_classes)`
|
||||
"""
|
||||
if self.n_clusters == 0:
|
||||
logit = self._compute_logit(hidden, self.out_layers[0].weight,
|
||||
self.out_layers[0].bias, self.out_projs[0])
|
||||
return F.log_softmax(logit, dim=-1)
|
||||
else:
|
||||
# construct weights and biases
|
||||
weights, biases = [], []
|
||||
for i in range(len(self.cutoffs)):
|
||||
if self.div_val == 1:
|
||||
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
|
||||
weight_i = self.out_layers[0].weight[l_idx:r_idx]
|
||||
bias_i = self.out_layers[0].bias[l_idx:r_idx]
|
||||
else:
|
||||
weight_i = self.out_layers[i].weight
|
||||
bias_i = self.out_layers[i].bias
|
||||
|
||||
if i == 0:
|
||||
weight_i = torch.cat(
|
||||
[weight_i, self.cluster_weight], dim=0)
|
||||
bias_i = torch.cat(
|
||||
[bias_i, self.cluster_bias], dim=0)
|
||||
|
||||
weights.append(weight_i)
|
||||
biases.append(bias_i)
|
||||
|
||||
head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
|
||||
head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
|
||||
|
||||
out = hidden.new_empty((head_logit.size(0), self.n_token))
|
||||
head_logprob = F.log_softmax(head_logit, dim=1)
|
||||
|
||||
cutoff_values = [0] + self.cutoffs
|
||||
for i in range(len(cutoff_values) - 1):
|
||||
start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]
|
||||
|
||||
if i == 0:
|
||||
out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
|
||||
else:
|
||||
weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
|
||||
|
||||
tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
|
||||
tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
|
||||
|
||||
logprob_i = head_logprob[:, self.cutoffs[0] + i - 1, None] + tail_logprob_i
out[:, start_idx:stop_idx] = logprob_i
|
||||
|
||||
return out
|
||||
|
||||
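As a quick orientation, here is a minimal usage sketch for the adaptive softmax above (shapes, vocabulary size and cutoffs are arbitrary illustration values, not taken from the file):

```python
import torch

crit = ProjectedAdaptiveLogSoftmax(n_token=10000, d_embed=128, d_proj=128,
                                   cutoffs=[1000, 5000], div_val=1)

hidden = torch.randn(8, 128)            # [len*bsz x d_proj]
target = torch.randint(0, 10000, (8,))  # [len*bsz]

nll = crit(hidden, target)  # per-token negative log likelihood, shape [8]
logprobs = crit(hidden)     # log probabilities over the vocabulary, shape [8, 10000]
```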
|
||||
class LogUniformSampler(object):
|
||||
def __init__(self, range_max, n_sample):
|
||||
"""
|
||||
Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
|
||||
`P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
|
||||
|
||||
expected count can be approximated by 1 - (1 - p)^n
|
||||
and we use a numerically stable version -expm1(num_tries * log1p(-p))
|
||||
|
||||
Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run
|
||||
"""
|
||||
with torch.no_grad():
|
||||
self.range_max = range_max
|
||||
log_indices = torch.arange(1., range_max+2., 1.).log_()
|
||||
self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
|
||||
# print('P', self.dist.numpy().tolist()[-30:])
|
||||
|
||||
self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
|
||||
|
||||
self.n_sample = n_sample
|
||||
|
||||
def sample(self, labels):
|
||||
"""
|
||||
labels: [b1, b2]
|
||||
Return
|
||||
true_log_probs: [b1, b2]
|
||||
samp_log_probs: [n_sample]
|
||||
neg_samples: [n_sample]
|
||||
"""
|
||||
|
||||
# neg_samples = torch.empty(0).long()
|
||||
n_sample = self.n_sample
|
||||
n_tries = 2 * n_sample
|
||||
|
||||
with torch.no_grad():
|
||||
neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique()
|
||||
device = labels.device
|
||||
neg_samples = neg_samples.to(device)
|
||||
true_log_probs = self.log_q[labels].to(device)
|
||||
samp_log_probs = self.log_q[neg_samples].to(device)
|
||||
return true_log_probs, samp_log_probs, neg_samples
|
||||
|
||||
def sample_logits(embedding, bias, labels, inputs, sampler):
|
||||
"""
|
||||
embedding: an nn.Embedding layer
|
||||
bias: [n_vocab]
|
||||
labels: [b1, b2]
|
||||
inputs: [b1, b2, n_emb]
|
||||
sampler: you may use a LogUniformSampler
|
||||
Return
|
||||
logits: [b1, b2, 1 + n_sample]
|
||||
"""
|
||||
true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels)
|
||||
n_sample = neg_samples.size(0)
|
||||
b1, b2 = labels.size(0), labels.size(1)
|
||||
all_ids = torch.cat([labels.view(-1), neg_samples])
|
||||
all_w = embedding(all_ids)
|
||||
true_w = all_w[: -n_sample].view(b1, b2, -1)
|
||||
sample_w = all_w[- n_sample:].view(n_sample, -1)
|
||||
|
||||
all_b = bias[all_ids]
|
||||
true_b = all_b[: -n_sample].view(b1, b2)
|
||||
sample_b = all_b[- n_sample:]
|
||||
|
||||
hit = (labels[:, :, None] == neg_samples).detach()
|
||||
|
||||
true_logits = torch.einsum('ijk,ijk->ij',
|
||||
[true_w, inputs]) + true_b - true_log_probs
|
||||
sample_logits = torch.einsum('lk,ijk->ijl',
|
||||
[sample_w, inputs]) + sample_b - samp_log_probs
|
||||
sample_logits.masked_fill_(hit, -1e30)
|
||||
logits = torch.cat([true_logits[:, :, None], sample_logits], -1)
|
||||
|
||||
return logits
|
||||
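Because `sample_logits` places the true class in column 0 of the returned logits, a sampled-softmax training loss reduces to cross entropy against an all-zeros target. A hedged sketch (the helper name is illustrative, not part of the file):

```python
import torch
import torch.nn.functional as F

def sampled_softmax_loss(logits):
    # logits: [b1, b2, 1 + n_sample]; index 0 holds the true class
    b1, b2 = logits.size(0), logits.size(1)
    target = torch.zeros(b1 * b2, dtype=torch.long, device=logits.device)
    return F.cross_entropy(logits.view(b1 * b2, -1), target)
```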
|
||||
|
||||
# class LogUniformSampler(object):
|
||||
# def __init__(self, range_max, unique=False):
|
||||
# """
|
||||
# Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
|
||||
# `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
|
||||
# """
|
||||
# self.range_max = range_max
|
||||
# log_indices = torch.arange(1., range_max+2., 1.).log_()
|
||||
# self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
|
||||
|
||||
# self.unique = unique
|
||||
|
||||
# if self.unique:
|
||||
# self.exclude_mask = torch.ByteTensor(range_max).fill_(0)
|
||||
|
||||
# def sample(self, n_sample, labels):
|
||||
# pos_sample, new_labels = labels.unique(return_inverse=True)
|
||||
# n_pos_sample = pos_sample.size(0)
|
||||
# n_neg_sample = n_sample - n_pos_sample
|
||||
|
||||
# if self.unique:
|
||||
# self.exclude_mask.index_fill_(0, pos_sample, 1)
|
||||
# sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0)
|
||||
# self.exclude_mask.index_fill_(0, pos_sample, 0)
|
||||
# else:
|
||||
# sample_dist = self.dist
|
||||
|
||||
# neg_sample = torch.multinomial(sample_dist, n_neg_sample)
|
||||
|
||||
# sample = torch.cat([pos_sample, neg_sample])
|
||||
# sample_prob = self.dist[sample]
|
||||
|
||||
# return new_labels, sample, sample_prob
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
S, B = 3, 4
|
||||
n_vocab = 10000
|
||||
n_sample = 5
|
||||
H = 32
|
||||
|
||||
labels = torch.LongTensor(S, B).random_(0, n_vocab)
|
||||
|
||||
# sampler = LogUniformSampler(n_vocab, unique=False)
|
||||
# new_labels, sample, sample_prob = sampler.sample(n_sample, labels)
|
||||
|
||||
sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True)
|
||||
# true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels)
|
||||
|
||||
# print('true_probs', true_probs.numpy().tolist())
|
||||
# print('samp_probs', samp_probs.numpy().tolist())
|
||||
# print('neg_samples', neg_samples.numpy().tolist())
|
||||
|
||||
# print('sum', torch.sum(sampler.dist).item())
|
||||
|
||||
# assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item()
|
||||
|
||||
embedding = nn.Embedding(n_vocab, H)
|
||||
bias = torch.zeros(n_vocab)
|
||||
inputs = torch.Tensor(S, B, H).normal_()
|
||||
|
||||
logits = sample_logits(embedding, bias, labels, inputs, sampler)
print('logits', logits.detach().numpy().tolist())
print('logits shape', logits.size())  # [S, B, 1 + number of unique negative samples]
|
||||
|
@ -17,6 +17,7 @@
|
||||
import math
|
||||
import torch
|
||||
from torch.optim import Optimizer
|
||||
from torch.optim.optimizer import required
|
||||
from torch.nn.utils import clip_grad_norm_
|
||||
|
||||
def warmup_cosine(x, warmup=0.002):
|
||||
@ -52,13 +53,13 @@ class BertAdam(Optimizer):
|
||||
b1: Adams b1. Default: 0.9
|
||||
b2: Adams b2. Default: 0.999
|
||||
e: Adams epsilon. Default: 1e-6
|
||||
weight_decay_rate: Weight decay. Default: 0.01
|
||||
weight_decay: Weight decay. Default: 0.01
|
||||
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
|
||||
"""
|
||||
def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear',
|
||||
b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01,
|
||||
def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
|
||||
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
|
||||
max_grad_norm=1.0):
|
||||
if not lr >= 0.0:
|
||||
if lr is not required and lr < 0.0:
|
||||
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
|
||||
if schedule not in SCHEDULES:
|
||||
raise ValueError("Invalid schedule parameter: {}".format(schedule))
|
||||
@ -71,7 +72,7 @@ class BertAdam(Optimizer):
|
||||
if not e >= 0.0:
|
||||
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
|
||||
defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
|
||||
b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate,
|
||||
b1=b1, b2=b2, e=e, weight_decay=weight_decay,
|
||||
max_grad_norm=max_grad_norm)
|
||||
super(BertAdam, self).__init__(params, defaults)
|
||||
|
||||
@ -139,8 +140,8 @@ class BertAdam(Optimizer):
|
||||
# Instead we want to decay the weights in a manner that doesn't interact
|
||||
# with the m/v parameters. This is equivalent to adding the square
|
||||
# of the weights to the loss with plain (non-momentum) SGD.
|
||||
if group['weight_decay_rate'] > 0.0:
|
||||
update += group['weight_decay_rate'] * p.data
|
||||
if group['weight_decay'] > 0.0:
|
||||
update += group['weight_decay'] * p.data
|
||||
|
||||
if group['t_total'] != -1:
|
||||
schedule_fct = SCHEDULES[group['schedule']]
|
||||
|
pytorch_pretrained_bert/optimization_openai.py (new file, 140 lines)
@ -0,0 +1,140 @
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""PyTorch optimization for OpenAI GPT model."""
|
||||
|
||||
import math
|
||||
import torch
|
||||
from torch.optim import Optimizer
|
||||
from torch.optim.optimizer import required
|
||||
from torch.nn.utils import clip_grad_norm_
|
||||
|
||||
def warmup_cosine(x, warmup=0.002):
|
||||
s = 1 if x <= warmup else 0
|
||||
return s*(x/warmup) + (1-s)*(0.5 * (1 + math.cos(math.pi * x)))
|
||||
|
||||
def warmup_constant(x, warmup=0.002):
|
||||
s = 1 if x <= warmup else 0
|
||||
return s*(x/warmup) + (1-s)*1
|
||||
|
||||
def warmup_linear(x, warmup=0.002):
|
||||
s = 1 if x <= warmup else 0
|
||||
return (s*(x/warmup) + (1-s))*(1-x)
|
||||
|
||||
SCHEDULES = {
|
||||
'warmup_cosine':warmup_cosine,
|
||||
'warmup_constant':warmup_constant,
|
||||
'warmup_linear':warmup_linear,
|
||||
}
|
||||
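A quick sanity check of the schedules above (plain function calls; the commented values follow from the formulas):

```python
print(warmup_linear(0.001, warmup=0.002))   # 0.4995: still warming up, (x/warmup) * (1 - x)
print(warmup_linear(0.01, warmup=0.002))    # 0.99: past warm-up, decays linearly with progress
print(warmup_constant(0.01, warmup=0.002))  # 1.0: constant after warm-up
```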
|
||||
|
||||
class OpenAIAdam(Optimizer):
|
||||
"""Implements Open AI version of Adam algorithm with weight decay fix.
|
||||
"""
|
||||
def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1,
|
||||
b1=0.9, b2=0.999, e=1e-8, weight_decay=0,
|
||||
vector_l2=False, max_grad_norm=-1, **kwargs):
|
||||
if lr is not required and lr < 0.0:
|
||||
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
|
||||
if schedule not in SCHEDULES:
|
||||
raise ValueError("Invalid schedule parameter: {}".format(schedule))
|
||||
if not 0.0 <= warmup < 1.0 and not warmup == -1:
|
||||
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
|
||||
if not 0.0 <= b1 < 1.0:
|
||||
raise ValueError("Invalid b1 parameter: {}".format(b1))
|
||||
if not 0.0 <= b2 < 1.0:
|
||||
raise ValueError("Invalid b2 parameter: {}".format(b2))
|
||||
if not e >= 0.0:
|
||||
raise ValueError("Invalid epsilon value: {}".format(e))
|
||||
defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
|
||||
b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
|
||||
max_grad_norm=max_grad_norm)
|
||||
super(OpenAIAdam, self).__init__(params, defaults)
|
||||
|
||||
def get_lr(self):
|
||||
lr = []
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
state = self.state[p]
|
||||
if len(state) == 0:
|
||||
return [0]
|
||||
if group['t_total'] != -1:
|
||||
schedule_fct = SCHEDULES[group['schedule']]
|
||||
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
|
||||
else:
|
||||
lr_scheduled = group['lr']
|
||||
lr.append(lr_scheduled)
|
||||
return lr
|
||||
|
||||
def step(self, closure=None):
|
||||
"""Performs a single optimization step.
|
||||
|
||||
Arguments:
|
||||
closure (callable, optional): A closure that reevaluates the model
|
||||
and returns the loss.
|
||||
"""
|
||||
loss = None
|
||||
if closure is not None:
|
||||
loss = closure()
|
||||
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
if p.grad is None:
|
||||
continue
|
||||
grad = p.grad.data
|
||||
if grad.is_sparse:
|
||||
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
|
||||
|
||||
state = self.state[p]
|
||||
|
||||
# State initialization
|
||||
if len(state) == 0:
|
||||
state['step'] = 0
|
||||
# Exponential moving average of gradient values
|
||||
state['exp_avg'] = torch.zeros_like(p.data)
|
||||
# Exponential moving average of squared gradient values
|
||||
state['exp_avg_sq'] = torch.zeros_like(p.data)
|
||||
|
||||
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
|
||||
beta1, beta2 = group['b1'], group['b2']
|
||||
|
||||
state['step'] += 1
|
||||
|
||||
# Add grad clipping
|
||||
if group['max_grad_norm'] > 0:
|
||||
clip_grad_norm_(p, group['max_grad_norm'])
|
||||
|
||||
# Decay the first and second moment running average coefficient
|
||||
exp_avg.mul_(beta1).add_(1 - beta1, grad)
|
||||
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
|
||||
denom = exp_avg_sq.sqrt().add_(group['e'])
|
||||
|
||||
bias_correction1 = 1 - beta1 ** state['step']
|
||||
bias_correction2 = 1 - beta2 ** state['step']
|
||||
|
||||
if group['t_total'] != -1:
|
||||
schedule_fct = SCHEDULES[group['schedule']]
|
||||
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
|
||||
else:
|
||||
lr_scheduled = group['lr']
|
||||
|
||||
step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
|
||||
|
||||
p.data.addcdiv_(-step_size, exp_avg, denom)
|
||||
|
||||
# Add weight decay at the end (fixed version)
|
||||
if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0:
|
||||
p.data.add_(-lr_scheduled * group['weight_decay'], p.data)
|
||||
|
||||
return loss
|
@ -14,41 +14,37 @@
|
||||
# limitations under the License.
|
||||
"""Tokenization classes."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import collections
|
||||
import unicodedata
|
||||
import os
|
||||
import logging
|
||||
import os
|
||||
import unicodedata
|
||||
from io import open
|
||||
|
||||
from .file_utils import cached_path
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PRETRAINED_VOCAB_ARCHIVE_MAP = {
|
||||
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
|
||||
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
|
||||
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
|
||||
'bert-base-multilingual': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-vocab.txt",
|
||||
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
|
||||
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
|
||||
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
|
||||
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
|
||||
}
|
||||
|
||||
def printable_text(text):
|
||||
"""Returns text encoded in a way suitable for print or `tf.logging`."""
|
||||
|
||||
# These functions want `str` for both Python2 and Python3, but in one case
|
||||
# it's a Unicode string and in the other it's a byte string.
|
||||
if isinstance(text, str):
|
||||
return text
|
||||
elif isinstance(text, bytes):
|
||||
return text.decode("utf-8", "ignore")
|
||||
else:
|
||||
raise ValueError("Unsupported string type: %s" % (type(text)))
|
||||
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
|
||||
'bert-base-uncased': 512,
|
||||
'bert-large-uncased': 512,
|
||||
'bert-base-cased': 512,
|
||||
'bert-large-cased': 512,
|
||||
'bert-base-multilingual-uncased': 512,
|
||||
'bert-base-multilingual-cased': 512,
|
||||
'bert-base-chinese': 512,
|
||||
}
|
||||
VOCAB_NAME = 'vocab.txt'
|
||||
|
||||
|
||||
def load_vocab(vocab_file):
|
||||
@ -77,7 +73,9 @@ def whitespace_tokenize(text):
|
||||
|
||||
class BertTokenizer(object):
|
||||
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""
|
||||
def __init__(self, vocab_file, do_lower_case=True):
|
||||
|
||||
def __init__(self, vocab_file, do_lower_case=True, max_len=None,
|
||||
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
|
||||
@ -85,8 +83,10 @@ class BertTokenizer(object):
|
||||
self.vocab = load_vocab(vocab_file)
|
||||
self.ids_to_tokens = collections.OrderedDict(
|
||||
[(ids, tok) for tok, ids in self.vocab.items()])
|
||||
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
|
||||
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
|
||||
never_split=never_split)
|
||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
|
||||
self.max_len = max_len if max_len is not None else int(1e12)
|
||||
|
||||
def tokenize(self, text):
|
||||
split_tokens = []
|
||||
@ -100,6 +100,12 @@ class BertTokenizer(object):
|
||||
ids = []
|
||||
for token in tokens:
|
||||
ids.append(self.vocab[token])
|
||||
if len(ids) > self.max_len:
|
||||
raise ValueError(
|
||||
"Token indices sequence length is longer than the specified maximum "
|
||||
" sequence length for this BERT model ({} > {}). Running this"
|
||||
" sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
|
||||
)
|
||||
return ids
|
||||
|
||||
def convert_ids_to_tokens(self, ids):
|
||||
@ -110,47 +116,57 @@ class BertTokenizer(object):
|
||||
return tokens
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name, do_lower_case=True):
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
|
||||
"""
|
||||
Instantiate a BertTokenizer from a pre-trained vocabulary file.
|
||||
Download and cache the pre-trained model file if needed.
|
||||
"""
|
||||
if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
||||
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
|
||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
||||
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||
else:
|
||||
vocab_file = pretrained_model_name
|
||||
vocab_file = pretrained_model_name_or_path
|
||||
if os.path.isdir(vocab_file):
|
||||
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_vocab_file = cached_path(vocab_file)
|
||||
if resolved_vocab_file == vocab_file:
|
||||
logger.info("loading vocabulary file {}".format(vocab_file))
|
||||
else:
|
||||
logger.info("loading vocabulary file {} from cache at {}".format(
|
||||
vocab_file, resolved_vocab_file))
|
||||
# Instantiate tokenizer.
|
||||
tokenizer = cls(resolved_vocab_file, do_lower_case)
|
||||
except FileNotFoundError:
|
||||
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
|
||||
except EnvironmentError:
|
||||
logger.error(
|
||||
"Model name '{}' was not found in model name list ({}). "
|
||||
"We assumed '{}' was a path or url but couldn't find any file "
|
||||
"associated to this path or url.".format(
|
||||
pretrained_model_name,
|
||||
pretrained_model_name_or_path,
|
||||
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
|
||||
pretrained_model_name))
|
||||
tokenizer = None
|
||||
vocab_file))
|
||||
return None
|
||||
if resolved_vocab_file == vocab_file:
|
||||
logger.info("loading vocabulary file {}".format(vocab_file))
|
||||
else:
|
||||
logger.info("loading vocabulary file {} from cache at {}".format(
|
||||
vocab_file, resolved_vocab_file))
|
||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
|
||||
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
|
||||
# than the number of positional embeddings
|
||||
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
|
||||
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
|
||||
# Instantiate tokenizer.
|
||||
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
|
||||
return tokenizer
|
||||
|
||||
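One practical consequence of the `max_len` handling added above: tokenizers loaded by model name inherit the positional-embedding limit of that model. A small illustration (assuming the vocabulary can be downloaded):

```python
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer.max_len)  # 512, capped via PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP
```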
|
||||
class BasicTokenizer(object):
|
||||
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
|
||||
|
||||
def __init__(self, do_lower_case=True):
|
||||
def __init__(self,
|
||||
do_lower_case=True,
|
||||
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
|
||||
"""Constructs a BasicTokenizer.
|
||||
|
||||
Args:
|
||||
do_lower_case: Whether to lower case the input.
|
||||
"""
|
||||
self.do_lower_case = do_lower_case
|
||||
self.never_split = never_split
|
||||
|
||||
def tokenize(self, text):
|
||||
"""Tokenizes a piece of text."""
|
||||
@ -165,7 +181,7 @@ class BasicTokenizer(object):
|
||||
orig_tokens = whitespace_tokenize(text)
|
||||
split_tokens = []
|
||||
for token in orig_tokens:
|
||||
if self.do_lower_case:
|
||||
if self.do_lower_case and token not in self.never_split:
|
||||
token = token.lower()
|
||||
token = self._run_strip_accents(token)
|
||||
split_tokens.extend(self._run_split_on_punc(token))
|
||||
@ -186,6 +202,8 @@ class BasicTokenizer(object):
|
||||
|
||||
def _run_split_on_punc(self, text):
|
||||
"""Splits punctuation on a piece of text."""
|
||||
if text in self.never_split:
|
||||
return [text]
|
||||
chars = list(text)
|
||||
i = 0
|
||||
start_new_word = True
|
||||
@ -203,7 +221,7 @@ class BasicTokenizer(object):
|
||||
i += 1
|
||||
|
||||
return ["".join(x) for x in output]
|
||||
|
||||
|
||||
def _tokenize_chinese_chars(self, text):
|
||||
"""Adds whitespace around any CJK character."""
|
||||
output = []
|
||||
@ -228,17 +246,17 @@ class BasicTokenizer(object):
|
||||
# space-separated words, so they are not treated specially and handled
|
||||
# like the all of the other languages.
|
||||
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
|
||||
(cp >= 0x3400 and cp <= 0x4DBF) or #
|
||||
(cp >= 0x20000 and cp <= 0x2A6DF) or #
|
||||
(cp >= 0x2A700 and cp <= 0x2B73F) or #
|
||||
(cp >= 0x2B740 and cp <= 0x2B81F) or #
|
||||
(cp >= 0x2B820 and cp <= 0x2CEAF) or
|
||||
(cp >= 0xF900 and cp <= 0xFAFF) or #
|
||||
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
|
||||
(cp >= 0x3400 and cp <= 0x4DBF) or #
|
||||
(cp >= 0x20000 and cp <= 0x2A6DF) or #
|
||||
(cp >= 0x2A700 and cp <= 0x2B73F) or #
|
||||
(cp >= 0x2B740 and cp <= 0x2B81F) or #
|
||||
(cp >= 0x2B820 and cp <= 0x2CEAF) or
|
||||
(cp >= 0xF900 and cp <= 0xFAFF) or #
|
||||
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
|
||||
return True
|
||||
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _clean_text(self, text):
|
||||
"""Performs invalid character removal and whitespace cleanup on text."""
|
||||
output = []
|
||||
@ -273,7 +291,7 @@ class WordpieceTokenizer(object):
|
||||
|
||||
Args:
|
||||
text: A single token or whitespace separated tokens. This should have
|
||||
already been passed through `BasicTokenizer.
|
||||
already been passed through `BasicTokenizer`.
|
||||
|
||||
Returns:
|
||||
A list of wordpiece tokens.
|
||||
|
pytorch_pretrained_bert/tokenization_gpt2.py (new file, 206 lines)
@ -0,0 +1,206 @
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for OpenAI GPT."""
|
||||
from __future__ import (absolute_import, division, print_function,
|
||||
unicode_literals)
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import regex as re
|
||||
from io import open
|
||||
|
||||
try:
|
||||
from functools import lru_cache
|
||||
except ImportError:
|
||||
# Just a dummy decorator to get the checks to run on python2
|
||||
# because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
|
||||
def lru_cache():
|
||||
return lambda func: func
|
||||
|
||||
from .file_utils import cached_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PRETRAINED_VOCAB_ARCHIVE_MAP = {
|
||||
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
|
||||
}
|
||||
PRETRAINED_MERGES_ARCHIVE_MAP = {
|
||||
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
|
||||
}
|
||||
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
|
||||
'gpt2': 1024,
|
||||
}
|
||||
VOCAB_NAME = 'vocab.json'
|
||||
MERGES_NAME = 'merges.txt'
|
||||
|
||||
@lru_cache()
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings
that avoid mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
cs = [chr(n) for n in cs]
|
||||
return dict(zip(bs, cs))
|
||||
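An illustrative round trip over the mapping returned by `bytes_to_unicode` (editorial example, not part of the file):

```python
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

raw = "héllo".encode("utf-8")
mapped = ''.join(byte_encoder[b] for b in raw)  # every byte maps to a printable character
assert bytes(byte_decoder[c] for c in mapped) == raw  # and the mapping is reversible
```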
|
||||
def get_pairs(word):
|
||||
"""Return set of symbol pairs in a word.
|
||||
|
||||
Word is represented as tuple of symbols (symbols being variable-length strings).
|
||||
"""
|
||||
pairs = set()
|
||||
prev_char = word[0]
|
||||
for char in word[1:]:
|
||||
pairs.add((prev_char, char))
|
||||
prev_char = char
|
||||
return pairs
|
||||
|
||||
class GPT2Tokenizer(object):
|
||||
"""
|
||||
GPT-2 BPE tokenizer. Peculiarities:
|
||||
- Byte-level BPE
|
||||
"""
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
|
||||
"""
|
||||
Instantiate a GPT2Tokenizer from a pre-trained vocabulary file.
|
||||
Download and cache the pre-trained model file if needed.
|
||||
"""
|
||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
||||
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||
else:
|
||||
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
|
||||
merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
|
||||
resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
|
||||
except EnvironmentError:
|
||||
logger.error(
|
||||
"Model name '{}' was not found in model name list ({}). "
|
||||
"We assumed '{}' was a path or url but couldn't find files {} and {} "
|
||||
"at this path or url.".format(
|
||||
pretrained_model_name_or_path,
|
||||
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
|
||||
pretrained_model_name_or_path,
|
||||
vocab_file, merges_file))
|
||||
return None
|
||||
if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
|
||||
logger.info("loading vocabulary file {}".format(vocab_file))
|
||||
logger.info("loading merges file {}".format(merges_file))
|
||||
else:
|
||||
logger.info("loading vocabulary file {} from cache at {}".format(
|
||||
vocab_file, resolved_vocab_file))
|
||||
logger.info("loading merges file {} from cache at {}".format(
|
||||
merges_file, resolved_merges_file))
|
||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
|
||||
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
|
||||
# than the number of positional embeddings
|
||||
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
|
||||
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
|
||||
# Instantiate tokenizer.
|
||||
tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
|
||||
return tokenizer
|
||||
|
||||
def __init__(self, vocab_file, merges_file, errors='replace', max_len=None):
|
||||
self.max_len = max_len if max_len is not None else int(1e12)
|
||||
self.encoder = json.load(open(vocab_file))
|
||||
self.decoder = {v:k for k,v in self.encoder.items()}
|
||||
self.errors = errors # how to handle errors in decoding
|
||||
self.byte_encoder = bytes_to_unicode()
|
||||
self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
|
||||
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
||||
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
|
||||
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
||||
self.cache = {}
|
||||
|
||||
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
|
||||
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
|
||||
|
||||
def __len__(self):
|
||||
return len(self.encoder)
|
||||
|
||||
def bpe(self, token):
|
||||
if token in self.cache:
|
||||
return self.cache[token]
|
||||
word = tuple(token)
|
||||
pairs = get_pairs(word)
|
||||
|
||||
if not pairs:
|
||||
return token
|
||||
|
||||
while True:
|
||||
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
|
||||
if bigram not in self.bpe_ranks:
|
||||
break
|
||||
first, second = bigram
|
||||
new_word = []
|
||||
i = 0
|
||||
while i < len(word):
|
||||
try:
|
||||
j = word.index(first, i)
|
||||
new_word.extend(word[i:j])
|
||||
i = j
|
||||
except:
|
||||
new_word.extend(word[i:])
|
||||
break
|
||||
|
||||
if word[i] == first and i < len(word)-1 and word[i+1] == second:
|
||||
new_word.append(first+second)
|
||||
i += 2
|
||||
else:
|
||||
new_word.append(word[i])
|
||||
i += 1
|
||||
new_word = tuple(new_word)
|
||||
word = new_word
|
||||
if len(word) == 1:
|
||||
break
|
||||
else:
|
||||
pairs = get_pairs(word)
|
||||
word = ' '.join(word)
|
||||
self.cache[token] = word
|
||||
return word
|
||||
|
||||
def encode(self, text):
|
||||
bpe_tokens = []
|
||||
for token in re.findall(self.pat, text):
|
||||
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
|
||||
bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
|
||||
if len(bpe_tokens) > self.max_len:
|
||||
raise ValueError(
|
||||
"Token indices sequence length is longer than the specified maximum "
|
||||
" sequence length for this OpenAI GPT-2 model ({} > {}). Running this"
|
||||
" sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len)
|
||||
)
|
||||
return bpe_tokens
|
||||
|
||||
def decode(self, tokens):
|
||||
text = ''.join([self.decoder[token] for token in tokens])
|
||||
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
|
||||
return text
|
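A short usage sketch for the tokenizer above (assuming the 'gpt2' vocabulary and merges files can be downloaded via `from_pretrained`):

```python
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
ids = tokenizer.encode("Hello world")
print(ids)                    # a short list of byte-level BPE token ids
print(tokenizer.decode(ids))  # "Hello world"
```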
pytorch_pretrained_bert/tokenization_openai.py (new file, 263 lines)
@ -0,0 +1,263 @
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for OpenAI GPT."""
|
||||
from __future__ import (absolute_import, division, print_function,
|
||||
unicode_literals)
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from .file_utils import cached_path
|
||||
from .tokenization import BasicTokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PRETRAINED_VOCAB_ARCHIVE_MAP = {
|
||||
'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
|
||||
}
|
||||
PRETRAINED_MERGES_ARCHIVE_MAP = {
|
||||
'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
|
||||
}
|
||||
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
|
||||
'openai-gpt': 512,
|
||||
}
|
||||
VOCAB_NAME = 'vocab.json'
|
||||
MERGES_NAME = 'merges.txt'
|
||||
|
||||
def get_pairs(word):
|
||||
"""
|
||||
Return set of symbol pairs in a word.
|
||||
word is represented as tuple of symbols (symbols being variable-length strings)
|
||||
"""
|
||||
pairs = set()
|
||||
prev_char = word[0]
|
||||
for char in word[1:]:
|
||||
pairs.add((prev_char, char))
|
||||
prev_char = char
|
||||
return pairs
|
||||
|
||||
def text_standardize(text):
|
||||
"""
|
||||
fixes some issues the spacy tokenizer had on books corpus
|
||||
also does some whitespace standardization
|
||||
"""
|
||||
text = text.replace('—', '-')
|
||||
text = text.replace('–', '-')
|
||||
text = text.replace('―', '-')
|
||||
text = text.replace('…', '...')
|
||||
text = text.replace('´', "'")
|
||||
text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
|
||||
text = re.sub(r'\s*\n\s*', ' \n ', text)
|
||||
text = re.sub(r'[^\S\n]+', ' ', text)
|
||||
return text.strip()
|
||||
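For example, behaviour inferred from the substitutions above:

```python
print(text_standardize("Hello…   world—today"))
# -> "Hello... world - today"
```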
|
||||
class OpenAIGPTTokenizer(object):
|
||||
"""
|
||||
BPE tokenizer. Peculiarities:
|
||||
- lower case all inputs
|
||||
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, and falls back to BERT's BasicTokenizer if not.
|
||||
- argument special_tokens and function set_special_tokens:
|
||||
can be used to add additional symbols (ex: "__classify__") to a vocabulary.
|
||||
"""
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
|
||||
"""
|
||||
Instantiate an OpenAIGPTTokenizer from a pre-trained vocabulary file.
|
||||
Download and cache the pre-trained model file if needed.
|
||||
"""
|
||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
||||
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||
else:
|
||||
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
|
||||
merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
|
||||
resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
|
||||
except EnvironmentError:
|
||||
logger.error(
|
||||
"Model name '{}' was not found in model name list ({}). "
|
||||
"We assumed '{}' was a path or url but couldn't find files {} and {} "
|
||||
"at this path or url.".format(
|
||||
pretrained_model_name_or_path,
|
||||
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
|
||||
pretrained_model_name_or_path,
|
||||
vocab_file, merges_file))
|
||||
return None
|
||||
if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
|
||||
logger.info("loading vocabulary file {}".format(vocab_file))
|
||||
logger.info("loading merges file {}".format(merges_file))
|
||||
else:
|
||||
logger.info("loading vocabulary file {} from cache at {}".format(
|
||||
vocab_file, resolved_vocab_file))
|
||||
logger.info("loading merges file {} from cache at {}".format(
|
||||
merges_file, resolved_merges_file))
|
||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
|
||||
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
|
||||
# than the number of positional embeddings
|
||||
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
|
||||
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
|
||||
# Instantiate tokenizer.
|
||||
tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
|
||||
return tokenizer
|
||||
|
||||
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
|
||||
try:
|
||||
import ftfy
|
||||
import spacy
|
||||
self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
|
||||
self.fix_text = ftfy.fix_text
|
||||
except ImportError:
|
||||
logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
|
||||
self.nlp = BasicTokenizer(do_lower_case=True,
|
||||
never_split=special_tokens if special_tokens is not None else [])
|
||||
self.fix_text = None
|
||||
|
||||
self.max_len = max_len if max_len is not None else int(1e12)
|
||||
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
||||
self.decoder = {v:k for k,v in self.encoder.items()}
|
||||
merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
||||
merges = [tuple(merge.split()) for merge in merges]
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
self.set_special_tokens(special_tokens)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.encoder) + len(self.special_tokens)
|
||||
|
||||
def set_special_tokens(self, special_tokens):
|
||||
""" Add a list of additional tokens to the encoder.
|
||||
The additional tokens are indexed starting from the last index of the
|
||||
current vocabulary in the order of the `special_tokens` list.
|
||||
"""
|
||||
if not special_tokens:
|
||||
self.special_tokens = {}
|
||||
self.special_tokens_decoder = {}
|
||||
return
|
||||
self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
|
||||
self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
|
||||
if self.fix_text is None:
|
||||
# Using BERT's BasicTokenizer: we can update the tokenizer
|
||||
self.nlp.never_split = special_tokens
|
||||
logger.info("Special tokens {}".format(self.special_tokens))
|
||||
|
||||
def bpe(self, token):
|
||||
word = tuple(token[:-1]) + (token[-1] + '</w>',)
|
||||
if token in self.cache:
|
||||
return self.cache[token]
|
||||
pairs = get_pairs(word)
|
||||
|
||||
if not pairs:
|
||||
return token+'</w>'
|
||||
|
||||
while True:
|
||||
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
|
||||
if bigram not in self.bpe_ranks:
|
||||
break
|
||||
first, second = bigram
|
||||
new_word = []
|
||||
i = 0
|
||||
while i < len(word):
|
||||
try:
|
||||
j = word.index(first, i)
|
||||
new_word.extend(word[i:j])
|
||||
i = j
|
||||
except:
|
||||
new_word.extend(word[i:])
|
||||
break
|
||||
|
||||
if word[i] == first and i < len(word)-1 and word[i+1] == second:
|
||||
new_word.append(first+second)
|
||||
i += 2
|
||||
else:
|
||||
new_word.append(word[i])
|
||||
i += 1
|
||||
new_word = tuple(new_word)
|
||||
word = new_word
|
||||
if len(word) == 1:
|
||||
break
|
||||
else:
|
||||
pairs = get_pairs(word)
|
||||
word = ' '.join(word)
|
||||
if word == '\n </w>':
|
||||
word = '\n</w>'
|
||||
self.cache[token] = word
|
||||
return word
|
||||
|
||||
def tokenize(self, text):
|
||||
""" Tokenize a string. """
|
||||
split_tokens = []
|
||||
if self.fix_text is None:
|
||||
# Using BERT's BasicTokenizer
|
||||
text = self.nlp.tokenize(text)
|
||||
for token in text:
|
||||
split_tokens.extend([t for t in self.bpe(token).split(' ')])
|
||||
else:
|
||||
# Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
|
||||
text = self.nlp(text_standardize(self.fix_text(text)))
|
||||
for token in text:
|
||||
split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
|
||||
return split_tokens
|
||||
|
||||
def convert_tokens_to_ids(self, tokens):
|
||||
""" Converts a sequence of tokens into ids using the vocab. """
|
||||
ids = []
|
||||
if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
|
||||
if tokens in self.special_tokens:
|
||||
return self.special_tokens[tokens]
|
||||
else:
|
||||
return self.encoder.get(tokens, 0)
|
||||
for token in tokens:
|
||||
if token in self.special_tokens:
|
||||
ids.append(self.special_tokens[token])
|
||||
else:
|
||||
ids.append(self.encoder.get(token, 0))
|
||||
if len(ids) > self.max_len:
|
||||
raise ValueError(
|
||||
"Token indices sequence length is longer than the specified maximum "
|
||||
" sequence length for this OpenAI GPT model ({} > {}). Running this"
|
||||
" sequence through the model will result in indexing errors".format(len(ids), self.max_len)
|
||||
)
|
||||
return ids
|
||||
|
||||
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
||||
"""Converts a sequence of ids in BPE tokens using the vocab."""
|
||||
tokens = []
|
||||
for i in ids:
|
||||
if i in self.special_tokens_decoder:
|
||||
if not skip_special_tokens:
|
||||
tokens.append(self.special_tokens_decoder[i])
|
||||
else:
|
||||
tokens.append(self.decoder[i])
|
||||
return tokens
|
||||
|
||||
def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False):
|
||||
"""Converts a sequence of ids in a string."""
|
||||
tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
|
||||
out_string = ''.join(tokens).replace('</w>', ' ').strip()
|
||||
if clean_up_tokenization_spaces:
|
||||
out_string = out_string.replace('<unk>', '')
|
||||
out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ','
|
||||
).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't"
|
||||
).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m "
|
||||
).replace(" 've", "'ve")
|
||||
return out_string
|
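A hedged sketch of the special-token mechanism described in the class docstring (token names are illustrative; assumes the 'openai-gpt' vocabulary files can be downloaded):

```python
tokenizer = OpenAIGPTTokenizer.from_pretrained(
    'openai-gpt', special_tokens=['_start_', '_classify_'])

print(len(tokenizer))                                 # base vocab size + 2
print(tokenizer.convert_tokens_to_ids('_classify_'))  # last index of the extended vocab
```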
pytorch_pretrained_bert/tokenization_transfo_xl.py (new file, 672 lines)
@ -0,0 +1,672 @
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Tokenization classes for Transformer XL model.
|
||||
Adapted from https://github.com/kimiyoung/transformer-xl.
|
||||
"""
|
||||
from __future__ import (absolute_import, division, print_function,
|
||||
unicode_literals)
|
||||
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from collections import Counter, OrderedDict
|
||||
from io import open
|
||||
import unicodedata
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from .file_utils import cached_path
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
import cPickle as pickle
|
||||
else:
|
||||
import pickle
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PRETRAINED_VOCAB_ARCHIVE_MAP = {
|
||||
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
|
||||
}
|
||||
VOCAB_NAME = 'vocab.bin'
|
||||
|
||||
PRETRAINED_CORPUS_ARCHIVE_MAP = {
|
||||
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
|
||||
}
|
||||
CORPUS_NAME = 'corpus.bin'
|
||||
|
||||
class TransfoXLTokenizer(object):
|
||||
"""
|
||||
Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
|
||||
"""
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
|
||||
"""
|
||||
Instantiate a TransfoXLTokenizer.
Download and cache the pre-trained vocabulary file if needed.
|
||||
"""
|
||||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
|
||||
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
|
||||
else:
|
||||
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
|
||||
except EnvironmentError:
|
||||
logger.error(
|
||||
"Model name '{}' was not found in model name list ({}). "
|
||||
"We assumed '{}' was a path or url but couldn't find files {} "
|
||||
"at this path or url.".format(
|
||||
pretrained_model_name_or_path,
|
||||
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
|
||||
pretrained_model_name_or_path,
|
||||
vocab_file))
|
||||
return None
|
||||
if resolved_vocab_file == vocab_file:
|
||||
logger.info("loading vocabulary file {}".format(vocab_file))
|
||||
else:
|
||||
logger.info("loading vocabulary file {} from cache at {}".format(
|
||||
vocab_file, resolved_vocab_file))
|
||||
|
||||
# Instantiate tokenizer.
|
||||
tokenizer = cls(*inputs, **kwargs)
|
||||
vocab_dict = torch.load(resolved_vocab_file)
|
||||
for key, value in vocab_dict.items():
|
||||
tokenizer.__dict__[key] = value
|
||||
return tokenizer
|
||||
|
||||
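    # --- Illustrative usage sketch (added for clarity; not part of the original file) ---
    # Assuming the pretrained wt103 vocabulary, from_pretrained is typically used as:
    #
    #     tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    #     ids = tokenizer.convert_tokens_to_ids(['the', 'quick', 'fox'])
    #     assert tokenizer.convert_ids_to_tokens(ids) == ['the', 'quick', 'fox']
    #
    # The downloaded vocab.bin is a pickled dict of attributes (idx2sym, sym2idx, ...)
    # that the loop above copies onto the freshly constructed tokenizer instance.
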
    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=False,
                 delimiter=None, vocab_file=None, never_split=("<unk>", "<eos>", "<formula>")):
        self.counter = Counter()
        self.special = special
        self.min_freq = min_freq
        self.max_size = max_size
        self.lower_case = lower_case
        self.delimiter = delimiter
        self.vocab_file = vocab_file
        self.never_split = never_split

    def count_file(self, path, verbose=False, add_eos=False):
        if verbose: print('counting file {} ...'.format(path))
        assert os.path.exists(path)

        sents = []
        with open(path, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    print('    line {}'.format(idx))
                symbols = self.tokenize(line, add_eos=add_eos)
                self.counter.update(symbols)
                sents.append(symbols)

        return sents

    def count_sents(self, sents, verbose=False):
        """
            sents : a list of sentences, each a list of tokenized symbols
        """
        if verbose: print('counting {} sents ...'.format(len(sents)))
        for idx, symbols in enumerate(sents):
            if verbose and idx > 0 and idx % 500000 == 0:
                print('    line {}'.format(idx))
            self.counter.update(symbols)

    def _build_from_file(self, vocab_file):
        self.idx2sym = []
        self.sym2idx = OrderedDict()

        with open(vocab_file, 'r', encoding='utf-8') as f:
            for line in f:
                symb = line.strip().split()[0]
                self.add_symbol(symb)
        if '<UNK>' in self.sym2idx:
            self.unk_idx = self.sym2idx['<UNK>']
        elif '<unk>' in self.sym2idx:
            self.unk_idx = self.sym2idx['<unk>']
        else:
            raise ValueError('No <unknown> token in vocabulary')

    def build_vocab(self):
        if self.vocab_file:
            print('building vocab from {}'.format(self.vocab_file))
            self._build_from_file(self.vocab_file)
            print('final vocab size {}'.format(len(self)))
        else:
            print('building vocab with min_freq={}, max_size={}'.format(
                self.min_freq, self.max_size))
            self.idx2sym = []
            self.sym2idx = OrderedDict()

            for sym in self.special:
                self.add_special(sym)

            for sym, cnt in self.counter.most_common(self.max_size):
                if cnt < self.min_freq: break
                self.add_symbol(sym)

            print('final vocab size {} from {} unique tokens'.format(
                len(self), len(self.counter)))

    def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
                    add_double_eos=False):
        if verbose: print('encoding file {} ...'.format(path))
        assert os.path.exists(path)
        encoded = []
        with open(path, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    print('    line {}'.format(idx))
                symbols = self.tokenize(line, add_eos=add_eos,
                                        add_double_eos=add_double_eos)
                encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

    def encode_sents(self, sents, ordered=False, verbose=False):
        if verbose: print('encoding {} sents ...'.format(len(sents)))
        encoded = []
        for idx, symbols in enumerate(sents):
            if verbose and idx > 0 and idx % 500000 == 0:
                print('    line {}'.format(idx))
            encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

    def add_special(self, sym):
        if sym not in self.sym2idx:
            self.idx2sym.append(sym)
            self.sym2idx[sym] = len(self.idx2sym) - 1
            setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])

    def add_symbol(self, sym):
        if sym not in self.sym2idx:
            self.idx2sym.append(sym)
            self.sym2idx[sym] = len(self.idx2sym) - 1

    def get_sym(self, idx):
        assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)
        return self.idx2sym[idx]

    def get_idx(self, sym):
        if sym in self.sym2idx:
            return self.sym2idx[sym]
        else:
            # print('encounter unk {}'.format(sym))
            # assert '<eos>' not in sym
            if hasattr(self, 'unk_idx'):
                return self.sym2idx.get(sym, self.unk_idx)
            # Backward compatibility with pre-trained models
            elif '<unk>' in self.sym2idx:
                return self.sym2idx['<unk>']
            elif '<UNK>' in self.sym2idx:
                return self.sym2idx['<UNK>']
            else:
                raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')

    def convert_ids_to_tokens(self, indices):
        """Converts a sequence of indices into symbols using the vocab."""
        return [self.get_sym(idx) for idx in indices]

    def convert_tokens_to_ids(self, symbols):
        """Converts a sequence of symbols into ids using the vocab."""
        return [self.get_idx(sym) for sym in symbols]

    def convert_to_tensor(self, symbols):
        return torch.LongTensor(self.convert_tokens_to_ids(symbols))

    def decode(self, indices, exclude=None):
        """Converts a sequence of indices into a string."""
        if exclude is None:
            return ' '.join([self.get_sym(idx) for idx in indices])
        else:
            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])

    def __len__(self):
        return len(self.idx2sym)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        if text in self.never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def whitespace_tokenize(self, text):
        """Runs basic whitespace cleaning and splitting on a piece of text."""
        text = text.strip()
        if not text:
            return []
        if self.delimiter == '':
            tokens = text
        else:
            tokens = text.split(self.delimiter)
        return tokens

    def tokenize(self, line, add_eos=False, add_double_eos=False):
        line = self._clean_text(line)
        line = line.strip()

        symbols = self.whitespace_tokenize(line)

        split_symbols = []
        for symbol in symbols:
            if self.lower_case and symbol not in self.never_split:
                symbol = symbol.lower()
                symbol = self._run_strip_accents(symbol)
            split_symbols.extend(self._run_split_on_punc(symbol))

        if add_double_eos: # lm1b
            return ['<S>'] + split_symbols + ['<S>']
        elif add_eos:
            return split_symbols + ['<eos>']
        else:
            return split_symbols

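# --- Illustrative usage sketch (added for clarity; not part of the original file) ---
# Building a vocabulary from raw text and encoding it, assuming a local 'train.txt',
# would roughly look like:
#
#     tokenizer = TransfoXLTokenizer(special=['<eos>'], lower_case=False)
#     tokenizer.count_file('train.txt', add_eos=True)            # fills tokenizer.counter
#     tokenizer.build_vocab()                                     # freezes idx2sym / sym2idx
#     train = tokenizer.encode_file('train.txt', ordered=True)   # single 1-D LongTensor
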
class LMOrderedIterator(object):
    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
        """
            data -- LongTensor -- the LongTensor is strictly ordered
        """
        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device

        # Work out how cleanly we can divide the dataset into bsz parts.
        self.n_step = data.size(0) // bsz

        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, self.n_step * bsz)

        # Evenly divide the data across the bsz batches.
        self.data = data.view(bsz, -1).t().contiguous().to(device)

        # Number of mini-batches
        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt

    def get_batch(self, i, bptt=None):
        if bptt is None: bptt = self.bptt
        seq_len = min(bptt, self.data.size(0) - 1 - i)

        end_idx = i + seq_len
        beg_idx = max(0, i - self.ext_len)

        data = self.data[beg_idx:end_idx]
        target = self.data[i+1:i+1+seq_len]

        data_out = data.transpose(0, 1).contiguous().to(self.device)
        target_out = target.transpose(0, 1).contiguous().to(self.device)

        return data_out, target_out, seq_len

    def get_fixlen_iter(self, start=0):
        for i in range(start, self.data.size(0) - 1, self.bptt):
            yield self.get_batch(i)

    def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
        max_len = self.bptt + max_deviation * std
        i = start
        while True:
            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
            bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
            data, target, seq_len = self.get_batch(i, bptt)
            i += seq_len
            yield data, target, seq_len
            if i >= self.data.size(0) - 2:
                break

    def __iter__(self):
        return self.get_fixlen_iter()

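# --- Illustrative sketch of the batching arithmetic above (not part of the original file) ---
# For a corpus of 1003 tokens with bsz=4 and bptt=10, the iterator keeps
# n_step = 1003 // 4 = 250 positions per stream, reshapes the data to [250, 4],
# and yields ceil(250 / 10) = 25 batches of shape [bsz, seq_len] with seq_len <= 10:
#
#     data = torch.randint(0, 100, (1003,))
#     it = LMOrderedIterator(data, bsz=4, bptt=10)
#     first_data, first_target, seq_len = it.get_batch(0)   # both tensors are [4, 10]
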
class LMShuffledIterator(object):
    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False):
        """
            data -- list[LongTensor] -- there is no order among the LongTensors
        """
        self.data = data

        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self):
        # index iterator
        epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \
            else np.array(range(len(self.data)))

        # sentence iterator
        for idx in epoch_indices:
            yield self.data[idx]

    def stream_iterator(self, sent_stream):
        # streams for each data in the batch
        streams = [None] * self.bsz

        data = torch.LongTensor(self.bptt, self.bsz)
        target = torch.LongTensor(self.bptt, self.bsz)

        n_retain = 0

        while True:
            # data   : [n_retain+bptt x bsz]
            # target : [bptt x bsz]
            data[n_retain:].fill_(-1)
            target.fill_(-1)

            valid_batch = True

            for i in range(self.bsz):
                n_filled = 0
                try:
                    while n_filled < self.bptt:
                        if streams[i] is None or len(streams[i]) <= 1:
                            streams[i] = next(sent_stream)
                        # number of new tokens to fill in
                        n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
                        # first n_retain tokens are retained from last batch
                        data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \
                            streams[i][:n_new]
                        target[n_filled:n_filled+n_new, i] = \
                            streams[i][1:n_new+1]
                        streams[i] = streams[i][n_new:]
                        n_filled += n_new
                except StopIteration:
                    valid_batch = False
                    break

            if not valid_batch:
                return

            data_out = data.transpose(0, 1).contiguous().to(self.device)
            target_out = target.transpose(0, 1).contiguous().to(self.device)

            yield data_out, target_out, self.bptt

            n_retain = min(data.size(0), self.ext_len)
            if n_retain > 0:
                data[:n_retain] = data[-n_retain:]
            data.resize_(n_retain + self.bptt, data.size(1))

    def __iter__(self):
        # sent_stream is an iterator
        sent_stream = self.get_sent_stream()

        for batch in self.stream_iterator(sent_stream):
            yield batch


class LMMultiFileIterator(LMShuffledIterator):
    def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None,
                 shuffle=False):

        self.paths = paths
        self.vocab = vocab

        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self, path):
        sents = self.vocab.encode_file(path, add_double_eos=True)
        if self.shuffle:
            np.random.shuffle(sents)
        sent_stream = iter(sents)

        return sent_stream

    def __iter__(self):
        if self.shuffle:
            np.random.shuffle(self.paths)

        for path in self.paths:
            # sent_stream is an iterator
            sent_stream = self.get_sent_stream(path)
            for batch in self.stream_iterator(sent_stream):
                yield batch

class TransfoXLCorpus(object):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
        """
        Instantiate a pre-processed corpus.
        """
        vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP:
            corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path]
        else:
            corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Corpus '{}' was not found in corpus list ({}). "
                "We assumed '{}' was a path or url but couldn't find files {} "
                "at this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    pretrained_model_name_or_path,
                    corpus_file))
            return None
        if resolved_corpus_file == corpus_file:
            logger.info("loading corpus file {}".format(corpus_file))
        else:
            logger.info("loading corpus file {} from cache at {}".format(
                corpus_file, resolved_corpus_file))

        # Instantiate corpus.
        corpus = cls(*inputs, **kwargs)
        corpus_dict = torch.load(resolved_corpus_file)
        for key, value in corpus_dict.items():
            corpus.__dict__[key] = value
        corpus.vocab = vocab
        if corpus.train is not None:
            corpus.train = torch.tensor(corpus.train, dtype=torch.long)
        if corpus.valid is not None:
            corpus.valid = torch.tensor(corpus.valid, dtype=torch.long)
        if corpus.test is not None:
            corpus.test = torch.tensor(corpus.test, dtype=torch.long)
        return corpus

    def __init__(self, *args, **kwargs):
        self.vocab = TransfoXLTokenizer(*args, **kwargs)
        self.dataset = None
        self.train = None
        self.valid = None
        self.test = None

    def build_corpus(self, path, dataset):
        self.dataset = dataset

        if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:
            self.vocab.count_file(os.path.join(path, 'train.txt'))
            self.vocab.count_file(os.path.join(path, 'valid.txt'))
            self.vocab.count_file(os.path.join(path, 'test.txt'))
        elif self.dataset == 'wt103':
            self.vocab.count_file(os.path.join(path, 'train.txt'))
        elif self.dataset == 'lm1b':
            train_path_pattern = os.path.join(
                path, '1-billion-word-language-modeling-benchmark-r13output',
                'training-monolingual.tokenized.shuffled', 'news.en-*')
            train_paths = glob.glob(train_path_pattern)
            # the vocab will load from file when build_vocab() is called

        self.vocab.build_vocab()

        if self.dataset in ['ptb', 'wt2', 'wt103']:
            self.train = self.vocab.encode_file(
                os.path.join(path, 'train.txt'), ordered=True)
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=True)
        elif self.dataset in ['enwik8', 'text8']:
            self.train = self.vocab.encode_file(
                os.path.join(path, 'train.txt'), ordered=True, add_eos=False)
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=True, add_eos=False)
        elif self.dataset == 'lm1b':
            self.train = train_paths
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)

    def get_iterator(self, split, *args, **kwargs):
        if split == 'train':
            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
                data_iter = LMOrderedIterator(self.train, *args, **kwargs)
            elif self.dataset == 'lm1b':
                kwargs['shuffle'] = True
                data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
        elif split in ['valid', 'test']:
            data = self.valid if split == 'valid' else self.test
            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
                data_iter = LMOrderedIterator(data, *args, **kwargs)
            elif self.dataset == 'lm1b':
                data_iter = LMShuffledIterator(data, *args, **kwargs)

        return data_iter

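# --- Illustrative usage sketch (added for clarity; not part of the original file) ---
# Loading the pre-processed wt103 corpus and iterating over its validation split
# would roughly look like (batch size and bptt are assumptions):
#
#     corpus = TransfoXLCorpus.from_pretrained('transfo-xl-wt103')
#     va_iter = corpus.get_iterator('valid', bsz=10, bptt=128, device='cpu', ext_len=0)
#     for data, target, seq_len in va_iter:
#         pass   # data and target are [bsz, seq_len] LongTensors
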
def get_lm_corpus(datadir, dataset):
    fn = os.path.join(datadir, 'cache.pt')
    fn_pickle = os.path.join(datadir, 'cache.pkl')
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    elif os.path.exists(fn_pickle):
        print('Loading cached dataset from pickle...')
        with open(fn_pickle, "rb") as fp:
            corpus = pickle.load(fp)
    else:
        print('Producing dataset {}...'.format(dataset))
        kwargs = {}
        if dataset in ['wt103', 'wt2']:
            kwargs['special'] = ['<eos>']
            kwargs['lower_case'] = False
        elif dataset == 'ptb':
            kwargs['special'] = ['<eos>']
            kwargs['lower_case'] = True
        elif dataset == 'lm1b':
            kwargs['special'] = []
            kwargs['lower_case'] = False
            kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')
        elif dataset in ['enwik8', 'text8']:
            pass

        corpus = TransfoXLCorpus(datadir, dataset, **kwargs)
        torch.save(corpus, fn)

    return corpus

def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False
@@ -1,6 +1,5 @@
-# This installs Pytorch for CUDA 8 only. If you are using a newer version,
-# please visit http://pytorch.org/ and install the relevant version.
-torch>=0.4.1,<0.5.0
+# PyTorch
+torch>=0.4.1
 # progress bars in model download and training scripts
 tqdm
 # Accessing files from S3 directly.
50  setup.py
"""
|
||||
Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py
|
||||
|
||||
To create the package for pypi.
|
||||
|
||||
1. Change the version in __init__.py and setup.py.
|
||||
|
||||
2. Commit these changes with the message: "Release: VERSION"
|
||||
|
||||
3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' "
|
||||
Push the tag to git: git push --tags origin master
|
||||
|
||||
4. Build both the sources and the wheel. Do not change anything in setup.py between
|
||||
creating the wheel and the source distribution (obviously).
|
||||
|
||||
For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory.
|
||||
(this will build a wheel for the python version you use to build it - make sure you use python 3.x).
|
||||
|
||||
For the sources, run: "python setup.py sdist"
|
||||
You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp.
|
||||
|
||||
5. Check that everything looks correct by uploading the package to the pypi test server:
|
||||
|
||||
twine upload dist/* -r pypitest
|
||||
(pypi suggest using twine as other methods upload files via plaintext.)
|
||||
|
||||
Check that you can install it in a virtualenv by running:
|
||||
pip install -i https://testpypi.python.org/pypi allennlp
|
||||
|
||||
6. Upload the final version to actual pypi:
|
||||
twine upload dist/* -r pypi
|
||||
|
||||
7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
|
||||
|
||||
"""
|
||||
from io import open
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
setup(
|
||||
name="pytorch_pretrained_bert",
|
||||
version="0.2.0",
|
||||
author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors",
|
||||
version="0.6.0",
|
||||
author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
|
||||
author_email="thomas@huggingface.co",
|
||||
description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
|
||||
long_description=open("README.md", "r").read(),
|
||||
long_description=open("README.md", "r", encoding='utf-8').read(),
|
||||
long_description_content_type="text/markdown",
|
||||
keywords='BERT NLP deep learning google',
|
||||
license='Apache',
|
||||
@ -18,8 +54,12 @@ setup(
|
||||
'boto3',
|
||||
'requests',
|
||||
'tqdm'],
|
||||
scripts=["bin/pytorch_pretrained_bert"],
|
||||
python_requires='>=3.5.0',
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
"pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
|
||||
]
|
||||
},
|
||||
# python_requires='>=3.5.0',
|
||||
tests_require=['pytest'],
|
||||
classifiers=[
|
||||
'Intended Audience :: Science/Research',
|
||||
|
210  tests/modeling_gpt2_test.py  Normal file
@@ -0,0 +1,210 @@
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import json
|
||||
import random
|
||||
|
||||
import torch
|
||||
|
||||
from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
|
||||
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
||||
|
||||
|
||||
class GPT2ModelTest(unittest.TestCase):
|
||||
class GPT2ModelTester(object):
|
||||
|
||||
def __init__(self,
|
||||
parent,
|
||||
batch_size=13,
|
||||
seq_length=7,
|
||||
is_training=True,
|
||||
use_position_ids=True,
|
||||
use_token_type_ids=True,
|
||||
use_labels=True,
|
||||
vocab_size=99,
|
||||
n_positions=33,
|
||||
n_embd=32,
|
||||
n_layer=5,
|
||||
n_head=4,
|
||||
n_choices=3,
|
||||
type_sequence_label_size=2,
|
||||
initializer_range=0.02,
|
||||
num_labels=3,
|
||||
scope=None):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.seq_length = seq_length
|
||||
self.is_training = is_training
|
||||
self.use_position_ids = use_position_ids
|
||||
self.use_token_type_ids = use_token_type_ids
|
||||
self.use_labels = use_labels
|
||||
self.vocab_size = vocab_size
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.n_choices = n_choices
|
||||
self.type_sequence_label_size = type_sequence_label_size
|
||||
self.initializer_range = initializer_range
|
||||
self.num_labels = num_labels
|
||||
self.scope = scope
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)
|
||||
|
||||
position_ids = None
|
||||
if self.use_position_ids:
|
||||
position_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
|
||||
|
||||
token_type_ids = None
|
||||
if self.use_token_type_ids:
|
||||
total_voc = self.vocab_size
|
||||
token_type_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
|
||||
|
||||
mc_labels = None
|
||||
lm_labels = None
|
||||
mc_token_ids = None
|
||||
if self.use_labels:
|
||||
mc_labels = GPT2ModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||
lm_labels = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
|
||||
mc_token_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)
|
||||
|
||||
config = GPT2Config(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
n_positions=self.n_positions,
|
||||
n_embd=self.n_embd,
|
||||
n_layer=self.n_layer,
|
||||
n_head=self.n_head,
|
||||
initializer_range=self.initializer_range)
|
||||
|
||||
return (config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids)
|
||||
|
||||
def create_gpt2_model(self, config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
model = GPT2Model(config)
|
||||
model.eval()
|
||||
hidden_states, presents = model(input_ids, position_ids, token_type_ids)
|
||||
outputs = {
|
||||
"hidden_states": hidden_states,
|
||||
"presents": presents,
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_gpt2_model_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
list(result["hidden_states"].size()),
|
||||
[self.batch_size, self.n_choices, self.seq_length, self.n_embd])
|
||||
|
||||
|
||||
def create_gpt2_lm_head(self, config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
model = GPT2LMHeadModel(config)
|
||||
model.eval()
|
||||
loss = model(input_ids, position_ids, token_type_ids, lm_labels)
|
||||
lm_logits, presents = model(input_ids, position_ids, token_type_ids)
|
||||
outputs = {
|
||||
"loss": loss,
|
||||
"lm_logits": lm_logits,
|
||||
"presents": presents,
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_gpt2_lm_head_output(self, result):
|
||||
total_voc = self.vocab_size
|
||||
self.parent.assertListEqual(
|
||||
list(result["lm_logits"].size()),
|
||||
[self.batch_size, self.n_choices, self.seq_length, total_voc])
|
||||
|
||||
def check_gpt2_lm_head_loss_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
list(result["loss"].size()),
|
||||
[])
|
||||
|
||||
def create_gpt2_double_heads(self, config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
model = GPT2DoubleHeadsModel(config)
|
||||
model.eval()
|
||||
loss = model(input_ids, mc_token_ids,
|
||||
lm_labels=lm_labels, mc_labels=mc_labels,
|
||||
token_type_ids=token_type_ids, position_ids=position_ids)
|
||||
lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)
|
||||
outputs = {
|
||||
"loss": loss,
|
||||
"lm_logits": lm_logits,
|
||||
"mc_logits": mc_logits,
|
||||
"presents": presents,
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_gpt2_double_heads_output(self, result):
|
||||
total_voc = self.vocab_size
|
||||
self.parent.assertListEqual(
|
||||
list(result["lm_logits"].size()),
|
||||
[self.batch_size, self.n_choices, self.seq_length, total_voc])
|
||||
self.parent.assertListEqual(
|
||||
list(result["mc_logits"].size()),
|
||||
[self.batch_size, self.n_choices])
|
||||
|
||||
def check_gpt2_double_heads_loss_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
[list(l.size()) for l in result["loss"]],
|
||||
[[], []])
|
||||
|
||||
def test_default(self):
|
||||
self.run_tester(GPT2ModelTest.GPT2ModelTester(self))
|
||||
|
||||
def test_config_to_json_string(self):
|
||||
config = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)
|
||||
obj = json.loads(config.to_json_string())
|
||||
self.assertEqual(obj["vocab_size"], 99)
|
||||
self.assertEqual(obj["n_embd"], 37)
|
||||
|
||||
def run_tester(self, tester):
|
||||
config_and_inputs = tester.prepare_config_and_inputs()
|
||||
output_result = tester.create_gpt2_model(*config_and_inputs)
|
||||
tester.check_gpt2_model_output(output_result)
|
||||
|
||||
output_result = tester.create_gpt2_lm_head(*config_and_inputs)
|
||||
tester.check_gpt2_lm_head_output(output_result)
|
||||
tester.check_gpt2_lm_head_loss_output(output_result)
|
||||
|
||||
output_result = tester.create_gpt2_double_heads(*config_and_inputs)
|
||||
tester.check_gpt2_double_heads_output(output_result)
|
||||
tester.check_gpt2_double_heads_loss_output(output_result)
|
||||
|
||||
@classmethod
|
||||
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
|
||||
"""Creates a random int32 tensor of the shape within the vocab size."""
|
||||
if rng is None:
|
||||
rng = random.Random()
|
||||
|
||||
total_dims = 1
|
||||
for dim in shape:
|
||||
total_dims *= dim
|
||||
|
||||
values = []
|
||||
for _ in range(total_dims):
|
||||
values.append(rng.randint(0, vocab_size - 1))
|
||||
|
||||
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
222  tests/modeling_openai_test.py  Normal file
@@ -0,0 +1,222 @@
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import json
|
||||
import random
|
||||
|
||||
import torch
|
||||
|
||||
from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
|
||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
||||
|
||||
|
||||
class OpenAIGPTModelTest(unittest.TestCase):
|
||||
class OpenAIGPTModelTester(object):
|
||||
|
||||
def __init__(self,
|
||||
parent,
|
||||
batch_size=13,
|
||||
seq_length=7,
|
||||
is_training=True,
|
||||
use_position_ids=True,
|
||||
use_token_type_ids=True,
|
||||
use_labels=True,
|
||||
vocab_size=99,
|
||||
n_special=1,
|
||||
n_positions=33,
|
||||
n_embd=32,
|
||||
n_layer=5,
|
||||
n_head=4,
|
||||
n_choices=3,
|
||||
afn="gelu",
|
||||
resid_pdrop=0.1,
|
||||
attn_pdrop=0.1,
|
||||
embd_pdrop=0.1,
|
||||
type_sequence_label_size=2,
|
||||
initializer_range=0.02,
|
||||
num_labels=3,
|
||||
scope=None):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.seq_length = seq_length
|
||||
self.is_training = is_training
|
||||
self.use_position_ids = use_position_ids
|
||||
self.use_token_type_ids = use_token_type_ids
|
||||
self.use_labels = use_labels
|
||||
self.vocab_size = vocab_size
|
||||
self.n_special = n_special
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
self.n_head = n_head
|
||||
self.afn = afn
|
||||
self.n_choices = n_choices
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.embd_pdrop = embd_pdrop
|
||||
self.type_sequence_label_size = type_sequence_label_size
|
||||
self.initializer_range = initializer_range
|
||||
self.num_labels = num_labels
|
||||
self.scope = scope
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
input_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)
|
||||
|
||||
position_ids = None
|
||||
if self.use_position_ids:
|
||||
position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
|
||||
|
||||
token_type_ids = None
|
||||
if self.use_token_type_ids:
|
||||
total_voc = self.vocab_size + self.n_special
|
||||
token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
|
||||
|
||||
mc_labels = None
|
||||
lm_labels = None
|
||||
mc_token_ids = None
|
||||
if self.use_labels:
|
||||
mc_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||
lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
|
||||
mc_token_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)
|
||||
|
||||
config = OpenAIGPTConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
n_positions=self.n_positions,
|
||||
n_special=self.n_special,
|
||||
n_embd=self.n_embd,
|
||||
n_layer=self.n_layer,
|
||||
n_head=self.n_head,
|
||||
afn=self.afn,
|
||||
resid_pdrop=self.resid_pdrop,
|
||||
attn_pdrop=self.attn_pdrop,
|
||||
embd_pdrop=self.embd_pdrop,
|
||||
initializer_range=self.initializer_range)
|
||||
|
||||
return (config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids)
|
||||
|
||||
def create_openai_model(self, config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
model = OpenAIGPTModel(config)
|
||||
model.eval()
|
||||
hidden_states = model(input_ids, position_ids, token_type_ids)
|
||||
outputs = {
|
||||
"hidden_states": hidden_states,
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_openai_model_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
list(result["hidden_states"].size()),
|
||||
[self.batch_size, self.n_choices, self.seq_length, self.n_embd])
|
||||
|
||||
|
||||
def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
model = OpenAIGPTLMHeadModel(config)
|
||||
model.eval()
|
||||
loss = model(input_ids, position_ids, token_type_ids, lm_labels)
|
||||
lm_logits = model(input_ids, position_ids, token_type_ids)
|
||||
outputs = {
|
||||
"loss": loss,
|
||||
"lm_logits": lm_logits,
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_openai_lm_head_output(self, result):
|
||||
total_voc = self.n_special + self.vocab_size
|
||||
self.parent.assertListEqual(
|
||||
list(result["lm_logits"].size()),
|
||||
[self.batch_size, self.n_choices, self.seq_length, total_voc])
|
||||
|
||||
def check_openai_lm_head_loss_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
list(result["loss"].size()),
|
||||
[])
|
||||
|
||||
def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
model = OpenAIGPTDoubleHeadsModel(config)
|
||||
model.eval()
|
||||
loss = model(input_ids, mc_token_ids,
|
||||
lm_labels=lm_labels, mc_labels=mc_labels,
|
||||
token_type_ids=token_type_ids, position_ids=position_ids)
|
||||
lm_logits, mc_logits = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)
|
||||
outputs = {
|
||||
"loss": loss,
|
||||
"lm_logits": lm_logits,
|
||||
"mc_logits": mc_logits,
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_openai_double_heads_output(self, result):
|
||||
total_voc = self.n_special + self.vocab_size
|
||||
self.parent.assertListEqual(
|
||||
list(result["lm_logits"].size()),
|
||||
[self.batch_size, self.n_choices, self.seq_length, total_voc])
|
||||
self.parent.assertListEqual(
|
||||
list(result["mc_logits"].size()),
|
||||
[self.batch_size, self.n_choices])
|
||||
|
||||
def check_openai_double_heads_loss_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
[list(l.size()) for l in result["loss"]],
|
||||
[[], []])
|
||||
|
||||
def test_default(self):
|
||||
self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self))
|
||||
|
||||
def test_config_to_json_string(self):
|
||||
config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)
|
||||
obj = json.loads(config.to_json_string())
|
||||
self.assertEqual(obj["vocab_size"], 99)
|
||||
self.assertEqual(obj["n_embd"], 37)
|
||||
|
||||
def run_tester(self, tester):
|
||||
config_and_inputs = tester.prepare_config_and_inputs()
|
||||
output_result = tester.create_openai_model(*config_and_inputs)
|
||||
tester.check_openai_model_output(output_result)
|
||||
|
||||
output_result = tester.create_openai_lm_head(*config_and_inputs)
|
||||
tester.check_openai_lm_head_output(output_result)
|
||||
tester.check_openai_lm_head_loss_output(output_result)
|
||||
|
||||
output_result = tester.create_openai_double_heads(*config_and_inputs)
|
||||
tester.check_openai_double_heads_output(output_result)
|
||||
tester.check_openai_double_heads_loss_output(output_result)
|
||||
|
||||
@classmethod
|
||||
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
|
||||
"""Creates a random int32 tensor of the shape within the vocab size."""
|
||||
if rng is None:
|
||||
rng = random.Random()
|
||||
|
||||
total_dims = 1
|
||||
for dim in shape:
|
||||
total_dims *= dim
|
||||
|
||||
values = []
|
||||
for _ in range(total_dims):
|
||||
values.append(rng.randint(0, vocab_size - 1))
|
||||
|
||||
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@ -22,7 +22,10 @@ import random
|
||||
|
||||
import torch
|
||||
|
||||
from pytorch_pretrained_bert import BertConfig, BertModel
|
||||
from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
|
||||
BertForNextSentencePrediction, BertForPreTraining,
|
||||
BertForQuestionAnswering, BertForSequenceClassification,
|
||||
BertForTokenClassification)
|
||||
|
||||
|
||||
class BertModelTest(unittest.TestCase):
|
||||
@ -35,6 +38,7 @@ class BertModelTest(unittest.TestCase):
|
||||
is_training=True,
|
||||
use_input_mask=True,
|
||||
use_token_type_ids=True,
|
||||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
@ -45,7 +49,9 @@ class BertModelTest(unittest.TestCase):
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=512,
|
||||
type_vocab_size=16,
|
||||
type_sequence_label_size=2,
|
||||
initializer_range=0.02,
|
||||
num_labels=3,
|
||||
scope=None):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
@ -53,6 +59,7 @@ class BertModelTest(unittest.TestCase):
|
||||
self.is_training = is_training
|
||||
self.use_input_mask = use_input_mask
|
||||
self.use_token_type_ids = use_token_type_ids
|
||||
self.use_labels = use_labels
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
@ -63,10 +70,12 @@ class BertModelTest(unittest.TestCase):
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.type_vocab_size = type_vocab_size
|
||||
self.type_sequence_label_size = type_sequence_label_size
|
||||
self.initializer_range = initializer_range
|
||||
self.num_labels = num_labels
|
||||
self.scope = scope
|
||||
|
||||
def create_model(self):
|
||||
def prepare_config_and_inputs(self):
|
||||
input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
|
||||
input_mask = None
|
||||
@ -77,6 +86,12 @@ class BertModelTest(unittest.TestCase):
|
||||
if self.use_token_type_ids:
|
||||
token_type_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||
|
||||
sequence_labels = None
|
||||
token_labels = None
|
||||
if self.use_labels:
|
||||
sequence_labels = BertModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||
token_labels = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
||||
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
@ -90,10 +105,17 @@ class BertModelTest(unittest.TestCase):
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels
|
||||
|
||||
def check_loss_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
list(result["loss"].size()),
|
||||
[])
|
||||
|
||||
def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
|
||||
model = BertModel(config=config)
|
||||
|
||||
model.eval()
|
||||
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
|
||||
|
||||
outputs = {
|
||||
"sequence_output": all_encoder_layers[-1],
|
||||
"pooled_output": pooled_output,
|
||||
@ -101,13 +123,125 @@ class BertModelTest(unittest.TestCase):
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_output(self, result):
|
||||
def check_bert_model_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
[size for layer in result["all_encoder_layers"] for size in layer.size()],
|
||||
[self.batch_size, self.seq_length, self.hidden_size] * self.num_hidden_layers)
|
||||
self.parent.assertListEqual(
|
||||
list(result["sequence_output"].size()),
|
||||
[self.batch_size, self.seq_length, self.hidden_size])
|
||||
|
||||
self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
|
||||
|
||||
|
||||
def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
|
||||
model = BertForMaskedLM(config=config)
|
||||
model.eval()
|
||||
loss = model(input_ids, token_type_ids, input_mask, token_labels)
|
||||
prediction_scores = model(input_ids, token_type_ids, input_mask)
|
||||
outputs = {
|
||||
"loss": loss,
|
||||
"prediction_scores": prediction_scores,
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_bert_for_masked_lm_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
list(result["prediction_scores"].size()),
|
||||
[self.batch_size, self.seq_length, self.vocab_size])
|
||||
|
||||
def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
|
||||
model = BertForNextSentencePrediction(config=config)
|
||||
model.eval()
|
||||
loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
|
||||
seq_relationship_score = model(input_ids, token_type_ids, input_mask)
|
||||
outputs = {
|
||||
"loss": loss,
|
||||
"seq_relationship_score": seq_relationship_score,
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_bert_for_next_sequence_prediction_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
list(result["seq_relationship_score"].size()),
|
||||
[self.batch_size, 2])
|
||||
|
||||
|
||||
def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
|
||||
model = BertForPreTraining(config=config)
|
||||
model.eval()
|
||||
loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
|
||||
prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask)
|
||||
outputs = {
|
||||
"loss": loss,
|
||||
"prediction_scores": prediction_scores,
|
||||
"seq_relationship_score": seq_relationship_score,
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_bert_for_pretraining_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
list(result["prediction_scores"].size()),
|
||||
[self.batch_size, self.seq_length, self.vocab_size])
|
||||
self.parent.assertListEqual(
|
||||
list(result["seq_relationship_score"].size()),
|
||||
[self.batch_size, 2])
|
||||
|
||||
|
||||
def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
|
||||
model = BertForQuestionAnswering(config=config)
|
||||
model.eval()
|
||||
loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
|
||||
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
|
||||
outputs = {
|
||||
"loss": loss,
|
||||
"start_logits": start_logits,
|
||||
"end_logits": end_logits,
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_bert_for_question_answering_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
list(result["start_logits"].size()),
|
||||
[self.batch_size, self.seq_length])
|
||||
self.parent.assertListEqual(
|
||||
list(result["end_logits"].size()),
|
||||
[self.batch_size, self.seq_length])
|
||||
|
||||
|
||||
def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
|
||||
model = BertForSequenceClassification(config=config, num_labels=self.num_labels)
|
||||
model.eval()
|
||||
loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
|
||||
logits = model(input_ids, token_type_ids, input_mask)
|
||||
outputs = {
|
||||
"loss": loss,
|
||||
"logits": logits,
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_bert_for_sequence_classification_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
list(result["logits"].size()),
|
||||
[self.batch_size, self.num_labels])
|
||||
|
||||
|
||||
def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
|
||||
model = BertForTokenClassification(config=config, num_labels=self.num_labels)
|
||||
model.eval()
|
||||
loss = model(input_ids, token_type_ids, input_mask, token_labels)
|
||||
logits = model(input_ids, token_type_ids, input_mask)
|
||||
outputs = {
|
||||
"loss": loss,
|
||||
"logits": logits,
|
||||
}
|
||||
return outputs
|
||||
|
||||
def check_bert_for_token_classification_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
list(result["logits"].size()),
|
||||
[self.batch_size, self.seq_length, self.num_labels])
|
||||
|
||||
|
||||
def test_default(self):
|
||||
self.run_tester(BertModelTest.BertModelTester(self))
|
||||
|
||||
@ -118,8 +252,33 @@ class BertModelTest(unittest.TestCase):
|
||||
self.assertEqual(obj["hidden_size"], 37)
|
||||
|
||||
def run_tester(self, tester):
|
||||
output_result = tester.create_model()
|
||||
tester.check_output(output_result)
|
||||
config_and_inputs = tester.prepare_config_and_inputs()
|
||||
output_result = tester.create_bert_model(*config_and_inputs)
|
||||
tester.check_bert_model_output(output_result)
|
||||
|
||||
output_result = tester.create_bert_for_masked_lm(*config_and_inputs)
|
||||
tester.check_bert_for_masked_lm_output(output_result)
|
||||
tester.check_loss_output(output_result)
|
||||
|
||||
output_result = tester.create_bert_for_next_sequence_prediction(*config_and_inputs)
|
||||
tester.check_bert_for_next_sequence_prediction_output(output_result)
|
||||
tester.check_loss_output(output_result)
|
||||
|
||||
output_result = tester.create_bert_for_pretraining(*config_and_inputs)
|
||||
tester.check_bert_for_pretraining_output(output_result)
|
||||
tester.check_loss_output(output_result)
|
||||
|
||||
output_result = tester.create_bert_for_question_answering(*config_and_inputs)
|
||||
tester.check_bert_for_question_answering_output(output_result)
|
||||
tester.check_loss_output(output_result)
|
||||
|
||||
output_result = tester.create_bert_for_sequence_classification(*config_and_inputs)
|
||||
tester.check_bert_for_sequence_classification_output(output_result)
|
||||
tester.check_loss_output(output_result)
|
||||
|
||||
output_result = tester.create_bert_for_token_classification(*config_and_inputs)
|
||||
tester.check_bert_for_token_classification_output(output_result)
|
||||
tester.check_loss_output(output_result)
|
||||
|
||||
@classmethod
|
||||
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
|
||||
|
218  tests/modeling_transfo_xl_test.py  Normal file
@@ -0,0 +1,218 @@
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import json
|
||||
import random
|
||||
|
||||
import torch
|
||||
|
||||
from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
|
||||
|
||||
|
||||
class TransfoXLModelTest(unittest.TestCase):
|
||||
class TransfoXLModelTester(object):
|
||||
|
||||
def __init__(self,
|
||||
parent,
|
||||
batch_size=13,
|
||||
seq_length=7,
|
||||
mem_len=30,
|
||||
clamp_len=15,
|
||||
is_training=True,
|
||||
use_labels=True,
|
||||
vocab_size=99,
|
||||
cutoffs=[10, 50, 80],
|
||||
d_model=32,
|
||||
d_embed=32,
|
||||
n_head=4,
|
||||
d_head=8,
|
||||
d_inner=128,
|
||||
div_val=2,
|
||||
                     n_layer=5,
                     scope=None,
                     seed=1):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.mem_len = mem_len
            self.clamp_len = clamp_len
            self.is_training = is_training
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.cutoffs = cutoffs
            self.d_model = d_model
            self.d_embed = d_embed
            self.n_head = n_head
            self.d_head = d_head
            self.d_inner = d_inner
            self.div_val = div_val
            self.n_layer = n_layer
            self.scope = scope
            self.seed = seed

        def prepare_config_and_inputs(self):
            input_ids_1 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
            input_ids_2 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            lm_labels = None
            if self.use_labels:
                lm_labels = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            config = TransfoXLConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                mem_len=self.mem_len,
                clamp_len=self.clamp_len,
                cutoffs=self.cutoffs,
                d_model=self.d_model,
                d_embed=self.d_embed,
                n_head=self.n_head,
                d_head=self.d_head,
                d_inner=self.d_inner,
                div_val=self.div_val,
                n_layer=self.n_layer)

            return (config, input_ids_1, input_ids_2, lm_labels)

        def set_seed(self):
            random.seed(self.seed)
            torch.manual_seed(self.seed)

        def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
            model = TransfoXLModel(config)
            model.eval()

            hidden_states_1, mems_1 = model(input_ids_1)
            hidden_states_2, mems_2 = model(input_ids_2, mems_1)
            outputs = {
                "hidden_states_1": hidden_states_1,
                "mems_1": mems_1,
                "hidden_states_2": hidden_states_2,
                "mems_2": mems_2,
            }
            return outputs

        def check_transfo_xl_model_output(self, result):
            self.parent.assertListEqual(
                list(result["hidden_states_1"].size()),
                [self.batch_size, self.seq_length, self.d_model])
            self.parent.assertListEqual(
                list(result["hidden_states_2"].size()),
                [self.batch_size, self.seq_length, self.d_model])
            self.parent.assertListEqual(
                list(list(mem.size()) for mem in result["mems_1"]),
                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
            self.parent.assertListEqual(
                list(list(mem.size()) for mem in result["mems_2"]),
                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)


        def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
            model = TransfoXLLMHeadModel(config)
            model.eval()

            loss_1, mems_1a = model(input_ids_1, target=lm_labels)
            lm_logits_1, mems_1b = model(input_ids_1)

            loss_2, mems_2a = model(input_ids_2, target=lm_labels, mems=mems_1a)
            lm_logits_2, mems_2b = model(input_ids_2, mems=mems_1b)

            outputs = {
                "loss_1": loss_1,
                "mems_1a": mems_1a,
                "lm_logits_1": lm_logits_1,
                "mems_1b": mems_1b,
                "loss_2": loss_2,
                "mems_2a": mems_2a,
                "lm_logits_2": lm_logits_2,
                "mems_2b": mems_2b,
            }
            return outputs

        def check_transfo_xl_lm_head_output(self, result):
            self.parent.assertListEqual(
                list(result["loss_1"].size()),
                [self.batch_size, self.seq_length])
            self.parent.assertListEqual(
                list(result["lm_logits_1"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
            self.parent.assertListEqual(
                list(list(mem.size()) for mem in result["mems_1a"]),
                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
            self.parent.assertListEqual(
                list(list(mem.size()) for mem in result["mems_1b"]),
                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
            self.parent.assertListEqual(
                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1a"]),
                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1b"]))

            self.parent.assertListEqual(
                list(result["loss_2"].size()),
                [self.batch_size, self.seq_length])
            self.parent.assertListEqual(
                list(result["lm_logits_2"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
            self.parent.assertListEqual(
                list(list(mem.size()) for mem in result["mems_2a"]),
                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
            self.parent.assertListEqual(
                list(list(mem.size()) for mem in result["mems_2b"]),
                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
            self.parent.assertListEqual(
                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2a"]),
                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2b"]))

    def test_default(self):
        self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self))

    def test_config_to_json_string(self):
        config = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37)
        obj = json.loads(config.to_json_string())
        self.assertEqual(obj["n_token"], 96)
        self.assertEqual(obj["d_embed"], 37)

    def run_tester(self, tester):
        config_and_inputs = tester.prepare_config_and_inputs()

        tester.set_seed()
        output_result = tester.create_transfo_xl_model(*config_and_inputs)
        tester.check_transfo_xl_model_output(output_result)

        tester.set_seed()
        output_result = tester.create_transfo_xl_lm_head(*config_and_inputs)
        tester.check_transfo_xl_lm_head_output(output_result)

    @classmethod
    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
        """Creates a random int32 tensor of the shape within the vocab size."""
        if rng is None:
            rng = random.Random()

        total_dims = 1
        for dim in shape:
            total_dims *= dim

        values = []
        for _ in range(total_dims):
            values.append(rng.randint(0, vocab_size - 1))

        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()


if __name__ == "__main__":
    unittest.main()
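For reference, a minimal standalone sketch (not part of this diff) of the segment-level recurrence that create_transfo_xl_model above exercises. The hyperparameter values are illustrative only, and it assumes TransfoXLConfig and TransfoXLModel are importable from pytorch_pretrained_bert as in the test file.

# Sketch only: illustrative hyperparameters; imports assumed to match the test file above.
import torch
from pytorch_pretrained_bert import TransfoXLConfig, TransfoXLModel

config = TransfoXLConfig(vocab_size_or_config_json_file=1000,
                         cutoffs=[10, 50, 80], d_model=32, d_embed=32,
                         n_head=4, d_head=8, d_inner=128, div_val=2,
                         n_layer=2, mem_len=30, clamp_len=15)
model = TransfoXLModel(config)
model.eval()

segment_1 = torch.randint(0, 1000, (3, 10), dtype=torch.long)   # [batch_size, seq_length]
segment_2 = torch.randint(0, 1000, (3, 10), dtype=torch.long)

hidden_1, mems = model(segment_1)        # mems: one [mem_len, batch_size, d_model] tensor per layer
hidden_2, mems = model(segment_2, mems)  # cached states from segment 1 extend the context of segment 2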
@@ -32,10 +32,10 @@ class OptimizationTest(unittest.TestCase):
     def test_adam(self):
         w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
         target = torch.tensor([0.4, 0.2, -0.5])
-        criterion = torch.nn.MSELoss(reduction='elementwise_mean')
+        criterion = torch.nn.MSELoss()
         # No warmup, constant schedule, no gradient clipping
         optimizer = BertAdam(params=[w], lr=2e-1,
-                             weight_decay_rate=0.0,
+                             weight_decay=0.0,
                              max_grad_norm=-1)
         for _ in range(100):
             loss = criterion(w, target)
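The hunk above cuts off inside the training loop, so here is a self-contained sketch of the same toy fit using the renamed weight_decay argument. The backward()/step()/zero_grad() calls past the cut-off are an assumption for illustration, not lines from the diff, and BertAdam is assumed importable from pytorch_pretrained_bert as in the test.

# Sketch only: the loop body past the hunk cut-off is assumed, not taken from the diff.
import torch
from pytorch_pretrained_bert import BertAdam

w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
target = torch.tensor([0.4, 0.2, -0.5])
criterion = torch.nn.MSELoss()

# No warmup, constant schedule, no gradient clipping; note weight_decay_rate -> weight_decay.
optimizer = BertAdam(params=[w], lr=2e-1, weight_decay=0.0, max_grad_norm=-1)
for _ in range(100):
    loss = criterion(w, target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
# After the loop, w should have moved close to target.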
tests/tokenization_openai_test.py  (new file, 56 lines)
@@ -0,0 +1,56 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest
import json

from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer


class OpenAIGPTTokenizationTest(unittest.TestCase):

    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "w</w>", "r</w>", "t</w>",
                 "lo", "low", "er</w>",
                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
            json.dump(vocab_tokens, fp)
            vocab_file = fp.name
        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
            fp.write("\n".join(merges))
            merges_file = fp.name

        tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>"])
        os.remove(vocab_file)
        os.remove(merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

if __name__ == '__main__':
    unittest.main()
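As a companion to the test above, here is a minimal sketch (not the library's implementation) of how a ranked merge list drives BPE, reproducing the expected "lower" -> ["low", "er</w>"] split. The bpe helper below is purely illustrative.

# Sketch only: a toy BPE merge loop using the merges from the test above.
merges = [("l", "o"), ("lo", "w"), ("e", "r</w>")]           # rank = position in the merges file
ranks = {pair: i for i, pair in enumerate(merges)}

def bpe(word):
    symbols = list(word[:-1]) + [word[-1] + "</w>"]          # mark the end-of-word symbol
    while len(symbols) > 1:
        pairs = [(symbols[i], symbols[i + 1]) for i in range(len(symbols) - 1)]
        best = min(pairs, key=lambda p: ranks.get(p, float("inf")))
        if best not in ranks:
            break                                            # no applicable merge left
        merged, i = [], 0
        while i < len(symbols):
            if i < len(symbols) - 1 and (symbols[i], symbols[i + 1]) == best:
                merged.append(symbols[i] + symbols[i + 1])   # apply the lowest-ranked merge
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        symbols = merged
    return symbols

print(bpe("lower"))  # ['low', 'er</w>']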
@@ -12,15 +12,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 import unittest
+from io import open

-from pytorch_pretrained_bert.tokenization import (BertTokenizer, BasicTokenizer, WordpieceTokenizer,
-                                                  _is_whitespace, _is_control, _is_punctuation)
+from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
+                                                  BertTokenizer,
+                                                  WordpieceTokenizer,
+                                                  _is_control, _is_punctuation,
+                                                  _is_whitespace)


 class TokenizationTest(unittest.TestCase):
@@ -30,7 +32,7 @@ class TokenizationTest(unittest.TestCase):
             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
             "##ing", ","
         ]
-        with open("/tmp/bert_tokenizer_test.txt", "w") as vocab_writer:
+        with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

             vocab_file = vocab_writer.name
@@ -44,12 +46,30 @@ class TokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])

+    def test_full_tokenizer_raises_error_for_long_sequences(self):
+        vocab_tokens = [
+            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
+            "##ing", ","
+        ]
+        with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+            vocab_file = vocab_writer.name
+
+        tokenizer = BertTokenizer(vocab_file, max_len=10)
+        os.remove(vocab_file)
+        tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time")
+        indices = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(indices, [0 for _ in range(10)])
+
+        tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time .")
+        self.assertRaises(ValueError, tokenizer.convert_tokens_to_ids, tokens)
+
     def test_chinese(self):
         tokenizer = BasicTokenizer()

         self.assertListEqual(
             tokenizer.tokenize(u"ah\u535A\u63A8zz"),
-            [u"ah", u"\u535A", u"\u63A8", u"zz"])
+            [u"ah", u"\u535A", u"\u63A8", u"zz"])

     def test_basic_tokenizer_lower(self):
         tokenizer = BasicTokenizer(do_lower_case=True)
tests/tokenization_transfo_xl_test.py  (new file, 90 lines)
@@ -0,0 +1,90 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest
from io import open

from pytorch_pretrained_bert.tokenization_transfo_xl import (TransfoXLTokenizer,
                                                             _is_control, _is_punctuation,
                                                             _is_whitespace)


class TransfoXLTokenizationTest(unittest.TestCase):

    def test_full_tokenizer(self):
        vocab_tokens = [
            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", "running", ","
        ]
        with open("/tmp/transfo_xl_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
            vocab_file = vocab_writer.name

        tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
        tokenizer.build_vocab()
        os.remove(vocab_file)

        tokens = tokenizer.tokenize(u"<unk> UNwant\u00E9d,running")
        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])

    def test_full_tokenizer_lower(self):
        tokenizer = TransfoXLTokenizer(lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
            ["hello", "!", "how", "are", "you", "?"])
        self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])

    def test_full_tokenizer_no_lower(self):
        tokenizer = TransfoXLTokenizer(lower_case=False)

        self.assertListEqual(
            tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
            ["HeLLo", "!", "how", "Are", "yoU", "?"])

    def test_is_whitespace(self):
        self.assertTrue(_is_whitespace(u" "))
        self.assertTrue(_is_whitespace(u"\t"))
        self.assertTrue(_is_whitespace(u"\r"))
        self.assertTrue(_is_whitespace(u"\n"))
        self.assertTrue(_is_whitespace(u"\u00A0"))

        self.assertFalse(_is_whitespace(u"A"))
        self.assertFalse(_is_whitespace(u"-"))

    def test_is_control(self):
        self.assertTrue(_is_control(u"\u0005"))

        self.assertFalse(_is_control(u"A"))
        self.assertFalse(_is_control(u" "))
        self.assertFalse(_is_control(u"\t"))
        self.assertFalse(_is_control(u"\r"))

    def test_is_punctuation(self):
        self.assertTrue(_is_punctuation(u"-"))
        self.assertTrue(_is_punctuation(u"$"))
        self.assertTrue(_is_punctuation(u"`"))
        self.assertTrue(_is_punctuation(u"."))

        self.assertFalse(_is_punctuation(u"A"))
        self.assertFalse(_is_punctuation(u" "))


if __name__ == '__main__':
    unittest.main()