Compare commits

...

292 Commits

Author SHA1 Message Date
0856a231c0 Merge pull request #287 from huggingface/gpt2
Gpt2
2019-02-18 11:38:05 +01:00
ab7f5d2943 simple 2019-02-18 11:33:54 +01:00
b450a7faf2 clean up tokenization - fix python 2 tests 2019-02-18 11:27:18 +01:00
d44db1145c update readme 2019-02-18 11:12:09 +01:00
690a0dbf36 fix example - masking 2019-02-18 10:50:30 +01:00
fbb248a2e4 examples testing 2019-02-18 01:28:18 +01:00
5ff0c60505 language update 2019-02-18 00:55:47 +01:00
210d407245 updating init 2019-02-18 00:55:39 +01:00
b65f07d8c0 adding examples 2019-02-18 00:55:33 +01:00
009ee86a19 fix tests - bump up version 2019-02-17 23:57:23 +01:00
ffd623823d adding gpt2 2019-02-17 23:38:51 +01:00
3a2f97db6f Merge pull request #286 from hendrycks/patch-1
Update activation function docstring
2019-02-17 15:30:46 +01:00
434d15da8e Update activation function docstring 2019-02-16 12:17:52 -08:00
5faf386652 Merge pull request #282 from wlhgtc/master
Fix some bug about SQuAD code
2019-02-15 10:06:51 +01:00
8efaf8f176 fix 'best_non_null_entry' is None error 2019-02-15 15:57:25 +08:00
0e774e57a6 Update readme
Adding details on how to extract a full list of hidden states for the Transformer-XL
2019-02-14 08:39:58 +01:00
c35d9d48d9 Merge pull request #275 from davidefiocco/patch-1
--do_lower_case is duplicated in parser args
2019-02-13 16:32:21 +01:00
65df0d78ed --do_lower_case is duplicated in parser args
Deleting one repetition (please review!)
2019-02-13 15:30:05 +01:00
4e56da38d9 Merge pull request #268 from wangxiaodiu/master
fixed a minor bug in README.md
2019-02-13 10:19:25 +01:00
cdcb206e10 Merge pull request #273 from huggingface/update_to_fifth_release
Update to fifth release
2019-02-13 10:19:08 +01:00
321d70a7a9 bump up to 0.5.1 2019-02-13 10:11:20 +01:00
67376c02e2 update readme for tokenizers 2019-02-13 10:11:11 +01:00
c6bea08448 OpenAI GPT Tokenizer can fallback on using BERT BasicTokenizer 2019-02-13 10:11:00 +01:00
e7cfc46fc1 fix TransfoXLModel loading 2019-02-13 09:32:46 +01:00
e1b3cfb504 fixed a minor bug in README.md 2019-02-12 15:54:23 +04:00
3c33499f87 fix typo in readme 2019-02-12 10:22:54 +01:00
03cdb2a390 Merge pull request #254 from huggingface/python_2
Adding OpenAI GPT and Transformer-XL models, compatibility with Python 2
2019-02-11 14:19:26 +01:00
1e71f11dec Release: 0.5.0 2019-02-11 14:16:27 +01:00
d38caba169 typo in run_squad 2019-02-11 14:10:27 +01:00
af62cc5f20 fix run_squad example 2019-02-11 14:06:32 +01:00
eebc8abbe2 clarify and unify model saving logic in examples 2019-02-11 14:04:19 +01:00
81c7e3ec9f fix typo in readme 2019-02-11 13:37:12 +01:00
e8fe6b7140 adapting transfo tokenizer to transposed inputs 2019-02-11 13:30:04 +01:00
884ca81d87 transposing the inputs of Transformer-XL to have a unified interface 2019-02-11 13:19:59 +01:00
32fea876bb add distant debugging to run_transfo_xl 2019-02-11 12:53:32 +01:00
b31ba23913 cuda on in the examples by default 2019-02-11 12:15:43 +01:00
0a9860daa7 tests pass on python 2 and 3 2019-02-11 10:47:52 +01:00
2071a9b86e fix python 2.7 imports 2019-02-11 10:35:36 +01:00
8197eb9f10 update Circle CI config 2019-02-11 10:22:10 +01:00
525eba68ab update Circle CI 2019-02-11 10:19:25 +01:00
b514a60c36 added tests for OpenAI GPT and Transformer-XL tokenizers 2019-02-11 10:17:16 +01:00
9bdcba53fd fix tests 2019-02-09 17:07:12 +01:00
f0bf81e141 back compatibility with Path inputs in fle_utils 2019-02-09 17:05:23 +01:00
9f9909ea2f update readme 2019-02-09 16:59:21 +01:00
6cd769957e update transfo xl example 2019-02-09 16:59:17 +01:00
1320e4ec0c mc_token_mask => mc_token_ids 2019-02-09 16:58:53 +01:00
f4a07a392c mems not splitted 2019-02-09 16:14:31 +01:00
43b9af0cac mems initialized to None in run_transfo 2019-02-09 16:12:19 +01:00
cfcb95417c fix hasattr 2019-02-08 23:08:53 +01:00
0c1a6f9b1d update readme 2019-02-08 22:32:25 +01:00
1756b5e956 fix loading from Transfo-XL LM model 2019-02-08 22:32:17 +01:00
dadd0c1b13 updating __main__ 2019-02-08 22:31:57 +01:00
102c6b238c adding file cache to __init__ 2019-02-08 22:31:46 +01:00
b80684b23f fixing run openai gpt example 2019-02-08 22:31:32 +01:00
80607874c1 fix layer norm epsilon in OpenAI GPT 2019-02-08 21:49:05 +01:00
7b4b0cf966 logging 2019-02-08 11:16:29 +01:00
4bbb9f2d68 log loss - helpers 2019-02-08 11:14:29 +01:00
5d7e845712 fix model on cuda 2019-02-08 11:08:43 +01:00
eccb2f0163 hot fix 2019-02-08 11:05:20 +01:00
5adc20723b add distant debugging 2019-02-08 11:03:59 +01:00
5ee4f17234 adding option to load on cpu 2019-02-08 10:37:40 +01:00
2dfaf2f227 Merge pull request #261 from deepset-ai/rm_arg_lm_finetuning
removing unused argument eval_batch_size from LM finetuning #256
2019-02-08 10:36:03 +01:00
777459b471 run openai example running 2019-02-08 10:33:14 +01:00
edcb56fd96 more explicit variable name 2019-02-08 09:54:49 +01:00
6bc082da0a updating examples 2019-02-08 00:02:26 +01:00
eb8fda51f4 update docstrings 2019-02-07 23:15:20 +01:00
e77721e4fe renamed examples 2019-02-07 23:15:15 +01:00
009b581316 updated readme 2019-02-07 23:15:05 +01:00
f99f2fb661 docstrings 2019-02-07 17:07:22 +01:00
438db43d46 update adaptive softmax head 2019-02-07 17:07:15 +01:00
c306869ea2 add two transformer xl models 2019-02-07 17:07:03 +01:00
d482e3d79d adding examples for openai and transformer-xl 2019-02-07 17:06:41 +01:00
9c3c24800b split saved model in config & weights 2019-02-07 17:06:17 +01:00
2df41663f1 added test 2019-02-07 17:05:49 +01:00
9aebc711c9 adjust error message related to args.do_eval 2019-02-07 11:49:38 +01:00
4a450b25d5 removing unused argument eval_batch_size from LM finetuning #256 2019-02-07 10:06:38 +01:00
58f0a2745c Merge pull request #258 from BoeingX/master
Fix the undefined variable in squad example
2019-02-06 20:33:18 +01:00
7ac3311e48 Fix the undefined variable in squad example 2019-02-06 19:36:08 +01:00
ed47cb6cba fixing transfo eval script 2019-02-06 16:22:17 +01:00
973926431e fix differencies with tensorflow version (mem cells and adaptive sofmax clusters) 2019-02-06 15:42:29 +01:00
ba9e4eb354 fix unicode in tokenization tests 2019-02-06 00:28:00 +01:00
34bdb7f9cb update circle-ci for python 2.7 and 3.5 2019-02-06 00:25:12 +01:00
848aae49e1 Merge branch 'master' into python_2 2019-02-06 00:13:20 +01:00
448937c00d python 2 compatibility 2019-02-06 00:07:46 +01:00
ba37ddc5ce fix run_lm_modeling example command line 2019-02-06 00:07:08 +01:00
822915142b fix docstring 2019-02-05 16:34:32 +01:00
bd74632687 Merge pull request #251 from Iwontbecreative/active_loss_tok_classif
Only keep the active part mof the loss for token classification
2019-02-05 16:33:45 +01:00
fd223374f0 Merge pull request #208 from Liangtaiwan/mergesquad
Merge run_squad.py and run_squad2.py
2019-02-05 16:15:03 +01:00
d609ba24cb resolving merge conflicts 2019-02-05 16:14:25 +01:00
bde1eeebe0 rename 2019-02-05 16:11:22 +01:00
3ea3b00e59 merge squad example in single example 2019-02-05 16:10:27 +01:00
d8e3bdbb4c moved up to current master 2019-02-05 16:09:39 +01:00
64ce900974 Merge pull request #248 from JoeDumoulin/squad1.1-fix
fix prediction on run-squad.py example
2019-02-05 16:00:51 +01:00
0ad9b239a1 gitignore 2019-02-05 15:43:11 +01:00
e9e77cd3c4 Merge pull request #218 from matej-svejda/master
Fix learning rate problems in run_classifier.py
2019-02-05 15:40:44 +01:00
1579c53635 more explicit notation: num_train_step => num_train_optimization_steps 2019-02-05 15:36:33 +01:00
f3bda2352a Only keep the active part mof the loss for token classification 2019-02-04 11:46:36 -05:00
6179f537a3 clean up tokenization spaces 2019-02-04 17:41:22 +01:00
850da1cc36 strip decoded outputs 2019-02-04 17:35:05 +01:00
01a3966bc6 more options on special tokens 2019-02-04 17:26:25 +01:00
05f961840b logging 2019-02-04 13:06:19 +01:00
aa90e0c36a fix prediction on run-squad.py example 2019-02-01 10:15:44 -08:00
8f8bbd4a4c Merge pull request #244 from deepset-ai/prettify_lm_masking
Avoid confusion of inplace LM masking
2019-02-01 12:17:50 +01:00
e2d53d95b0 Merge pull request #242 from ksurya/argparse
Fix argparse type error
2019-02-01 12:14:55 +01:00
7e0b415ab4 Merge pull request #240 from girishponkiya/patch-1
Minor update in README
2019-02-01 12:14:05 +01:00
ce75b169bd avoid confusion of inplace masking of tokens_a / tokens_b 2019-01-31 11:42:06 +01:00
9bf528877e Update run_squad.py 2019-01-30 15:09:31 -05:00
af2b78601b Update run_squad2.py 2019-01-30 15:08:56 -05:00
0dd2b750ca Minor update in README
Update links to classes in `modeling.py`
2019-01-30 23:49:15 +05:30
5169069997 make examples consistent, revert error in num_train_steps calculation 2019-01-30 11:47:25 +01:00
3a848111e6 update config, docstrings and readme to switch to seperated tokens and position embeddings 2019-01-29 11:00:11 +01:00
98c96fb1a7 splitting position and tokens embeddings in OpenAI GPT - updating tf imports - tests 2019-01-29 10:31:42 +01:00
5456d82311 more versatile model loading 2019-01-29 09:54:18 +01:00
9b2540b5a7 update __init__ 2019-01-29 09:54:08 +01:00
bd3b3aee9c update 2019-01-28 17:47:29 +01:00
a45a9cc0e1 update tests 2019-01-28 17:16:02 +01:00
b12616fd8e updating code organization to fix imports 2019-01-28 17:03:39 +01:00
d77dd62ff8 directly load from TF checkpoints + code cleanup 2019-01-28 16:50:23 +01:00
9c6a48c8c3 fix learning rate/fp16 and warmup problem for all examples 2019-01-27 14:07:24 +01:00
01ff4f82ba learning rate problems in run_classifier.py 2019-01-22 23:40:06 +01:00
4eb2a49d41 Merge run_squad.py and run_squad2.py 2019-01-19 10:18:10 +08:00
0a9d7c7edb Merge pull request #201 from Liangtaiwan/squad2_save_bug
run_squad2 Don't save model if do not train
2019-01-18 09:28:11 +01:00
be9fa192f0 don't save if do not train 2019-01-18 00:41:55 +08:00
9c35c132fa apex LayerNorm 2019-01-17 09:19:19 +01:00
b9c77b98d5 fix transposition in model conversion and memory initialization 2019-01-17 00:33:21 +01:00
f040a43cb3 Merge pull request #199 from davidefiocco/patch-1
(very) minor update to README
2019-01-16 23:51:52 +01:00
35115eaf93 (very) minor update to README 2019-01-16 21:05:24 +01:00
009101de12 fix loading bug and check full conversion of model 2019-01-16 12:16:20 +01:00
fea15cc9f5 update model conversion 2019-01-16 11:54:54 +01:00
a28dfc8659 fix eval for wt103 2019-01-16 11:18:19 +01:00
c03c12687f fix __main__ entry script 2019-01-16 10:55:22 +01:00
8831c68803 fixing various parts of model conversion, loading and weights sharing 2019-01-16 10:31:16 +01:00
bcd4aa8fe0 update evaluation example 2019-01-15 23:32:34 +01:00
a69ec2c722 improved corpus and tokenization conversion - added evaluation script 2019-01-15 23:17:46 +01:00
7d03c53718 conversion working 2019-01-15 16:07:25 +01:00
3a9c88377f adding Transformer XL 2019-01-15 12:59:38 +01:00
647c983530 Merge pull request #193 from nhatchan/20190113_global_step
Fix importing unofficial TF models
2019-01-14 09:44:01 +01:00
4e0cba1053 Merge pull request #191 from nhatchan/20190113_py35_finetune
lm_finetuning compatibility with Python 3.5
2019-01-14 09:40:07 +01:00
c94455651e Merge pull request #190 from nhatchan/20190113_finetune_doc
Fix documentation (missing backslashes)
2019-01-14 09:39:03 +01:00
25eae7b0ae Merge pull request #189 from donglixp/patch-1
[bug fix] args.do_lower_case is always True
2019-01-14 09:38:37 +01:00
cd30565aed Fix importing unofficial TF models
Importing unofficial TF models seems to be working well, at least for me.
This PR resolves #50.
2019-01-14 13:35:40 +09:00
8edc898f63 Fix documentation (missing backslashes)
This PR adds missing backslashes in LM Fine-tuning subsection in README.md.
2019-01-13 21:23:19 +09:00
6c65cb2492 lm_finetuning compatibility with Python 3.5
dicts are not ordered in Python 3.5 or prior, which is a cause of #175.
This PR replaces one with a list, to keep its order.
2019-01-13 21:09:13 +09:00
a2da2b4109 [bug fix] args.do_lower_case is always True
The "default=True" makes args.do_lower_case always True.

```python
parser.add_argument("--do_lower_case",
                        default=True,
                        action='store_true')
```
2019-01-13 19:51:11 +08:00
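For contrast, a minimal sketch of the corrected flag definition (in line with the later "remove default when action is store_true in arguments" commit in this list): with `store_true` and no explicit default, argparse leaves the flag False unless it is passed.

```python
import argparse

parser = argparse.ArgumentParser()
# store_true already implies default=False, so the flag is only True when passed explicitly.
parser.add_argument("--do_lower_case",
                    action='store_true',
                    help="Set this flag if you are using an uncased model.")

args = parser.parse_args([])            # no command-line flags given
assert args.do_lower_case is False
```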
35becc6d84 Merge pull request #182 from deepset-ai/fix_lowercase_and_saving
add do_lower_case arg and adjust model saving for lm finetuning.
2019-01-11 08:50:13 +01:00
506e5bb0c8 add do_lower_case arg and adjust model saving for lm finetuning. 2019-01-11 08:32:46 +01:00
e485829a41 Merge pull request #174 from abeljim/master
Added Squad 2.0
2019-01-10 23:40:45 +01:00
7e60205bd3 Merge pull request #179 from likejazz/patch-2
Fix it to run properly even if without `--do_train` param.
2019-01-10 23:39:10 +01:00
64326dccfb Fix it to run properly even if without --do_train param.
It was modified similarly to `run_classifier.py`, and fixed to run properly even without the `--do_train` param.
2019-01-10 21:51:39 +09:00
e5c78c6684 update readme and few typos 2019-01-10 01:40:00 +01:00
fa5222c296 update readme 2019-01-10 01:25:28 +01:00
0dd5f55ac8 Merge pull request #172 from WrRan/never_split
Never split some texts.
2019-01-09 13:44:09 +01:00
b3628f117e Added Squad 2.0 2019-01-08 15:13:13 -08:00
ab90d4cddd adding docs and example for OpenAI GPT 2019-01-09 00:12:43 +01:00
dc5df92fa8 added LM head for OpenAI 2019-01-08 17:18:47 +01:00
3cf12b235a added tests + fixed losses 2019-01-08 16:24:23 +01:00
eed51c5bdf add OpenAI GPT 2019-01-08 12:26:58 +01:00
3f60a60eed text in never_split should not lowercase 2019-01-08 13:33:57 +08:00
751beb9e73 never split some text 2019-01-08 10:54:51 +08:00
793dcd236b Merge branch 'master' of https://github.com/huggingface/pytorch-pretrained-BERT into fifth-release 2019-01-07 13:37:55 +01:00
2e4db64cab add do_lower_case tokenizer loading optino in run_squad and ine_tuning examples 2019-01-07 13:06:42 +01:00
c9fd350567 remove default when action is store_true in arguments 2019-01-07 13:01:54 +01:00
93f563b8a8 adding OpenAI GPT 2019-01-07 12:55:36 +01:00
e048c7f1c8 Merge pull request #171 from donglixp/patch-1
LayerNorm initialization
2019-01-07 12:44:46 +01:00
d3d56f9a0b Merge pull request #166 from likejazz/patch-1
Fix error when `bert_model` param is path or url.
2019-01-07 12:40:55 +01:00
766c6b2ce3 Merge pull request #159 from jaderabbit/master
Allow do_eval to be used without do_train and to use the pretrained model in the output folder
2019-01-07 12:31:06 +01:00
77966a43a4 Merge pull request #156 from rodgzilla/cl_args_doc
Adding new pretrained model to the help of the `bert_model` argument.
2019-01-07 12:27:16 +01:00
bcd607542c Merge pull request #145 from wlhgtc/master
Correct the  wrong note
2019-01-07 12:23:05 +01:00
2e8c5c00ec Merge pull request #141 from SinghJasdeep/patch-1
loading saved model when n_classes != 2
2019-01-07 12:21:13 +01:00
2860377021 Merge pull request #134 from rodgzilla/update_doc_pretrained_models
Fixing various class documentations.
2019-01-07 12:06:06 +01:00
c18bdb4433 Merge pull request #124 from deepset-ai/master
Add example for fine tuning BERT language model
2019-01-07 12:03:51 +01:00
d0d9b384f2 LayerNorm initialization
The LayerNorm gamma and beta should be initialized by .fill_(1.0) and .zero_().

reference links:

989e78c412/tensorflow/contrib/layers/python/layers/layers.py (L2298)

989e78c412/tensorflow/contrib/layers/python/layers/layers.py (L2308)
2019-01-07 15:51:33 +08:00
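A self-contained PyTorch sketch of that initialization (using a hypothetical minimal layer-norm module with TF-style `gamma`/`beta` parameter names, not the repository's actual class):

```python
import torch
import torch.nn as nn

class SimpleLayerNorm(nn.Module):
    """Hypothetical layer norm with TF-style gamma (scale) and beta (shift) parameters."""
    def __init__(self, hidden_size, eps=1e-12):
        super(SimpleLayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(hidden_size))
        self.beta = nn.Parameter(torch.zeros(hidden_size))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        var = (x - mean).pow(2).mean(-1, keepdim=True)
        return self.gamma * (x - mean) / torch.sqrt(var + self.eps) + self.beta

def init_weights(module):
    # The initialization described above: scale -> 1.0, shift -> 0.0.
    if isinstance(module, SimpleLayerNorm):
        module.gamma.data.fill_(1.0)
        module.beta.data.zero_()

model = SimpleLayerNorm(768)
model.apply(init_weights)
```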
ca4e7aaa72 Fix error when bert_model param is path or url.
An error occurs when the `bert_model` param is a path or URL. Therefore, if it is a path, specify only the last path component to prevent the error.
2019-01-05 11:42:54 +09:00
193e2df8ba Remove rogue comment 2019-01-03 13:13:06 +02:00
c64de50ea4 nb_tr_steps is not initialized 2019-01-03 12:34:57 +02:00
b96149a19b Training loss is not initialized if only do_eval is specified 2019-01-03 10:32:10 +02:00
be3b9bcf4d Allow one to use the pretrained model in evaluation when do_train is not selected 2019-01-03 09:02:33 +02:00
186f75342e Adding new pretrained model to the help of the bert_model argument. 2019-01-02 14:00:59 +01:00
e626eecc25 Update modeling.py 2018-12-22 20:26:05 +08:00
99709ee61d loading saved model when n_classes != 2
Required to avoid: Assertion `t >= 0 && t < n_classes` failed, if your default number of classes is not 2.
2018-12-20 13:55:47 -08:00
8da280ebbe Setup CI 2018-12-20 16:33:39 -05:00
e5fc98c542 add exemplary training data. update to nvidia apex. refactor 'item -> line in doc' mapping. add warning for unknown word. 2018-12-20 18:30:52 +01:00
7176674849 Fixing various class documentations. 2018-12-20 13:11:17 +01:00
7fb94ab934 Merge pull request #127 from patrick-s-h-lewis/tokenizer-error-on-long-seqs
raises value error for bert tokenizer for long sequences
2018-12-19 10:29:17 +01:00
2feb29c0ff Merge pull request #130 from sodre/use-entry-points
Use entry-points instead of scripts
2018-12-19 10:18:24 +01:00
2c9991496b Merge pull request #128 from sodre/add-license
Add license to source distribution
2018-12-19 10:15:53 +01:00
17595ef2de Merge branch 'master' of https://github.com/deepset-ai/pytorch-pretrained-BERT 2018-12-19 09:22:53 +01:00
67f4dd56a3 update readme for run_lm_finetuning 2018-12-19 09:22:37 +01:00
ecf3ea197e Remove original script 2018-12-19 02:26:08 +00:00
87c1244c7d Convert scripts into entry_points
The recommended approach to create launch scripts is to use entry_points
and console_scripts.

xref: https://packaging.python.org/guides/distributing-packages-using-setuptools/#scripts
2018-12-19 02:26:08 +00:00
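An illustrative setup.py excerpt showing the `console_scripts` pattern referenced above (the `pytorch_pretrained_bert.__main__:main` target is an assumption based on the deleted `python -m pytorch_pretrained_bert` wrapper script further down in this diff, not taken from the project's actual setup.py):

```python
# setup.py (illustrative excerpt only)
from setuptools import setup, find_packages

setup(
    name="pytorch-pretrained-bert",
    packages=find_packages(),
    entry_points={
        "console_scripts": [
            # Installs a `pytorch_pretrained_bert` command; the module:function
            # target below is assumed for illustration.
            "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
        ]
    },
)
```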
b3d86162b0 Add license to source distribution 2018-12-19 01:41:18 +00:00
d57763f582 Fix typos 2018-12-18 19:23:22 -05:00
78cf7b4ab4 added code to raise value error for bert tokenizer for covert_tokens_to_indices 2018-12-18 14:41:30 +00:00
a58361f197 Add example for fine tuning BERT language model (#1)
Adds an example for loading a pre-trained BERT model and fine-tuning it as a language model (masked tokens & nextSentence) on your target corpus.
2018-12-18 10:32:25 +01:00
786cc41299 Typos in readme 2018-12-17 09:22:18 +01:00
ecc0b54bec Merge pull request #119 from danyaljj/patch-1
Minor README fix
2018-12-14 23:29:47 +01:00
8b1b93947f Minor fix. 2018-12-14 14:10:36 -05:00
8809eb6c93 update readme with information on NVIDIA's apex 2018-12-14 16:59:39 +01:00
e1bfad4846 Merge pull request #112 from huggingface/fourth-release
Fourth release
2018-12-14 15:15:47 +01:00
d821358884 update readme 2018-12-14 15:15:17 +01:00
37378898a2 adding DockerFile 2018-12-14 15:02:32 +01:00
4a4b0e5783 remove logging. basicConfig from library code 2018-12-14 14:46:25 +01:00
ae88eb88a4 set encoding to 'utf-8' in calls to open 2018-12-14 13:48:58 +01:00
e1eab59aac no fp16 on evaluation 2018-12-13 14:54:02 +01:00
087798b7fa fix reloading model for evaluation in examples 2018-12-13 14:48:12 +01:00
0f544625f4 fix swag example for work with apex 2018-12-13 13:35:59 +01:00
0cf88ff084 make examples work without apex 2018-12-13 13:28:00 +01:00
52c53f39d0 clean up apex integration 2018-12-13 13:02:17 +01:00
4946c2c500 run_swag example in readme 2018-12-13 13:02:07 +01:00
d23eed85bb model loading apex modification 2018-12-13 12:53:17 +01:00
1cbb32a542 include version number + comment in setup.py 2018-12-13 12:50:44 +01:00
ce52177638 added version in __init__.py 2018-12-13 12:50:44 +01:00
d3fcec1a3e add saving and loading model in examples 2018-12-13 12:50:44 +01:00
93f335ef86 add pretrained loading from state_dict 2018-12-13 12:48:13 +01:00
b3caec5a56 adding save checkpoint and loading in examples 2018-12-13 12:48:13 +01:00
85fff78c2d compatibility PT 1.0 and 0.4.1 2018-12-13 12:48:13 +01:00
13bf0d4659 fixing Adam weights skip in TF convert script 2018-12-13 12:48:13 +01:00
91aab2a6d3 Merge pull request #116 from FDecaYed/deyuf/fp16_with_apex
Change to use apex for better fp16 and multi-gpu support
2018-12-13 12:32:37 +01:00
32a227f507 Merge pull request #113 from hzhwcmhf/master
fix compatibility with python 3.5.2
2018-12-13 12:15:15 +01:00
ffe9075f48 Merge pull request #96 from rodgzilla/multiple-choice-code
BertForMultipleChoice and Swag dataset example.
2018-12-13 12:05:11 +01:00
3b0a14b761 add fallback path for apex used in modeling.py 2018-12-12 15:05:45 -08:00
dcb50eaa4b Swag example readme section update with gradient accumulation run. 2018-12-12 18:17:46 +01:00
c8ea286048 change to apex for better fp16 and multi-gpu support 2018-12-11 17:13:58 -08:00
485adde742 add pathlib support for file_utils.py on python 3.5 2018-12-11 22:49:19 +08:00
bc659f86ad fix compatibility with python 3.5.2; convert path to str 2018-12-11 20:18:56 +08:00
1df6f26214 Merge branch 'fourth-release' of https://github.com/huggingface/pytorch-pretrained-BERT into fourth-release 2018-12-11 12:20:31 +01:00
770f805ae5 include version number + comment in setup.py 2018-12-11 12:20:22 +01:00
ed3b62cd3b added version in __init__.py 2018-12-11 12:12:08 +01:00
632f2d2df9 Merge branch 'master' into fourth-release 2018-12-11 06:00:53 -05:00
b13abfa9fe add saving and loading model in examples 2018-12-11 11:58:07 +01:00
270fa2f20b add pretrained loading from state_dict 2018-12-11 11:50:38 +01:00
a3a3180c86 Bump up requirements to Python 3.6 2018-12-11 11:29:45 +01:00
e7c0a8ddce Merge pull request #107 from lliimsft/master
Fix optimizer to work with horovod
2018-12-11 05:18:00 -05:00
e622790a93 Merge pull request #91 from rodgzilla/convert-examples-code-improvement
run_classifier.py improvements
2018-12-11 05:12:04 -05:00
df34f22854 Removing the dependency to pandas and using the csv module to load data. 2018-12-10 17:45:23 +01:00
0876b77f7f Change to the README file to add SWAG results. 2018-12-10 15:34:19 +01:00
81e1e2489f Fix optimizer to work with horovod 2018-12-10 02:08:38 -08:00
174cdbccde adding save checkpoint and loading in examples 2018-12-09 17:04:23 -05:00
1db916b5be compatibility PT 1.0 and 0.4.1 2018-12-09 16:57:51 -05:00
68f77303b2 fixing Adam weights skip in TF convert script 2018-12-09 16:17:11 -05:00
a2b6918a11 Merge pull request #101 from davidefiocco/patch-1
Adding --do_lower_case for all uncased BERTs examples
2018-12-09 15:29:31 -05:00
5c858448d3 Merge pull request #94 from rodgzilla/fixing-squad-commentary
Fixing the commentary of the `SquadExample` class.
2018-12-09 15:27:30 -05:00
c9f67e037c Adding --do_lower_case for all uncased BERTs
I had missed those, it should make sense to use them
2018-12-07 20:40:56 +01:00
150f3cd9fa Few typos in README.md 2018-12-06 19:22:07 +01:00
d429c15f25 Removing old code from copy-paste. 2018-12-06 19:19:21 +01:00
4fa7892d64 Wrong line number link to modeling file. 2018-12-06 19:18:29 +01:00
6a26e19ea3 Updating README.md with SWAG example informations. 2018-12-06 19:15:08 +01:00
63c45056aa Finishing the code for the Swag task. 2018-12-06 18:53:05 +01:00
fc5a38ac92 Adding the BertForMultipleChoiceClass. 2018-12-06 18:42:23 +01:00
c45d8ac554 Storing the feature of each choice as a dict for readability. 2018-12-06 16:01:28 +01:00
0812aee2c3 Fixing problems in convert_examples_to_features. 2018-12-06 15:53:07 +01:00
f2b873e995 convert_examples_to_features code and small improvements. 2018-12-06 15:40:47 +01:00
83fdbd6043 Adding read_swag_examples to load the dataset. 2018-12-06 14:02:46 +01:00
7183cded4e SwagExample class. 2018-12-06 13:39:44 +01:00
fa7daa247d Fixing the commentary of the SquadExample class. 2018-12-06 13:14:33 +01:00
a994bf4076 Fixing related to issue #83. 2018-12-05 18:16:30 +01:00
c6d9d5394e Simplifying code for easier understanding. 2018-12-05 17:53:09 +01:00
793262e8ec Removing trailing whitespaces. 2018-12-05 17:52:39 +01:00
3ba5470eb8 Merge pull request #87 from rodgzilla/readme-file-links
Readme file links
2018-12-05 10:41:05 -05:00
0a7c8bdcac Fixing badly formatted links. 2018-12-04 13:43:56 +01:00
3113e967db Adding links to examples files. 2018-12-04 13:40:38 +01:00
04826b0f2c Merge pull request #77 from davidefiocco/patch-1
Correct assignement for logits in classifier example
2018-12-02 13:01:04 +01:00
e60e8a6068 Correct assignement for logits in classifier example
I tried to address https://github.com/huggingface/pytorch-pretrained-BERT/issues/76
should be correct, but there's likely a more efficient way.
2018-12-02 12:38:26 +01:00
063be09b71 Merge pull request #75 from davidefiocco/patch-2
Point typo fix
2018-12-01 01:15:43 +01:00
4450f5ef6b Merge pull request #74 from davidefiocco/patch-1
Update finetuning example in README adding --do_lower_case
2018-12-01 01:15:31 +01:00
dc13e276ee Point typo fix 2018-12-01 01:02:16 +01:00
8a8aa59d8c Update finetuning example adding --do_lower_case
Should be consistent with the fact that an uncased model is used
2018-12-01 01:00:05 +01:00
836b40be82 Merge pull request #72 from NirantK/patch-1
Fix internal hyperlink typo
2018-11-30 23:33:53 +01:00
66d50ca6ae Merge pull request #73 from huggingface/third-release
Third release
2018-11-30 23:10:30 +01:00
f9f3bdd60b update readme 2018-11-30 23:05:18 +01:00
52ff0590ff tup => tpu 2018-11-30 23:01:10 +01:00
511bce58bd update new token classification model 2018-11-30 22:56:02 +01:00
258eb50086 bump up version 2018-11-30 22:55:33 +01:00
d787c6be8c improve docstrings and fix new token classification model 2018-11-30 22:55:26 +01:00
ed302a73f4 add new token classification model 2018-11-30 22:55:03 +01:00
89d47230d7 clean up classification model output 2018-11-30 22:54:53 +01:00
7f7c41b0c1 tests for all model classes with and without labels 2018-11-30 22:54:33 +01:00
be57c8eeef Fix internal hyperlink typo 2018-12-01 02:43:25 +05:30
8c7267f1cf Merge pull request #70 from deepset-ai/fix_lm_loss
fix typo in input for masked lm loss function
2018-11-30 18:23:46 +01:00
7b3bb8c00f fix typo in input for masked lm loss function 2018-11-30 16:52:50 +01:00
257a35134a fix pickle dump in run_squad example 2018-11-30 14:23:09 +01:00
c588453a0f fix run_squad 2018-11-30 14:22:40 +01:00
d6f06c03f4 fixed loading pre-trained tokenizer from directory 2018-11-30 14:09:06 +01:00
532a81d3d6 fixed doc_strings 2018-11-30 13:57:01 +01:00
296f006132 added BertForTokenClassification model 2018-11-30 13:56:53 +01:00
298107fed7 Added new bert models 2018-11-30 13:56:02 +01:00
0541442558 add do_lower_case in examples 2018-11-30 13:47:33 +01:00
3951c2c189 Merge pull request #60 from davidefiocco/patch-1
Updated quick-start example with `BertForMaskedLM`
2018-11-28 14:59:08 +01:00
ec2c339b53 Updated quick-start example with BertForMaskedLM
As `convert_ids_to_tokens` returns a list, the code in the README currently throws an `AssertionError`, so I propose a quick fix.
2018-11-28 14:53:46 +01:00
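A minimal sketch of the point being fixed (assuming the `bert-base-uncased` vocabulary can be downloaded): `convert_ids_to_tokens` maps a list of ids to a list of tokens, so the result has to be indexed before comparing it against a single token string.

```python
from pytorch_pretrained_bert.tokenization import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

ids = tokenizer.convert_tokens_to_ids(['hello'])
tokens = tokenizer.convert_ids_to_tokens(ids)   # returns a list, e.g. ['hello']

# Comparing the list itself to a string would fail an assertion; index element 0 instead.
assert tokens[0] == 'hello'
```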
21f0196412 Merge pull request #58 from lliimsft/master
Bug fix in examples;correct t_total for distributed training;run pred…
2018-11-28 12:39:45 +01:00
0aaedcc02f Bug fix in examples;correct t_total for distributed training;run prediction for full dataset 2018-11-27 01:08:37 -08:00
32167cdf4b remove convert_to_unicode and printable_text from examples 2018-11-26 23:33:22 +01:00
44 changed files with 9551 additions and 712 deletions

.circleci/config.yml (new file, 29 added lines)

@ -0,0 +1,29 @@
version: 2
jobs:
build_py3:
working_directory: ~/pytorch-pretrained-BERT
docker:
- image: circleci/python:3.5
steps:
- checkout
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest ftfy spacy
- run: sudo python -m spacy download en
- run: python -m pytest -sv tests/
build_py2:
working_directory: ~/pytorch-pretrained-BERT
docker:
- image: circleci/python:2.7
steps:
- checkout
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest spacy
- run: sudo pip install ftfy==4.4.3
- run: sudo python -m spacy download en
- run: python -m pytest -sv tests/
workflows:
version: 2
build_and_test:
jobs:
- build_py3
- build_py2

.gitignore (5 changed lines)

@ -119,4 +119,7 @@ dmypy.json
.vscode
# TF code
tensorflow_code
tensorflow_code
# Models
models

MANIFEST.in (new file, 1 added line)

@ -0,0 +1 @@
include LICENSE

README.md (875 changed lines)

File diff suppressed because it is too large.

(deleted file, name not shown, 2 lines removed)

@ -1,2 +0,0 @@
#!/bin/sh
python -m pytorch_pretrained_bert "$@"

docker/Dockerfile (new file, 7 added lines)

@ -0,0 +1,7 @@
FROM pytorch/pytorch:latest
RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
RUN pip install pytorch-pretrained-bert
WORKDIR /workspace

(modified file, name not shown)

@ -28,7 +28,7 @@ import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pytorch_pretrained_bert.tokenization import convert_to_unicode, BertTokenizer
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
@ -80,10 +80,10 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
@ -168,9 +168,9 @@ def read_examples(input_file):
"""Read a list of `InputExample`s from an input file."""
examples = []
unique_id = 0
with open(input_file, "r") as reader:
with open(input_file, "r", encoding='utf-8') as reader:
while True:
line = convert_to_unicode(reader.readline())
line = reader.readline()
if not line:
break
line = line.strip()
@ -199,6 +199,7 @@ def main():
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
## Other parameters
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
parser.add_argument("--max_seq_length", default=128, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
@ -209,7 +210,6 @@ def main():
default=-1,
help = "local_rank for distributed training on gpus")
parser.add_argument("--no_cuda",
default=False,
action='store_true',
help="Whether not to use CUDA when available")
@ -227,7 +227,7 @@ def main():
layer_indexes = [int(x) for x in args.layers.split(",")]
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
examples = read_examples(args.input_file)

(modified file, name not shown)

@ -1,5 +1,6 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -14,27 +15,28 @@
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import csv
import os
import logging
import argparse
import csv
import logging
import os
import random
from tqdm import tqdm, trange
import sys
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from pytorch_pretrained_bert.tokenization import printable_text, convert_to_unicode, BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
@ -93,6 +95,8 @@ class DataProcessor(object):
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
return lines
@ -122,9 +126,9 @@ class MrpcProcessor(DataProcessor):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = convert_to_unicode(line[3])
text_b = convert_to_unicode(line[4])
label = convert_to_unicode(line[0])
text_a = line[3]
text_b = line[4]
label = line[0]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
@ -154,10 +158,10 @@ class MnliProcessor(DataProcessor):
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, convert_to_unicode(line[0]))
text_a = convert_to_unicode(line[8])
text_b = convert_to_unicode(line[9])
label = convert_to_unicode(line[-1])
guid = "%s-%s" % (set_type, line[0])
text_a = line[8]
text_b = line[9]
label = line[-1]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
@ -185,8 +189,8 @@ class ColaProcessor(DataProcessor):
examples = []
for (i, line) in enumerate(lines):
guid = "%s-%s" % (set_type, i)
text_a = convert_to_unicode(line[3])
label = convert_to_unicode(line[1])
text_a = line[3]
label = line[1]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
return examples
@ -195,9 +199,7 @@ class ColaProcessor(DataProcessor):
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
"""Loads a data file into a list of `InputBatch`s."""
label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i
label_map = {label : i for i, label in enumerate(label_list)}
features = []
for (ex_index, example) in enumerate(examples):
@ -206,8 +208,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
@ -215,7 +215,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
tokens_a = tokens_a[:(max_seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
@ -235,22 +235,12 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
segment_ids = [0] * len(tokens)
if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
tokens += tokens_b + ["[SEP]"]
segment_ids += [1] * (len(tokens_b) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
@ -259,10 +249,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
@ -273,7 +263,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
logger.info("*** Example ***")
logger.info("guid: %s" % (example.guid))
logger.info("tokens: %s" % " ".join(
[printable_text(x) for x in tokens]))
[str(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
@ -308,35 +298,6 @@ def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
""" Utility function for optimize_on_cpu and 16-bits training.
Copy the parameters optimized on CPU/RAM back to the model on GPU
"""
for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
if name_opti != name_model:
logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
raise ValueError
param_model.data.copy_(param_opti.data)
def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
""" Utility function for optimize_on_cpu and 16-bits training.
Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model
"""
is_nan = False
for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
if name_opti != name_model:
logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
raise ValueError
if param_model.grad is not None:
if test_nan and torch.isnan(param_model.grad).sum() > 0:
is_nan = True
if param_opti.grad is None:
param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
param_opti.grad.data.copy_(param_model.grad.data)
else:
param_opti.grad = None
return is_nan
def main():
parser = argparse.ArgumentParser()
@ -348,7 +309,8 @@ def main():
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--task_name",
default=None,
type=str,
@ -358,9 +320,13 @@ def main():
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
help="The output directory where the model predictions and checkpoints will be written.")
## Other parameters
parser.add_argument("--cache_dir",
default="",
type=str,
help="Where do you want to store the pre-trained models downloaded from s3")
parser.add_argument("--max_seq_length",
default=128,
type=int,
@ -368,13 +334,14 @@ def main():
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--do_train",
default=False,
action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval",
default=False,
action='store_true',
help="Whether to run eval on the dev set.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Set this flag if you are using an uncased model.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
@ -397,59 +364,68 @@ def main():
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--no_cuda",
default=False,
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumualte before performing a backward/update pass.")
parser.add_argument('--optimize_on_cpu',
default=False,
action='store_true',
help="Whether to perform optimization and keep the optimizer averages on CPU")
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type=float, default=128,
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
args = parser.parse_args()
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
processors = {
"cola": ColaProcessor,
"mnli": MnliProcessor,
"mrpc": MrpcProcessor,
}
num_labels_task = {
"cola": 2,
"mnli": 3,
"mrpc": 2,
}
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
if args.fp16:
logger.info("16-bits training currently not supported in distributed training")
args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
@ -460,9 +436,10 @@ def main():
if not args.do_train and not args.do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(args.output_dir, exist_ok=True)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
task_name = args.task_name.lower()
@ -470,56 +447,77 @@ def main():
raise ValueError("Task not found: %s" % (task_name))
processor = processors[task_name]()
num_labels = num_labels_task[task_name]
label_list = processor.get_labels()
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_steps = None
num_train_optimization_steps = None
if args.do_train:
train_examples = processor.get_train_examples(args.data_dir)
num_train_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
model = BertForSequenceClassification.from_pretrained(args.bert_model, len(label_list),
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
model = BertForSequenceClassification.from_pretrained(args.bert_model,
cache_dir=cache_dir,
num_labels = num_labels)
if args.fp16:
model.half()
model.to(device)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
output_device=args.local_rank)
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
if args.fp16:
param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
for n, param in model.named_parameters()]
elif args.optimize_on_cpu:
param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
for n, param in model.named_parameters()]
else:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_steps)
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
nb_tr_steps = 0
tr_loss = 0
if args.do_train:
train_features = convert_examples_to_features(
train_examples, label_list, args.max_seq_length, tokenizer)
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_examples))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_steps)
logger.info(" Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
@ -538,40 +536,49 @@ def main():
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, label_ids = batch
loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
loss = model(input_ids, segment_ids, input_mask, label_ids)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.fp16 and args.loss_scale != 1.0:
# rescale loss for fp16 training
# see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
loss = loss * args.loss_scale
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
loss.backward()
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16 or args.optimize_on_cpu:
if args.fp16 and args.loss_scale != 1.0:
# scale down gradients for fp16 training
for param in model.parameters():
if param.grad is not None:
param.grad.data = param.grad.data / args.loss_scale
is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
if is_nan:
logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
args.loss_scale = args.loss_scale / 2
model.zero_grad()
continue
optimizer.step()
copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
else:
optimizer.step()
model.zero_grad()
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used that handles this automatically
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
if args.do_eval:
if args.do_train:
# Save a trained model and the associated configuration
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())
# Load a trained model and config that you have fine-tuned
config = BertConfig(output_config_file)
model = BertForSequenceClassification(config, num_labels=num_labels)
model.load_state_dict(torch.load(output_model_file))
else:
model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
model.to(device)
if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
eval_examples = processor.get_dev_examples(args.data_dir)
eval_features = convert_examples_to_features(
eval_examples, label_list, args.max_seq_length, tokenizer)
@ -583,23 +590,23 @@ def main():
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
if args.local_rank == -1:
eval_sampler = SequentialSampler(eval_data)
else:
eval_sampler = DistributedSampler(eval_data)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
label_ids = label_ids.to(device)
with torch.no_grad():
tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)
tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
logits = model(input_ids, segment_ids, input_mask)
logits = logits.detach().cpu().numpy()
label_ids = label_ids.to('cpu').numpy()
@ -613,11 +620,11 @@ def main():
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples
loss = tr_loss/nb_tr_steps if args.do_train else None
result = {'eval_loss': eval_loss,
'eval_accuracy': eval_accuracy,
'global_step': global_step,
'loss': tr_loss/nb_tr_steps}
'loss': loss}
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:

examples/run_gpt2.py (new file, 105 added lines)

@ -0,0 +1,105 @@
#!/usr/bin/env python3
import argparse
import logging
from tqdm import trange
import torch
import torch.nn.functional as F
import numpy as np
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
def top_k_logits(logits, k):
if k == 0:
return logits
values, _ = torch.topk(logits, k)
min_values = values[:, -1]
return torch.where(logits < min_values, torch.ones_like(logits, dtype=logits.dtype) * -1e10, logits)
def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True):
if start_token is None:
assert context is not None, 'Specify exactly one of start_token and context!'
context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
else:
assert context is None, 'Specify exactly one of start_token and context!'
context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)
prev = context
output = context
past = None
with torch.no_grad():
for i in trange(length):
logits, past = model(prev, past=past)
logits = logits[:, -1, :] / temperature
logits = top_k_logits(logits, k=top_k)
log_probs = F.softmax(logits, dim=-1)
if sample:
prev = torch.multinomial(log_probs, num_samples=1)
else:
_, prev = torch.topk(log_probs, k=1, dim=-1)
output = torch.cat((output, prev), dim=1)
return output
def run_model():
parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint')
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--nsamples", type=int, default=1)
parser.add_argument("--batch_size", type=int, default=-1)
parser.add_argument("--length", type=int, default=-1)
parser.add_argument("--temperature", type=int, default=1)
parser.add_argument("--top_k", type=int, default=0)
parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
args = parser.parse_args()
print(args)
if args.batch_size == -1:
args.batch_size = 1
assert args.nsamples % args.batch_size == 0
np.random.seed(args.seed)
torch.random.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
model.to(device)
model.eval()
if args.length == -1:
args.length = model.config.n_ctx // 2
elif args.length > model.config.n_ctx:
raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
while not args.unconditional:
if not args.unconditional:
raw_text = input("Model prompt >>> ")
while not raw_text:
print('Prompt should not be empty!')
raw_text = input("Model prompt >>> ")
context_tokens = enc.encode(raw_text)
generated = 0
for _ in range(args.nsamples // args.batch_size):
out = sample_sequence(
model=model, length=args.length,
context=context_tokens if not args.unconditional else None,
start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
batch_size=args.batch_size,
temperature=args.temperature, top_k=args.top_k, device=device
)
out = out[:, len(context_tokens):].tolist()
for i in range(args.batch_size):
generated += 1
text = enc.decode(out[i])
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
print(text)
print("=" * 80)
if __name__ == '__main__':
run_model()
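As a rough, hedged usage sketch of the GPT-2 classes this example exercises (not part of the diff itself; assumes the pretrained `gpt2` weights can be downloaded), a single greedy next-token prediction looks like:

```python
import torch
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

# Encode a prompt and predict the most likely next token.
context = torch.tensor([tokenizer.encode("The Transformer architecture")])
with torch.no_grad():
    logits, past = model(context, past=None)    # logits: [batch, seq_len, vocab_size]
next_token_id = int(torch.argmax(logits[0, -1]))
print(tokenizer.decode([next_token_id]))
```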

(new file, name not shown, 88 added lines)

@ -0,0 +1,88 @@
#!/usr/bin/env python3
import argparse
import logging
import torch
import torch.nn.functional as F
import numpy as np
from tqdm import trange
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
def top_k_logits(logits, k):
if k == 0:
return logits
values, _ = torch.topk(logits, k)
min_values = values[:, -1]
return torch.where(logits < min_values, torch.ones_like(logits, dtype=logits.dtype) * -1e10, logits)
def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda'):
if start_token is None:
assert context is not None, 'Specify exactly one of start_token and context!'
context = torch.tensor(context, device=device, dtype=torch.long)
else:
assert context is None, 'Specify exactly one of start_token and context!'
context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)
prev = context
output = context
past = None
with torch.no_grad():
for i in trange(length):
logits, past = model(prev, past=past)
logits = logits[:, -1, :] / temperature
logits = top_k_logits(logits, k=top_k)
log_probs = F.softmax(logits, dim=-1)
prev = torch.multinomial(log_probs, num_samples=1)
output = torch.cat((output, prev), dim=1)
return output
def sample_model():
parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint')
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--nsamples", type=int, default=0)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--length", type=int, default=-1)
parser.add_argument("--temperature", type=int, default=1)
parser.add_argument("--top_k", type=int, default=0)
args = parser.parse_args()
print(args)
np.random.seed(args.seed)
torch.random.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
model.to(device)
model.eval()
if args.length == -1:
args.length = model.config.n_ctx
elif args.length > model.config.n_ctx:
raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
generated = 0
while args.nsamples == 0 or generated < args.nsamples:
out = sample_sequence(
model=model, length=args.length,
start_token=enc.encoder['<|endoftext|>'],
batch_size=args.batch_size,
temperature=args.temperature, top_k=args.top_k, device=device
)
out = out.tolist()
for i in range(args.batch_size):
generated += 1
text = enc.decode(out[i])
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
print(text)
if __name__ == '__main__':
sample_model()


@ -0,0 +1,645 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import os
import random
from io import open
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from pytorch_pretrained_bert.modeling import BertForPreTraining
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
from torch.utils.data import Dataset
import random
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)
class BERTDataset(Dataset):
def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True):
self.vocab = tokenizer.vocab
self.tokenizer = tokenizer
self.seq_len = seq_len
self.on_memory = on_memory
self.corpus_lines = corpus_lines # number of non-empty lines in input corpus
self.corpus_path = corpus_path
self.encoding = encoding
self.current_doc = 0 # to avoid random sentence from same doc
# for loading samples directly from file
self.sample_counter = 0 # used to keep track of full epochs on file
self.line_buffer = None # keep second sentence of a pair in memory and use as first sentence in next pair
# for loading samples in memory
self.current_random_doc = 0
self.num_docs = 0
self.sample_to_doc = [] # map sample index to doc and line
# load samples into memory
if on_memory:
self.all_docs = []
doc = []
self.corpus_lines = 0
with open(corpus_path, "r", encoding=encoding) as f:
for line in tqdm(f, desc="Loading Dataset", total=corpus_lines):
line = line.strip()
if line == "":
self.all_docs.append(doc)
doc = []
#remove last added sample because there won't be a subsequent line anymore in the doc
self.sample_to_doc.pop()
else:
#store as one sample
sample = {"doc_id": len(self.all_docs),
"line": len(doc)}
self.sample_to_doc.append(sample)
doc.append(line)
self.corpus_lines = self.corpus_lines + 1
# if last row in file is not empty
if self.all_docs[-1] != doc:
self.all_docs.append(doc)
self.sample_to_doc.pop()
self.num_docs = len(self.all_docs)
# load samples later lazily from disk
else:
if self.corpus_lines is None:
with open(corpus_path, "r", encoding=encoding) as f:
self.corpus_lines = 0
for line in tqdm(f, desc="Loading Dataset", total=corpus_lines):
if line.strip() == "":
self.num_docs += 1
else:
self.corpus_lines += 1
# if doc does not end with empty line
if line.strip() != "":
self.num_docs += 1
self.file = open(corpus_path, "r", encoding=encoding)
self.random_file = open(corpus_path, "r", encoding=encoding)
def __len__(self):
# last line of doc won't be used, because there's no "nextSentence". Additionally, we start counting at 0.
return self.corpus_lines - self.num_docs - 1
def __getitem__(self, item):
cur_id = self.sample_counter
self.sample_counter += 1
if not self.on_memory:
# after one epoch we start again from beginning of file
if cur_id != 0 and (cur_id % len(self) == 0):
self.file.close()
self.file = open(self.corpus_path, "r", encoding=self.encoding)
t1, t2, is_next_label = self.random_sent(item)
# tokenize
tokens_a = self.tokenizer.tokenize(t1)
tokens_b = self.tokenizer.tokenize(t2)
# combine to one sample
cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label)
# transform sample to features
cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer)
cur_tensors = (torch.tensor(cur_features.input_ids),
torch.tensor(cur_features.input_mask),
torch.tensor(cur_features.segment_ids),
torch.tensor(cur_features.lm_label_ids),
torch.tensor(cur_features.is_next))
return cur_tensors
def random_sent(self, index):
"""
Get one sample from corpus consisting of two sentences. With prob. 50% these are two subsequent sentences
from one doc. With 50% the second sentence will be a random one from another doc.
:param index: int, index of sample.
:return: (str, str, int), sentence 1, sentence 2, isNextSentence Label
"""
t1, t2 = self.get_corpus_line(index)
if random.random() > 0.5:
label = 0
else:
t2 = self.get_random_line()
label = 1
assert len(t1) > 0
assert len(t2) > 0
return t1, t2, label
def get_corpus_line(self, item):
"""
Get one sample from corpus consisting of a pair of two subsequent lines from the same doc.
:param item: int, index of sample.
:return: (str, str), two subsequent sentences from corpus
"""
t1 = ""
t2 = ""
assert item < self.corpus_lines
if self.on_memory:
sample = self.sample_to_doc[item]
t1 = self.all_docs[sample["doc_id"]][sample["line"]]
t2 = self.all_docs[sample["doc_id"]][sample["line"]+1]
# used later to avoid random nextSentence from same doc
self.current_doc = sample["doc_id"]
return t1, t2
else:
if self.line_buffer is None:
# read first non-empty line of file
while t1 == "" :
t1 = next(self.file).strip()
t2 = next(self.file).strip()
else:
# use t2 from previous iteration as new t1
t1 = self.line_buffer
t2 = next(self.file).strip()
# skip empty rows that are used for separating documents and keep track of current doc id
while t2 == "" or t1 == "":
t1 = next(self.file).strip()
t2 = next(self.file).strip()
self.current_doc = self.current_doc+1
self.line_buffer = t2
assert t1 != ""
assert t2 != ""
return t1, t2
def get_random_line(self):
"""
Get random line from another document for nextSentence task.
:return: str, content of one line
"""
# Similar to original tf repo: This outer loop should rarely go for more than one iteration for large
# corpora. However, just to be careful, we try to make sure that
# the random document is not the same as the document we're processing.
for _ in range(10):
if self.on_memory:
rand_doc_idx = random.randint(0, len(self.all_docs)-1)
rand_doc = self.all_docs[rand_doc_idx]
line = rand_doc[random.randrange(len(rand_doc))]
else:
rand_index = random.randint(1, self.corpus_lines if self.corpus_lines < 1000 else 1000)
#pick random line
for _ in range(rand_index):
line = self.get_next_line()
#check if our picked random line is really from another doc like we want it to be
if self.current_random_doc != self.current_doc:
break
return line
def get_next_line(self):
""" Gets next line of random_file and starts over when reaching end of file"""
try:
line = next(self.random_file).strip()
#keep track of which document we are currently looking at to later avoid having the same doc as t1
if line == "":
self.current_random_doc = self.current_random_doc + 1
line = next(self.random_file).strip()
except StopIteration:
self.random_file.close()
self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
line = next(self.random_file).strip()
return line
class InputExample(object):
"""A single training/test example for the language model."""
def __init__(self, guid, tokens_a, tokens_b=None, is_next=None, lm_labels=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
tokens_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
tokens_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
is_next: (Optional) int. Next-sentence label: 0 if tokens_b directly follows
tokens_a in the corpus, 1 if it is a random sentence from another document.
lm_labels: (Optional) masked language model labels for the tokens.
"""
self.guid = guid
self.tokens_a = tokens_a
self.tokens_b = tokens_b
self.is_next = is_next # nextSentence
self.lm_labels = lm_labels # masked words for language model
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, input_mask, segment_ids, is_next, lm_label_ids):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.is_next = is_next
self.lm_label_ids = lm_label_ids
def random_word(tokens, tokenizer):
"""
Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
:param tokens: list of str, tokenized sentence.
:param tokenizer: Tokenizer, object used for tokenization (we need its vocab here)
:return: (list of str, list of int), masked tokens and related labels for LM prediction
"""
output_label = []
for i, token in enumerate(tokens):
prob = random.random()
# mask token with 15% probability
if prob < 0.15:
prob /= 0.15
# 80% randomly change token to mask token
if prob < 0.8:
tokens[i] = "[MASK]"
# 10% randomly change token to random token
elif prob < 0.9:
tokens[i] = random.choice(list(tokenizer.vocab.items()))[0]
# -> rest 10% randomly keep current token
# append current token to output (we will predict these later)
try:
output_label.append(tokenizer.vocab[token])
except KeyError:
# For unknown words (should not occur with BPE vocab)
output_label.append(tokenizer.vocab["[UNK]"])
logger.warning("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token))
else:
# no masking token (will be ignored by loss function later)
output_label.append(-1)
return tokens, output_label
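# Illustrative sketch (not called by the script): exercising random_word with a toy
# vocabulary. The small class below merely stands in for a real BertTokenizer, which
# is the only attribute random_word needs (a `vocab` dict).
def _random_word_demo():
    class _ToyTokenizer(object):
        vocab = {"[UNK]": 0, "[MASK]": 1, "the": 2, "dog": 3, "barks": 4}
    masked, labels = random_word(["the", "dog", "barks"], _ToyTokenizer())
    # roughly 15% of positions come back as "[MASK]" or a random vocab token;
    # labels hold the original token ids at those positions and -1 everywhere else
    return masked, labels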
def convert_example_to_features(example, max_seq_length, tokenizer):
"""
Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
IDs, LM labels, input_mask, CLS and SEP tokens etc.
:param example: InputExample, containing sentence input as strings and is_next label
:param max_seq_length: int, maximum length of sequence.
:param tokenizer: Tokenizer
:return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
"""
tokens_a = example.tokens_a
tokens_b = example.tokens_b
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
tokens_a, t1_label = random_word(tokens_a, tokenizer)
tokens_b, t2_label = random_word(tokens_b, tokenizer)
# concatenate lm labels and account for CLS, SEP, SEP
lm_label_ids = ([-1] + t1_label + [-1] + t2_label + [-1])
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
assert len(tokens_b) > 0
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
lm_label_ids.append(-1)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
assert len(lm_label_ids) == max_seq_length
if example.guid < 5:
logger.info("*** Example ***")
logger.info("guid: %s" % (example.guid))
logger.info("tokens: %s" % " ".join(
[str(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
"segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
logger.info("LM label: %s " % (lm_label_ids))
logger.info("Is next sentence label: %s " % (example.is_next))
features = InputFeatures(input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
lm_label_ids=lm_label_ids,
is_next=example.is_next)
return features
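# Illustrative sketch (not called anywhere in this script): converting one sentence
# pair into padded training features. Assumes the bert-base-uncased vocab can be
# downloaded or is already cached locally.
def _convert_example_demo():
    tok = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    ex = InputExample(guid=0,
                      tokens_a=tok.tokenize("the dog is hairy ."),
                      tokens_b=tok.tokenize("it barks at night ."),
                      is_next=0)
    feats = convert_example_to_features(ex, max_seq_length=32, tokenizer=tok)
    # input_ids, input_mask, segment_ids and lm_label_ids are all padded to length 32
    return feats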
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--train_file",
default=None,
type=str,
required=True,
help="The input train corpus.")
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
## Other parameters
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--do_train",
action='store_true',
help="Whether to run training.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--learning_rate",
default=3e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion",
default=0.1,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--on_memory",
action='store_true',
help="Whether to load train samples into memory or use disk")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumualte before performing a backward/update pass.")
parser.add_argument('--fp16',
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type = float, default = 0,
help = "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train:
raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
#train_examples = None
num_train_optimization_steps = None
if args.do_train:
print("Loading Train Dataset", args.train_file)
train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length,
corpus_lines=None, on_memory=args.on_memory)
num_train_optimization_steps = int(
len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
model = BertForPreTraining.from_pretrained(args.bert_model)
if args.fp16:
model.half()
model.to(device)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
if args.do_train:
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
if args.local_rank == -1:
train_sampler = RandomSampler(train_dataset)
else:
#TODO: check if this works with current data generator from disk that relies on next(file)
# (it doesn't return item back by index)
train_sampler = DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
model.train()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
tr_loss = 0
nb_tr_examples, nb_tr_steps = 0, 0
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used that handles this automatically
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
# Save a trained model
logger.info("** ** * Saving fine - tuned model ** ** * ")
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
if args.do_train:
torch.save(model_to_save.state_dict(), output_model_file)
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
if __name__ == "__main__":
main()

examples/run_openai_gpt.py

@ -0,0 +1,259 @@
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" OpenAI GPT model fine-tuning script.
Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
which was itself adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
With default values, this script fine-tunes and evaluates a pretrained OpenAI GPT model on the RocStories dataset.
"""
import argparse
import os
import csv
import random
import logging
from tqdm import tqdm, trange
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam, cached_path
ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
def load_rocstories_dataset(dataset_path):
""" Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
with open(dataset_path, encoding='utf_8') as f:
f = csv.reader(f)
output = []
next(f) # skip the first line
for line in tqdm(f):
output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
return output
def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
""" Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
"""
tensor_datasets = []
for dataset in encoded_datasets:
n_batch = len(dataset)
input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
mc_labels = np.zeros((n_batch,), dtype=np.int64)
for i, (story, cont1, cont2, mc_label) in enumerate(dataset):
with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
input_ids[i, 0, :len(with_cont1)] = with_cont1
input_ids[i, 1, :len(with_cont2)] = with_cont2
mc_token_ids[i, 0] = len(with_cont1) - 1
mc_token_ids[i, 1] = len(with_cont2) - 1
lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:]
lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:]
mc_labels[i] = mc_label
all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
return tensor_datasets
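# Illustrative sketch (not called by the script): a single story with two continuations,
# already encoded as ids. All token ids and special-token ids below are made up.
def _pre_process_demo():
    toy_dataset = [([5, 6, 7], [8, 9], [10, 11], 0)]  # (story, cont1, cont2, label)
    tensors = pre_process_datasets([toy_dataset], input_len=10, cap_length=4,
                                   start_token=1, delimiter_token=2, clf_token=3)
    input_ids, mc_token_ids, lm_labels, mc_labels = tensors[0]
    # input_ids has shape (1, 2, 10); its first row is [1, 5, 6, 7, 2, 8, 9, 3, 0, 0]
    return input_ids, mc_token_ids, lm_labels, mc_labels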
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', type=str, default='openai-gpt',
help='pretrained model name')
parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model predictions and checkpoints will be written.")
parser.add_argument('--train_dataset', type=str, default='')
parser.add_argument('--eval_dataset', type=str, default='')
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--num_train_epochs', type=int, default=3)
parser.add_argument('--train_batch_size', type=int, default=8)
parser.add_argument('--eval_batch_size', type=int, default=16)
parser.add_argument('--max_grad_norm', type=int, default=1)
parser.add_argument('--learning_rate', type=float, default=6.25e-5)
parser.add_argument('--warmup_proportion', type=float, default=0.002)
parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
parser.add_argument('--weight_decay', type=float, default=0.01)
parser.add_argument('--lm_coef', type=float, default=0.9)
parser.add_argument('--n_valid', type=int, default=374)
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
args = parser.parse_args()
print(args)
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
logger.info("device: {}, n_gpu {}".format(device, n_gpu))
if not args.do_train and not args.do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
# Load tokenizer and model
# These loading functions also add new tokens and embeddings called `special tokens`
# These new embeddings will be fine-tuned on the RocStories dataset
special_tokens = ['_start_', '_delimiter_', '_classify_']
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
model.to(device)
# Load and encode the datasets
if not args.train_dataset and not args.eval_dataset:
roc_stories = cached_path(ROCSTORIES_URL)
def tokenize_and_encode(obj):
""" Tokenize and encode a nested object """
if isinstance(obj, str):
return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
elif isinstance(obj, int):
return obj
return list(tokenize_and_encode(o) for o in obj)
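# For example, tokenize_and_encode(("mary had a little lamb", 3)) returns
# [[...token ids...], 3]: strings become lists of ids, ints pass through unchanged,
# and nested tuples/lists come back as lists with the same structure.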
logger.info("Encoding dataset...")
train_dataset = load_rocstories_dataset(args.train_dataset)
eval_dataset = load_rocstories_dataset(args.eval_dataset)
datasets = (train_dataset, eval_dataset)
encoded_datasets = tokenize_and_encode(datasets)
# Compute the max input length for the Transformer
max_length = model.config.n_positions // 2 - 2
input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \
for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
input_length = min(input_length, model.config.n_positions) # Max size of input for the pre-trained model
# Prepare inputs tensors and dataloaders
tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
train_data = TensorDataset(*train_tensor_dataset)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
eval_data = TensorDataset(*eval_tensor_dataset)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
optimizer = OpenAIAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
max_grad_norm=args.max_grad_norm,
weight_decay=args.weight_decay,
t_total=num_train_optimization_steps)
if args.do_train:
nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
model.train()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
tr_loss = 0
nb_tr_steps = 0
tqdm_bar = tqdm(train_dataloader, desc="Training")
for step, batch in enumerate(tqdm_bar):
batch = tuple(t.to(device) for t in batch)
input_ids, mc_token_ids, lm_labels, mc_labels = batch
losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
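# losses[0] is the language-modeling loss over the story/continuation tokens and
# losses[1] the multiple-choice classification loss; lm_coef (default 0.9) weights
# the auxiliary LM term in the combined objective.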
loss = args.lm_coef * losses[0] + losses[1]
loss.backward()
optimizer.step()
tr_loss += loss.item()
exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
nb_tr_steps += 1
tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])
# Save a trained model
if args.do_train:
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
config = model.config
torch.save(model_to_save.state_dict(), output_model_file)
# Load a trained model that you have fine-tuned
model_state_dict = torch.load(output_model_file)
model = OpenAIGPTDoubleHeadsModel(config)
model.load_state_dict(model_state_dict)
model.to(device)
if args.do_eval:
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in tqdm(eval_dataloader, desc="Evaluating"):
batch = tuple(t.to(device) for t in batch)
input_ids, mc_token_ids, lm_labels, mc_labels = batch
with torch.no_grad():
_, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)
_, mc_logits = model(input_ids, mc_token_ids)
mc_logits = mc_logits.detach().cpu().numpy()
mc_labels = mc_labels.to('cpu').numpy()
tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
eval_loss += mc_loss.mean().item()
eval_accuracy += tmp_eval_accuracy
nb_eval_examples += input_ids.size(0)
nb_eval_steps += 1
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples
train_loss = tr_loss/nb_tr_steps if args.do_train else None
result = {'eval_loss': eval_loss,
'eval_accuracy': eval_accuracy,
'train_loss': train_loss}
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
if __name__ == '__main__':
main()


@ -1,5 +1,6 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -14,36 +15,48 @@
# limitations under the License.
"""Run BERT on SQuAD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import argparse
import collections
import logging
import json
import logging
import math
import os
import random
from tqdm import tqdm, trange
import sys
from io import open
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from pytorch_pretrained_bert.tokenization import printable_text, whitespace_tokenize, BasicTokenizer, BertTokenizer
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
BertTokenizer,
whitespace_tokenize)
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class SquadExample(object):
"""A single training/test example for simple sequence classification."""
"""
A single training/test example for the Squad dataset.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
@ -51,27 +64,31 @@ class SquadExample(object):
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None):
end_position=None,
is_impossible=None):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (printable_text(self.qas_id))
s += "qas_id: %s" % (self.qas_id)
s += ", question_text: %s" % (
printable_text(self.question_text))
self.question_text)
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
if self.start_position:
s += ", is_impossible: %r" % (self.is_impossible)
return s
@ -89,7 +106,8 @@ class InputFeatures(object):
input_mask,
segment_ids,
start_position=None,
end_position=None):
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
@ -101,11 +119,12 @@ class InputFeatures(object):
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def read_squad_examples(input_file, is_training):
def read_squad_examples(input_file, is_training, version_2_with_negative):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r") as reader:
with open(input_file, "r", encoding='utf-8') as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
@ -137,29 +156,37 @@ def read_squad_examples(input_file, is_training):
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if len(qa["answers"]) != 1:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer.")
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
@ -167,7 +194,8 @@ def read_squad_examples(input_file, is_training):
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position)
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
@ -197,7 +225,10 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
tok_start_position = None
tok_end_position = None
if is_training:
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
@ -269,27 +300,31 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
start_position = None
end_position = None
if is_training:
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
if (example.start_position < doc_start or
example.end_position < doc_start or
example.start_position > doc_end or example.end_position > doc_end):
continue
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
if example_index < 20:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (unique_id))
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(
[printable_text(x) for x in tokens]))
logger.info("tokens: %s" % " ".join(tokens))
logger.info("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
logger.info("token_is_max_context: %s" % " ".join([
@ -300,12 +335,14 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
"input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
"segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
if is_training:
if is_training and example.is_impossible:
logger.info("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(tokens[start_position:(end_position + 1)])
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info(
"answer: %s" % (printable_text(answer_text)))
"answer: %s" % (answer_text))
features.append(
InputFeatures(
@ -319,7 +356,8 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
input_mask=input_mask,
segment_ids=segment_ids,
start_position=start_position,
end_position=end_position))
end_position=end_position,
is_impossible=example.is_impossible))
unique_id += 1
return features
@ -399,15 +437,15 @@ def _check_is_max_context(doc_spans, cur_span_index, position):
return cur_span_index == best_span_index
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, verbose_logging):
"""Write final predictions to the json file."""
output_nbest_file, output_null_log_odds_file, verbose_logging,
version_2_with_negative, null_score_diff_threshold):
"""Write final predictions to the json file and log-odds of null if needed."""
logger.info("Writing predictions to: %s" % (output_prediction_file))
logger.info("Writing nbest to: %s" % (output_nbest_file))
@ -425,15 +463,29 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
@ -461,7 +513,14 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
@ -476,33 +535,51 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
if final_text in seen_predictions:
continue
final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have only a single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
if len(nbest)==1:
nbest.insert(0,
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
@ -512,8 +589,12 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
@ -528,8 +609,18 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
assert len(nbest_json) >= 1
all_predictions[example.qas_id] = nbest_json[0]["text"]
all_nbest_json[example.qas_id] = nbest_json
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
@ -537,6 +628,10 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
"""Project the tokenized prediction back to the original text."""
@ -599,7 +694,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
@ -668,44 +763,16 @@ def _compute_softmax(scores):
probs.append(score / total_sum)
return probs
def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
""" Utility function for optimize_on_cpu and 16-bits training.
Copy the parameters optimized on CPU/RAM back to the model on GPU
"""
for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
if name_opti != name_model:
logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
raise ValueError
param_model.data.copy_(param_opti.data)
def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
""" Utility function for optimize_on_cpu and 16-bits training.
Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model
"""
is_nan = False
for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
if name_opti != name_model:
logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
raise ValueError
if param_model.grad is not None:
if test_nan and torch.isnan(param_model.grad).sum() > 0:
is_nan = True
if param_opti.grad is None:
param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
param_opti.grad.data.copy_(param_model.grad.data)
else:
param_opti.grad = None
return is_nan
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model checkpoints will be written.")
help="The output directory where the model checkpoints and predictions will be written.")
## Other parameters
parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
@ -719,15 +786,15 @@ def main():
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.")
parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs", default=3.0, type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion", default=0.1, type=float,
help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
"of training.")
parser.add_argument("--n_best_size", default=20, type=int,
help="The total number of n-best predictions to generate in the nbest_predictions.json "
@ -735,58 +802,60 @@ def main():
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
parser.add_argument("--verbose_logging", default=False, action='store_true',
parser.add_argument("--verbose_logging", action='store_true',
help="If true, all of the warnings related to data processing will be printed. "
"A number of warnings are expected for a normal SQuAD evaluation.")
parser.add_argument("--no_cuda",
default=False,
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument('--seed',
type=int,
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--optimize_on_cpu',
default=False,
action='store_true',
help="Whether to perform optimization and keep the optimizer averages on CPU")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type=float, default=128,
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, the SQuAD examples contain some that do not have an answer.')
parser.add_argument('--null_score_diff_threshold',
type=float, default=0.0,
help="If null_score - best_non_null is greater than the threshold predict null.")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
if args.fp16:
logger.info("16-bits training currently not supported in distributed training")
args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}".format(
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
@@ -806,65 +875,99 @@ def main():
raise ValueError(
"If `do_predict` is True, then `predict_file` must be specified.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
raise ValueError("Output directory () already exists and is not empty.")
os.makedirs(args.output_dir, exist_ok=True)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_steps = None
num_train_optimization_steps = None
if args.do_train:
train_examples = read_squad_examples(
input_file=args.train_file, is_training=True)
num_train_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
model = BertForQuestionAnswering.from_pretrained(args.bert_model,
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)))
if args.fp16:
model.half()
model.to(device)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
output_device=args.local_rank)
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
if args.fp16:
param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
for n, param in model.named_parameters()]
elif args.optimize_on_cpu:
param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
for n, param in model.named_parameters()]
else:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
param_optimizer = list(model.named_parameters())
# hack to remove pooler, which is not used
# thus it produces None grads that break apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_steps)
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
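# A quick sketch of how the flags above select the optimizer (example values only,
# not defaults of this script beyond --loss_scale 0):
#   --fp16 --loss_scale 0    -> FusedAdam wrapped in FP16_Optimizer(dynamic_loss_scale=True)
#   --fp16 --loss_scale 128  -> FusedAdam wrapped in FP16_Optimizer(static_loss_scale=128)
#   (no --fp16)              -> BertAdam, with warmup_proportion/t_total driving the schedule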
global_step = 0
if args.do_train:
train_features = convert_examples_to_features(
examples=train_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=True)
cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
train_features = None
try:
with open(cached_train_features_file, "rb") as reader:
train_features = pickle.load(reader)
except:
train_features = convert_examples_to_features(
examples=train_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=True)
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
logger.info(" Saving train features into cached file %s", cached_train_features_file)
with open(cached_train_features_file, "wb") as writer:
pickle.dump(train_features, writer)
logger.info("***** Running training *****")
logger.info(" Num orig examples = %d", len(train_examples))
logger.info(" Num split examples = %d", len(train_features))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_steps)
logger.info(" Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
@@ -887,36 +990,45 @@ def main():
loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.fp16 and args.loss_scale != 1.0:
# rescale loss for fp16 training
# see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
loss = loss * args.loss_scale
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
loss.backward()
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16 or args.optimize_on_cpu:
if args.fp16 and args.loss_scale != 1.0:
# scale down gradients for fp16 training
for param in model.parameters():
if param.grad is not None:
param.grad.data = param.grad.data / args.loss_scale
is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
if is_nan:
logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
args.loss_scale = args.loss_scale / 2
model.zero_grad()
continue
optimizer.step()
copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
else:
optimizer.step()
model.zero_grad()
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used and handles this automatically
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
if args.do_predict:
if args.do_train:
# Save a trained model and the associated configuration
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())
# Load a trained model and config that you have fine-tuned
config = BertConfig(output_config_file)
model = BertForQuestionAnswering(config)
model.load_state_dict(torch.load(output_model_file))
else:
model = BertForQuestionAnswering.from_pretrained(args.bert_model)
model.to(device)
if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
eval_examples = read_squad_examples(
input_file=args.predict_file, is_training=False)
input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
eval_features = convert_examples_to_features(
examples=eval_examples,
tokenizer=tokenizer,
@@ -935,10 +1047,8 @@ def main():
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
if args.local_rank == -1:
eval_sampler = SequentialSampler(eval_data)
else:
eval_sampler = DistributedSampler(eval_data)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
model.eval()
@@ -962,10 +1072,12 @@ def main():
end_logits=end_logits))
output_prediction_file = os.path.join(args.output_dir, "predictions.json")
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
write_predictions(eval_examples, eval_features, all_results,
args.n_best_size, args.max_answer_length,
args.do_lower_case, output_prediction_file,
output_nbest_file, args.verbose_logging)
output_nbest_file, output_null_log_odds_file, args.verbose_logging,
args.version_2_with_negative, args.null_score_diff_threshold)
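# Sketch of the decision rule applied inside write_predictions when
# --version_2_with_negative is set (names here are illustrative):
#   score_diff = score_null - best_non_null_entry.start_logit - best_non_null_entry.end_logit
#   predict "" (no answer) if score_diff > args.null_score_diff_threshold, else the best span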
if __name__ == "__main__":

547
examples/run_swag.py Normal file
View File

@@ -0,0 +1,547 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
import argparse
import csv
import logging
import os
import random
import sys
from io import open
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
# BertConfig, WEIGHTS_NAME and CONFIG_NAME are needed for the save/load logic below
from pytorch_pretrained_bert.modeling import BertForMultipleChoice, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
from pytorch_pretrained_bert.tokenization import BertTokenizer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class SwagExample(object):
"""A single training/test example for the SWAG dataset."""
def __init__(self,
swag_id,
context_sentence,
start_ending,
ending_0,
ending_1,
ending_2,
ending_3,
label = None):
self.swag_id = swag_id
self.context_sentence = context_sentence
self.start_ending = start_ending
self.endings = [
ending_0,
ending_1,
ending_2,
ending_3,
]
self.label = label
def __str__(self):
return self.__repr__()
def __repr__(self):
l = [
"swag_id: {}".format(self.swag_id),
"context_sentence: {}".format(self.context_sentence),
"start_ending: {}".format(self.start_ending),
"ending_0: {}".format(self.endings[0]),
"ending_1: {}".format(self.endings[1]),
"ending_2: {}".format(self.endings[2]),
"ending_3: {}".format(self.endings[3]),
]
if self.label is not None:
l.append("label: {}".format(self.label))
return ", ".join(l)
class InputFeatures(object):
def __init__(self,
example_id,
choices_features,
label
):
self.example_id = example_id
self.choices_features = [
{
'input_ids': input_ids,
'input_mask': input_mask,
'segment_ids': segment_ids
}
for _, input_ids, input_mask, segment_ids in choices_features
]
self.label = label
def read_swag_examples(input_file, is_training):
with open(input_file, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
if is_training and lines[0][-1] != 'label':
raise ValueError(
"For training, the input file must contain a label column."
)
examples = [
SwagExample(
swag_id = line[2],
context_sentence = line[4],
start_ending = line[5], # in the swag dataset, the
# common beginning of each
# choice is stored in "sent2".
ending_0 = line[7],
ending_1 = line[8],
ending_2 = line[9],
ending_3 = line[10],
label = int(line[11]) if is_training else None
) for line in lines[1:] # we skip the line with the column names
]
return examples
def convert_examples_to_features(examples, tokenizer, max_seq_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
# Swag is a multiple choice task. To perform this task using Bert,
# we will use the formatting proposed in "Improving Language
# Understanding by Generative Pre-Training" and suggested by
# @jacobdevlin-google in this issue
# https://github.com/google-research/bert/issues/38.
#
# Each choice will correspond to a sample on which we run the
# inference. For a given Swag example, we will create the 4
# following inputs:
# - [CLS] context [SEP] choice_1 [SEP]
# - [CLS] context [SEP] choice_2 [SEP]
# - [CLS] context [SEP] choice_3 [SEP]
# - [CLS] context [SEP] choice_4 [SEP]
# The model will output a single value for each input. To get the
# final decision of the model, we will run a softmax over these 4
# outputs.
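# Concrete sketch of the resulting tensor shapes (illustrative only; the actual
# stacking happens later in main() via select_field):
#   input_ids / input_mask / segment_ids -> (batch_size, 4, max_seq_length)
#   logits = model(input_ids, segment_ids, input_mask) -> (batch_size, 4)
#   predicted_ending = logits.argmax(dim=-1)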
features = []
for example_index, example in enumerate(examples):
context_tokens = tokenizer.tokenize(example.context_sentence)
start_ending_tokens = tokenizer.tokenize(example.start_ending)
choices_features = []
for ending_index, ending in enumerate(example.endings):
# We create a copy of the context tokens in order to be
# able to shrink it according to ending_tokens
context_tokens_choice = context_tokens[:]
ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
# Modifies `context_tokens_choice` and `ending_tokens` in
# place so that the total length is less than the
# specified length. Account for [CLS], [SEP], [SEP] with
# "- 3"
_truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
choices_features.append((tokens, input_ids, input_mask, segment_ids))
label = example.label
if example_index < 5:
logger.info("*** Example ***")
logger.info("swag_id: {}".format(example.swag_id))
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
logger.info("choice: {}".format(choice_idx))
logger.info("tokens: {}".format(' '.join(tokens)))
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
if is_training:
logger.info("label: {}".format(label))
features.append(
InputFeatures(
example_id = example.swag_id,
choices_features = choices_features,
label = label
)
)
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
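# Worked example with hypothetical lengths: for len(tokens_a) == 10,
# len(tokens_b) == 3 and max_length == 8, five tokens are popped from tokens_a,
# leaving lengths 5 and 3 (total 8 <= max_length).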
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
def select_field(features, field):
return [
[
choice[field]
for choice in feature.choices_features
]
for feature in features
]
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .csv files (or other data files) for the task.")
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
## Other parameters
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--do_train",
action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval",
action='store_true',
help="Whether to run eval on the dev set.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Set this flag if you are using an uncased model.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
default=8,
type=int,
help="Total batch size for eval.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion",
default=0.1,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--fp16',
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train and not args.do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_optimization_steps = None
if args.do_train:
train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
model = BertForMultipleChoice.from_pretrained(args.bert_model,
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
num_choices=4)
if args.fp16:
model.half()
model.to(device)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
# hack to remove pooler, which is not used
# thus it produces None grads that break apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
if args.do_train:
train_features = convert_examples_to_features(
train_examples, tokenizer, args.max_seq_length, True)
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_examples))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
model.train()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
tr_loss = 0
nb_tr_examples, nb_tr_steps = 0, 0
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, label_ids = batch
loss = model(input_ids, segment_ids, input_mask, label_ids)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.fp16 and args.loss_scale != 1.0:
# rescale loss for fp16 training
# see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
loss = loss * args.loss_scale
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used and handles this automatically
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
if args.do_train:
# Save a trained model and the associated configuration
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())
# Load a trained model and config that you have fine-tuned
config = BertConfig(output_config_file)
model = BertForMultipleChoice(config, num_choices=4)
model.load_state_dict(torch.load(output_model_file))
else:
model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
model.to(device)
if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)
eval_features = convert_examples_to_features(
eval_examples, tokenizer, args.max_seq_length, True)
logger.info("***** Running evaluation *****")
logger.info(" Num examples = %d", len(eval_examples))
logger.info(" Batch size = %d", args.eval_batch_size)
all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
label_ids = label_ids.to(device)
with torch.no_grad():
tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
logits = model(input_ids, segment_ids, input_mask)
logits = logits.detach().cpu().numpy()
label_ids = label_ids.to('cpu').numpy()
tmp_eval_accuracy = accuracy(logits, label_ids)
eval_loss += tmp_eval_loss.mean().item()
eval_accuracy += tmp_eval_accuracy
nb_eval_examples += input_ids.size(0)
nb_eval_steps += 1
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples
result = {'eval_loss': eval_loss,
'eval_accuracy': eval_accuracy,
'global_step': global_step,
'loss': tr_loss/nb_tr_steps}
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
if __name__ == "__main__":
main()

152
examples/run_transfo_xl.py Normal file
View File

@@ -0,0 +1,152 @@
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Transformer XL model evaluation script.
Adapted from https://github.com/kimiyoung/transformer-xl.
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
With default values, this script evaluates a pretrained Transformer-XL on WikiText-103
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import time
import math
import torch
from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
def main():
parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
help='pretrained model name')
parser.add_argument('--split', type=str, default='test',
choices=['all', 'valid', 'test'],
help='which split to evaluate')
parser.add_argument('--batch_size', type=int, default=10,
help='batch size')
parser.add_argument('--tgt_len', type=int, default=128,
help='number of tokens to predict')
parser.add_argument('--ext_len', type=int, default=0,
help='length of the extended context')
parser.add_argument('--mem_len', type=int, default=1600,
help='length of the retained previous heads')
parser.add_argument('--clamp_len', type=int, default=1000,
help='max positional embedding index')
parser.add_argument('--no_cuda', action='store_true',
help='Do not use CUDA even though CUDA is available')
parser.add_argument('--work_dir', type=str, required=True,
help='path to the work_dir')
parser.add_argument('--no_log', action='store_true',
help='do not log the eval result')
parser.add_argument('--same_length', action='store_true',
help='set same length attention with masking')
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
args = parser.parse_args()
assert args.ext_len >= 0, 'extended context length must be non-negative'
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
logger.info("device: {}".format(device))
# Load a pre-processed dataset
# You can also build the corpus yourself using TransfoXLCorpus methods
# The pre-processing involves computing word frequencies to prepare the Adaptive input and SoftMax
# and tokenizing the dataset
# The pre-processed corpus is a conversion (using the conversion script) of the dataset from the original TensorFlow repo
corpus = TransfoXLCorpus.from_pretrained(args.model_name)
ntokens = len(corpus.vocab)
va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
device=device, ext_len=args.ext_len)
te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
device=device, ext_len=args.ext_len)
# Load a pre-trained model
model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
model = model.to(device)
logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
if args.clamp_len > 0:
model.clamp_len = args.clamp_len
if args.same_length:
model.same_length = True
###############################################################################
# Evaluation code
###############################################################################
def evaluate(eval_iter):
# Turn on evaluation mode which disables dropout.
model.eval()
total_len, total_loss = 0, 0.
start_time = time.time()
with torch.no_grad():
mems = None
for idx, (data, target, seq_len) in enumerate(eval_iter):
ret = model(data, target, mems)
loss, mems = ret
loss = loss.mean()
total_loss += seq_len * loss.item()
total_len += seq_len
total_time = time.time() - start_time
logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
total_time, 1000 * total_time / (idx+1)))
return total_loss / total_len
# Run on test data.
if args.split == 'all':
test_loss = evaluate(te_iter)
valid_loss = evaluate(va_iter)
elif args.split == 'valid':
valid_loss = evaluate(va_iter)
test_loss = None
elif args.split == 'test':
test_loss = evaluate(te_iter)
valid_loss = None
def format_log(loss, split):
log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
split, loss, math.exp(loss))
return log_str
log_str = ''
if valid_loss is not None:
log_str += format_log(valid_loss, 'valid')
if test_loss is not None:
log_str += format_log(test_loss, 'test')
logger.info('=' * 100)
logger.info(log_str)
logger.info('=' * 100)
if __name__ == '__main__':
main()
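# Minimal invocation sketch (the --work_dir path is a placeholder; the other
# values shown are this script's defaults):
#   python run_transfo_xl.py --work_dir ./tfxl-eval --model_name transfo-xl-wt103 \
#       --split test --batch_size 10 --tgt_len 128 --mem_len 1600 --clamp_len 1000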

View File

@@ -133,7 +133,7 @@
" unique_id = 0\n",
" with tf.gfile.GFile(input_file, \"r\") as reader:\n",
" while True:\n",
" line = reader.readline()#tokenization.convert_to_unicode(reader.readline())\n",
" line = reader.readline()\n",
" if not line:\n",
" break\n",
" line = line.strip()\n",

View File

@@ -1,6 +1,24 @@
__version__ = "0.6.0"
from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
from .tokenization_gpt2 import GPT2Tokenizer
from .modeling import (BertConfig, BertModel, BertForPreTraining,
BertForMaskedLM, BertForNextSentencePrediction,
BertForSequenceClassification, BertForQuestionAnswering)
BertForSequenceClassification, BertForMultipleChoice,
BertForTokenClassification, BertForQuestionAnswering,
load_tf_weights_in_bert)
from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
load_tf_weights_in_openai_gpt)
from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
load_tf_weights_in_transfo_xl)
from .modeling_gpt2 import (GPT2Config, GPT2Model,
GPT2LMHeadModel, GPT2DoubleHeadsModel,
load_tf_weights_in_gpt2)
from .optimization import BertAdam
from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from .optimization_openai import OpenAIAdam
from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path

View File

@@ -1,19 +1,83 @@
# coding: utf8
if __name__ == '__main__':
def main():
import sys
try:
from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
except ModuleNotFoundError:
print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
"In that case, it requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
if len(sys.argv) != 5:
# pylint: disable=line-too-long
print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [
"convert_tf_checkpoint_to_pytorch",
"convert_openai_checkpoint",
"convert_transfo_xl_checkpoint",
"convert_gpt2_checkpoint",
]:
print(
"Should be used as one of: \n"
">> `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
">> `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
">> `pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
">> `pytorch_pretrained_bert convert_gpt2_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]`")
else:
PYTORCH_DUMP_OUTPUT = sys.argv.pop()
TF_CONFIG = sys.argv.pop()
TF_CHECKPOINT = sys.argv.pop()
convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
try:
from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
except ImportError:
print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
"In that case, it requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
if len(sys.argv) != 5:
# pylint: disable=line-too-long
print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
else:
PYTORCH_DUMP_OUTPUT = sys.argv.pop()
TF_CONFIG = sys.argv.pop()
TF_CHECKPOINT = sys.argv.pop()
convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
elif sys.argv[1] == "convert_openai_checkpoint":
from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
PYTORCH_DUMP_OUTPUT = sys.argv[3]
if len(sys.argv) == 5:
OPENAI_GPT_CONFIG = sys.argv[4]
else:
OPENAI_GPT_CONFIG = ""
convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
OPENAI_GPT_CONFIG,
PYTORCH_DUMP_OUTPUT)
elif sys.argv[1] == "convert_transfo_xl_checkpoint":
try:
from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
except ImportError:
print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
"In that case, it requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
if 'ckpt' in sys.argv[2].lower():
TF_CHECKPOINT = sys.argv[2]
TF_DATASET_FILE = ""
else:
TF_DATASET_FILE = sys.argv[2]
TF_CHECKPOINT = ""
PYTORCH_DUMP_OUTPUT = sys.argv[3]
if len(sys.argv) == 5:
TF_CONFIG = sys.argv[4]
else:
TF_CONFIG = ""
convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
else:
try:
from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
except ImportError:
print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
"In that case, it requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
TF_CHECKPOINT = sys.argv[2]
PYTORCH_DUMP_OUTPUT = sys.argv[3]
if len(sys.argv) == 5:
TF_CONFIG = sys.argv[4]
else:
TF_CONFIG = ""
convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
if __name__ == '__main__':
main()
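# Example command lines matching the usage message above (all paths are placeholders):
#   pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch ./bert/model.ckpt ./bert/bert_config.json ./pytorch_model.bin
#   pytorch_pretrained_bert convert_openai_checkpoint ./openai-gpt-checkpoint/ ./gpt-pytorch-dump/
#   pytorch_pretrained_bert convert_gpt2_checkpoint ./gpt2/model.ckpt ./gpt2-pytorch-dump/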

View File

@@ -0,0 +1,72 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert OpenAI GPT checkpoint."""
from __future__ import absolute_import, division, print_function
import argparse
from io import open
import torch
from pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
GPT2Config,
GPT2Model,
load_tf_weights_in_gpt2)
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
# Construct model
if gpt2_config_file == "":
config = GPT2Config()
else:
config = GPT2Config(gpt2_config_file)
model = GPT2Model(config)
# Load weights from numpy
load_tf_weights_in_gpt2(model, gpt2_checkpoint_path)
# Save pytorch-model
pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
torch.save(model.state_dict(), pytorch_weights_dump_path)
print("Save configuration file to {}".format(pytorch_config_dump_path))
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
f.write(config.to_json_string())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--gpt2_checkpoint_path",
default = None,
type = str,
required = True,
help = "Path the TensorFlow checkpoint path.")
parser.add_argument("--pytorch_dump_folder_path",
default = None,
type = str,
required = True,
help = "Path to the output PyTorch model.")
parser.add_argument("--gpt2_config_file",
default = "",
type = str,
help = "An optional config json file corresponding to the pre-trained OpenAI model. \n"
"This specifies the model architecture.")
args = parser.parse_args()
convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path,
args.gpt2_config_file,
args.pytorch_dump_folder_path)

View File

@@ -0,0 +1,72 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert OpenAI GPT checkpoint."""
from __future__ import absolute_import, division, print_function
import argparse
from io import open
import torch
from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
OpenAIGPTConfig,
OpenAIGPTModel,
load_tf_weights_in_openai_gpt)
def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
# Construct model
if openai_config_file == "":
config = OpenAIGPTConfig()
else:
config = OpenAIGPTConfig(openai_config_file)
model = OpenAIGPTModel(config)
# Load weights from numpy
load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path)
# Save pytorch-model
pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
torch.save(model.state_dict(), pytorch_weights_dump_path)
print("Save configuration file to {}".format(pytorch_config_dump_path))
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
f.write(config.to_json_string())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--openai_checkpoint_folder_path",
default = None,
type = str,
required = True,
help = "Path the TensorFlow checkpoint path.")
parser.add_argument("--pytorch_dump_folder_path",
default = None,
type = str,
required = True,
help = "Path to the output PyTorch model.")
parser.add_argument("--openai_config_file",
default = "",
type = str,
help = "An optional config json file corresponding to the pre-trained OpenAI model. \n"
"This specifies the model architecture.")
args = parser.parse_args()
convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path,
args.openai_config_file,
args.pytorch_dump_folder_path)

View File

@@ -25,62 +25,16 @@ import tensorflow as tf
import torch
import numpy as np
from .modeling import BertConfig, BertForPreTraining
from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
config_path = os.path.abspath(bert_config_file)
tf_path = os.path.abspath(tf_checkpoint_path)
print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path))
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
# Initialise PyTorch model
config = BertConfig.from_json_file(bert_config_file)
print("Building PyTorch model from configuration: {}".format(str(config)))
model = BertForPreTraining(config)
for name, array in zip(names, arrays):
name = name.split('/')
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
# which are not required for using pretrained model
if name[-1] in ["adam_v", "adam_m"]:
print("Skipping {}".format("/".join(name)))
continue
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
else:
l = [m_name]
if l[0] == 'kernel':
pointer = getattr(pointer, 'weight')
elif l[0] == 'output_bias':
pointer = getattr(pointer, 'bias')
elif l[0] == 'output_weights':
pointer = getattr(pointer, 'weight')
else:
pointer = getattr(pointer, l[0])
if len(l) >= 2:
num = int(l[1])
pointer = pointer[num]
if m_name[-11:] == '_embeddings':
pointer = getattr(pointer, 'weight')
elif m_name == 'kernel':
array = np.transpose(array)
try:
assert pointer.shape == array.shape
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
print("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
# Load weights from tf checkpoint
load_tf_weights_in_bert(model, tf_checkpoint_path)
# Save pytorch-model
print("Save PyTorch model to {}".format(pytorch_dump_path))

View File

@@ -0,0 +1,116 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Transformer XL checkpoint and datasets."""
from __future__ import absolute_import, division, print_function
import argparse
import os
import sys
from io import open
import torch
import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils
from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME,
WEIGHTS_NAME,
TransfoXLConfig,
TransfoXLLMHeadModel,
load_tf_weights_in_transfo_xl)
from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME,
VOCAB_NAME)
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
# We do this to be able to load Python 2 dataset pickles
# See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918
data_utils.Vocab = data_utils.TransfoXLTokenizer
data_utils.Corpus = data_utils.TransfoXLCorpus
sys.modules['data_utils'] = data_utils
sys.modules['vocabulary'] = data_utils
def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
transfo_xl_config_file,
pytorch_dump_folder_path,
transfo_xl_dataset_file):
if transfo_xl_dataset_file:
# Convert a pre-processed corpus (see original TensorFlow repo)
with open(transfo_xl_dataset_file, "rb") as fp:
corpus = pickle.load(fp, encoding="latin1")
# Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME
print("Save vocabulary to {}".format(pytorch_vocab_dump_path))
corpus_vocab_dict = corpus.vocab.__dict__
torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
corpus_dict_no_vocab = corpus.__dict__
corpus_dict_no_vocab.pop('vocab', None)
pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME
print("Save dataset to {}".format(pytorch_dataset_dump_path))
torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)
if tf_checkpoint_path:
# Convert a pre-trained TensorFlow model
config_path = os.path.abspath(transfo_xl_config_file)
tf_path = os.path.abspath(tf_checkpoint_path)
print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))
# Initialise PyTorch model
if transfo_xl_config_file == "":
config = TransfoXLConfig()
else:
config = TransfoXLConfig(transfo_xl_config_file)
print("Building PyTorch model from configuration: {}".format(str(config)))
model = TransfoXLLMHeadModel(config)
model = load_tf_weights_in_transfo_xl(model, config, tf_path)
# Save pytorch-model
pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
torch.save(model.state_dict(), pytorch_weights_dump_path)
print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
f.write(config.to_json_string())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path",
default = None,
type = str,
required = True,
help = "Path to the folder to store the PyTorch model or dataset/vocab.")
parser.add_argument("--tf_checkpoint_path",
default = "",
type = str,
help = "An optional path to a TensorFlow checkpoint path to be converted.")
parser.add_argument("--transfo_xl_config_file",
default = "",
type = str,
help = "An optional config json file corresponding to the pre-trained BERT model. \n"
"This specifies the model architecture.")
parser.add_argument("--transfo_xl_dataset_file",
default = "",
type = str,
help = "An optional dataset file to be converted in a vocabulary.")
args = parser.parse_args()
convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path,
args.transfo_xl_config_file,
args.pytorch_dump_folder_path,
args.transfo_xl_dataset_file)

View File

@@ -3,31 +3,40 @@ Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)
import os
import json
import logging
import os
import shutil
import tempfile
import json
from urllib.parse import urlparse
from pathlib import Path
from typing import Optional, Tuple, Union, IO, Callable, Set
from hashlib import sha256
from functools import wraps
from tqdm import tqdm
from hashlib import sha256
import sys
from io import open
import boto3
from botocore.exceptions import ClientError
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
try:
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
Path.home() / '.pytorch_pretrained_bert'))
except AttributeError:
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
Path.home() / '.pytorch_pretrained_bert'))
def url_to_filename(url: str, etag: str = None) -> str:
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
@@ -45,23 +54,25 @@ def url_to_filename(url: str, etag: str = None) -> str:
return filename
def filename_to_url(filename: str, cache_dir: str = None) -> Tuple[str, str]:
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`.
Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist.
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise FileNotFoundError("file {} not found".format(cache_path))
raise EnvironmentError("file {} not found".format(cache_path))
meta_path = cache_path + '.json'
if not os.path.exists(meta_path):
raise FileNotFoundError("file {} not found".format(meta_path))
raise EnvironmentError("file {} not found".format(meta_path))
with open(meta_path) as meta_file:
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata['url']
etag = metadata['etag']
@@ -69,7 +80,7 @@ def filename_to_url(filename: str, cache_dir: str = None) -> Tuple[str, str]:
return url, etag
def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str:
def cached_path(url_or_filename, cache_dir=None):
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
@@ -78,8 +89,10 @@ def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if isinstance(url_or_filename, Path):
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
parsed = urlparse(url_or_filename)
@@ -91,13 +104,13 @@ def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str
return url_or_filename
elif parsed.scheme == '':
# File, but it doesn't exist.
raise FileNotFoundError("file {} not found".format(url_or_filename))
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
def split_s3_path(url: str) -> Tuple[str, str]:
def split_s3_path(url):
"""Split a full s3 path into the bucket name and path."""
parsed = urlparse(url)
if not parsed.netloc or not parsed.path:
@@ -110,19 +123,19 @@ def split_s3_path(url: str) -> Tuple[str, str]:
return bucket_name, s3_path
def s3_request(func: Callable):
def s3_request(func):
"""
Wrapper function for s3 requests in order to create more helpful error
messages.
"""
@wraps(func)
def wrapper(url: str, *args, **kwargs):
def wrapper(url, *args, **kwargs):
try:
return func(url, *args, **kwargs)
except ClientError as exc:
if int(exc.response["Error"]["Code"]) == 404:
raise FileNotFoundError("file {} not found".format(url))
raise EnvironmentError("file {} not found".format(url))
else:
raise
@ -130,7 +143,7 @@ def s3_request(func: Callable):
@s3_request
def s3_etag(url: str) -> Optional[str]:
def s3_etag(url):
"""Check ETag on S3 object."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
@ -139,14 +152,14 @@ def s3_etag(url: str) -> Optional[str]:
@s3_request
def s3_get(url: str, temp_file: IO) -> None:
def s3_get(url, temp_file):
"""Pull a file directly from S3."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
def http_get(url: str, temp_file: IO) -> None:
def http_get(url, temp_file):
req = requests.get(url, stream=True)
content_length = req.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None
@ -158,15 +171,18 @@ def http_get(url: str, temp_file: IO) -> None:
progress.close()
def get_from_cache(url: str, cache_dir: str = None) -> str:
def get_from_cache(url, cache_dir=None):
"""
Given a URL, look for the corresponding dataset in the local cache.
If it's not there, download it. Then return the path to the cached file.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
os.makedirs(cache_dir, exist_ok=True)
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
# Get eTag to add to filename, if it exists.
if url.startswith("s3://"):
@ -207,7 +223,7 @@ def get_from_cache(url: str, cache_dir: str = None) -> str:
logger.info("creating metadata file for %s", cache_path)
meta = {'url': url, 'etag': etag}
meta_path = cache_path + '.json'
with open(meta_path, 'w') as meta_file:
with open(meta_path, 'w', encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
logger.info("removing temp file %s", temp_file.name)
@ -215,19 +231,19 @@ def get_from_cache(url: str, cache_dir: str = None) -> str:
return cache_path
def read_set_from_file(filename: str) -> Set[str]:
def read_set_from_file(filename):
'''
Extract a de-duped collection (set) of text from a file.
Expected file format is one item per line.
'''
collection = set()
with open(filename, 'r') as file_:
with open(filename, 'r', encoding='utf-8') as file_:
for line in file_:
collection.add(line.rstrip())
return collection
def get_file_extension(path: str, dot=True, lower: bool = True):
def get_file_extension(path, dot=True, lower=True):
ext = os.path.splitext(path)[1]
ext = ext if dot else ext[1:]
return ext.lower() if lower else ext


@ -1,5 +1,6 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -14,18 +15,18 @@
# limitations under the License.
"""PyTorch BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import copy
import json
import math
import logging
import math
import os
import shutil
import tarfile
import tempfile
import shutil
import sys
from io import open
import torch
from torch import nn
@ -33,25 +34,87 @@ from torch.nn import CrossEntropyLoss
from .file_utils import cached_path
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
PRETRAINED_MODEL_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
'bert-base-multilingual': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual.tar.gz",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
}
CONFIG_NAME = 'bert_config.json'
WEIGHTS_NAME = 'pytorch_model.bin'
TF_WEIGHTS_NAME = 'model.ckpt'
def load_tf_weights_in_bert(model, tf_checkpoint_path):
""" Load tf checkpoints in a pytorch model
"""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
tf_path = os.path.abspath(tf_checkpoint_path)
print("Converting TensorFlow checkpoint from {}".format(tf_path))
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
for name, array in zip(names, arrays):
name = name.split('/')
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
# which are not required for using pretrained model
if any(n in ["adam_v", "adam_m"] for n in name):
print("Skipping {}".format("/".join(name)))
continue
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
else:
l = [m_name]
if l[0] == 'kernel' or l[0] == 'gamma':
pointer = getattr(pointer, 'weight')
elif l[0] == 'output_bias' or l[0] == 'beta':
pointer = getattr(pointer, 'bias')
elif l[0] == 'output_weights':
pointer = getattr(pointer, 'weight')
else:
pointer = getattr(pointer, l[0])
if len(l) >= 2:
num = int(l[1])
pointer = pointer[num]
if m_name[-11:] == '_embeddings':
pointer = getattr(pointer, 'weight')
elif m_name == 'kernel':
array = np.transpose(array)
try:
assert pointer.shape == array.shape
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
print("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
return model
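As a rough usage sketch (paths are placeholders and TensorFlow must be installed), the loader above can turn a Google TF checkpoint into a reusable PyTorch dump:
```python
# Hypothetical conversion sketch; the paths below are placeholders.
import torch
from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert

config = BertConfig.from_json_file("/path/to/bert_config.json")
model = BertForPreTraining(config)
load_tf_weights_in_bert(model, "/path/to/model.ckpt")            # TF checkpoint prefix
torch.save(model.state_dict(), "/path/to/pytorch_model.bin")     # PyTorch weights for later reuse
```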
def gelu(x):
"""Implementation of the gelu activation function.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
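As the docstring notes, OpenAI GPT's tanh approximation differs only slightly from the exact erf form used here; a quick standalone numerical check, with both variants restated for comparison:
```python
import math
import torch

def gelu_erf(x):
    # exact form used by this BERT implementation
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_tanh(x):
    # OpenAI GPT / GPT-2 approximation
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

x = torch.linspace(-3, 3, steps=13)
print(torch.max(torch.abs(gelu_erf(x) - gelu_tanh(x))))  # difference is on the order of 1e-3 or less
```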
@ -102,8 +165,9 @@ class BertConfig(object):
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
"""
if isinstance(vocab_size_or_config_json_file, str):
with open(vocab_size_or_config_json_file, "r") as reader:
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
@ -134,7 +198,7 @@ class BertConfig(object):
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `BertConfig` from a json file of parameters."""
with open(json_file, "r") as reader:
with open(json_file, "r", encoding='utf-8') as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
@ -150,22 +214,24 @@ class BertConfig(object):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
try:
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
except ImportError:
print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")
class BertLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
"""Construct a layernorm module in the TF style (epsilon inside the square root).
"""
super(BertLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps
class BertLayerNorm(nn.Module):
def __init__(self, config, variance_epsilon=1e-12):
"""Construct a layernorm module in the TF style (epsilon inside the square root).
"""
super(BertLayerNorm, self).__init__()
self.gamma = nn.Parameter(torch.ones(config.hidden_size))
self.beta = nn.Parameter(torch.zeros(config.hidden_size))
self.variance_epsilon = variance_epsilon
def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.gamma * x + self.beta
def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias
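The fallback now takes the hidden size and epsilon directly, matching apex's FusedLayerNorm signature, so the call site is the same whichever implementation is picked up. A tiny sanity check, assuming hidden_size=768:
```python
import torch
from pytorch_pretrained_bert.modeling import BertLayerNorm  # pure-PyTorch fallback or apex FusedLayerNorm

layer_norm = BertLayerNorm(768, eps=1e-12)
x = torch.randn(2, 5, 768)
y = layer_norm(x)
print(y.shape)                        # torch.Size([2, 5, 768])
print(y.mean(-1).abs().max().item())  # per-position mean is ~0 after normalization
```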
class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
@ -178,7 +244,7 @@ class BertEmbeddings(nn.Module):
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(config)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, input_ids, token_type_ids=None):
@ -253,7 +319,7 @@ class BertSelfOutput(nn.Module):
def __init__(self, config):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
@ -279,8 +345,10 @@ class BertIntermediate(nn.Module):
def __init__(self, config):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
self.intermediate_act_fn = ACT2FN[config.hidden_act] \
if isinstance(config.hidden_act, str) else config.hidden_act
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
@ -292,7 +360,7 @@ class BertOutput(nn.Module):
def __init__(self, config):
super(BertOutput, self).__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
@ -320,7 +388,7 @@ class BertEncoder(nn.Module):
def __init__(self, config):
super(BertEncoder, self).__init__()
layer = BertLayer(config)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
all_encoder_layers = []
@ -352,9 +420,11 @@ class BertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super(BertPredictionHeadTransform, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.transform_act_fn = ACT2FN[config.hidden_act] \
if isinstance(config.hidden_act, str) else config.hidden_act
self.LayerNorm = BertLayerNorm(config)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
@ -414,12 +484,12 @@ class BertPreTrainingHeads(nn.Module):
return prediction_scores, seq_relationship_score
class PreTrainedBertModel(nn.Module):
class BertPreTrainedModel(nn.Module):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super(PreTrainedBertModel, self).__init__()
super(BertPreTrainedModel, self).__init__()
if not isinstance(config, BertConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
@ -437,46 +507,55 @@ class PreTrainedBertModel(nn.Module):
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, BertLayerNorm):
module.beta.data.normal_(mean=0.0, std=self.config.initializer_range)
module.gamma.data.normal_(mean=0.0, std=self.config.initializer_range)
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
@classmethod
def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None,
from_tf=False, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
Params:
pretrained_model_name: either:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-base-multilingual`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.ckpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
archive_file = pretrained_model_name
archive_file = pretrained_model_name_or_path
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
except FileNotFoundError:
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name,
pretrained_model_name_or_path,
', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
pretrained_model_name))
archive_file))
return None
if resolved_archive_file == archive_file:
logger.info("loading archive file {}".format(archive_file))
@ -484,7 +563,7 @@ class PreTrainedBertModel(nn.Module):
logger.info("loading archive file {} from cache at {}".format(
archive_file, resolved_archive_file))
tempdir = None
if os.path.isdir(resolved_archive_file):
if os.path.isdir(resolved_archive_file) or from_tf:
serialization_dir = resolved_archive_file
else:
# Extract archive to temp dir
@ -500,8 +579,30 @@ class PreTrainedBertModel(nn.Module):
logger.info("Model config {}".format(config))
# Instantiate model.
model = cls(config, *inputs, **kwargs)
weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
state_dict = torch.load(weights_path)
if state_dict is None and not from_tf:
weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None)
if tempdir:
# Clean up temp dir
shutil.rmtree(tempdir)
if from_tf:
# Directly load from a TensorFlow checkpoint
weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
return load_tf_weights_in_bert(model, weights_path)
# Load from a PyTorch state_dict
old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
missing_keys = []
unexpected_keys = []
@ -519,20 +620,23 @@ class PreTrainedBertModel(nn.Module):
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')
load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
start_prefix = ''
if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()):
start_prefix = 'bert.'
load(model, prefix=start_prefix)
if len(missing_keys) > 0:
logger.info("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
logger.info("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
if tempdir:
# Clean up temp dir
shutil.rmtree(tempdir)
if len(error_msgs) > 0:
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
model.__class__.__name__, "\n\t".join(error_msgs)))
return model
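The gamma/beta renaming above keeps older checkpoints loadable after the LayerNorm parameters were renamed to weight/bias. A toy illustration of the same remapping on a hand-built state dict (values are placeholders):
```python
from collections import OrderedDict

# Toy state dict using the legacy gamma/beta names (values are placeholders).
state_dict = OrderedDict([
    ("bert.embeddings.LayerNorm.gamma", 1),
    ("bert.embeddings.LayerNorm.beta", 2),
    ("bert.embeddings.word_embeddings.weight", 3),
])
old_keys, new_keys = [], []
for key in state_dict.keys():
    new_key = None
    if "gamma" in key:
        new_key = key.replace("gamma", "weight")
    if "beta" in key:
        new_key = key.replace("beta", "bias")
    if new_key:
        old_keys.append(key)
        new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
    state_dict[new_key] = state_dict.pop(old_key)
print(list(state_dict.keys()))
# ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias']
```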
class BertModel(PreTrainedBertModel):
class BertModel(BertPreTrainedModel):
"""BERT model ("Bidirectional Embedding Representations from a Transformer").
Params:
@ -557,20 +661,20 @@ class BertModel(PreTrainedBertModel):
of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
- `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
to the last attention block,
to the last attention block of shape [batch_size, sequence_length, hidden_size],
`pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
classifier pretrained on top of the hidden state associated to the first character of the
input (`CLF`) to train on the Next-Sentence task (see BERT's paper).
input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.BertModel(config=config)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
@ -615,7 +719,7 @@ class BertModel(PreTrainedBertModel):
return encoded_layers, pooled_output
class BertForPreTraining(PreTrainedBertModel):
class BertForPreTraining(BertPreTrainedModel):
"""BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads:
- the masked language modeling head, and
@ -635,10 +739,10 @@ class BertForPreTraining(PreTrainedBertModel):
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
`masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
`next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
@ -648,18 +752,18 @@ class BertForPreTraining(PreTrainedBertModel):
sentence classification loss.
if `masked_lm_labels` or `next_sentence_label` is `None`:
Outputs a tuple comprising
- the masked language modeling logits, and
- the next sentence classification logits.
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size=32000, hidden_size=512,
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForPreTraining(config)
masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
@ -678,7 +782,7 @@ class BertForPreTraining(PreTrainedBertModel):
if masked_lm_labels is not None and next_sentence_label is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels(-1))
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
total_loss = masked_lm_loss + next_sentence_loss
return total_loss
@ -686,7 +790,7 @@ class BertForPreTraining(PreTrainedBertModel):
return prediction_scores, seq_relationship_score
class BertForMaskedLM(PreTrainedBertModel):
class BertForMaskedLM(BertPreTrainedModel):
"""BERT model with the masked language modeling head.
This module comprises the BERT model followed by the masked language modeling head.
@ -709,20 +813,20 @@ class BertForMaskedLM(PreTrainedBertModel):
is only computed for the labels set in [0, ..., vocab_size]
Outputs:
if `masked_lm_labels` is `None`:
if `masked_lm_labels` is not `None`:
Outputs the masked language modeling loss.
if `masked_lm_labels` is `None`:
Outputs the masked language modeling logits.
Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size=32000, hidden_size=512,
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForMaskedLM(config)
masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
@ -747,7 +851,7 @@ class BertForMaskedLM(PreTrainedBertModel):
return prediction_scores
class BertForNextSentencePrediction(PreTrainedBertModel):
class BertForNextSentencePrediction(BertPreTrainedModel):
"""BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence classification head.
@ -774,7 +878,7 @@ class BertForNextSentencePrediction(PreTrainedBertModel):
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `next_sentence_label` is `None`:
Outputs the next sentence classification logits.
Outputs the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
@ -783,8 +887,8 @@ class BertForNextSentencePrediction(PreTrainedBertModel):
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size=32000, hidden_size=512,
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForNextSentencePrediction(config)
seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
@ -809,7 +913,7 @@ class BertForNextSentencePrediction(PreTrainedBertModel):
return seq_relationship_score
class BertForSequenceClassification(PreTrainedBertModel):
class BertForSequenceClassification(BertPreTrainedModel):
"""BERT model for classification.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
@ -836,17 +940,17 @@ class BertForSequenceClassification(PreTrainedBertModel):
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits.
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size=32000, hidden_size=512,
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
@ -854,7 +958,7 @@ class BertForSequenceClassification(PreTrainedBertModel):
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_labels=2):
def __init__(self, config, num_labels):
super(BertForSequenceClassification, self).__init__(config)
self.num_labels = num_labels
self.bert = BertModel(config)
@ -870,26 +974,160 @@ class BertForSequenceClassification(PreTrainedBertModel):
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
return loss, logits
return loss
else:
return logits
class BertForQuestionAnswering(PreTrainedBertModel):
class BertForMultipleChoice(BertPreTrainedModel):
"""BERT model for multiple choice tasks.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_choices`: the number of choices for the classifier.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_choices = 2
model = BertForMultipleChoice(config, num_choices)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_choices):
super(BertForMultipleChoice, self).__init__(config)
self.num_choices = num_choices
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
_, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, self.num_choices)
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
return loss
else:
return reshaped_logits
class BertForTokenClassification(BertPreTrainedModel):
"""BERT model for token-level classification.
This module is composed of the BERT model with a linear layer on top of
the full hidden state of the last layer.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForTokenClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_labels):
super(BertForTokenClassification, self).__init__(config)
self.num_labels = num_labels
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, num_labels)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
return loss
else:
return logits
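When an attention mask is given, only non-padding positions contribute to the token classification loss. A small standalone sketch of that masking with made-up shapes:
```python
import torch
from torch.nn import CrossEntropyLoss

num_labels = 2
logits = torch.randn(2, 3, num_labels)                  # [batch_size, seq_len, num_labels]
labels = torch.tensor([[0, 1, 0], [1, 0, 0]])
attention_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])   # last token of the 2nd sequence is padding

loss_fct = CrossEntropyLoss()
active = attention_mask.view(-1) == 1
loss = loss_fct(logits.view(-1, num_labels)[active], labels.view(-1)[active])
print(loss.item())                                       # padding positions contribute nothing
```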
class BertForQuestionAnswering(BertPreTrainedModel):
"""BERT model for Question Answering (span extraction).
This module is composed of the BERT model with a linear layer on top of
the sequence output that computes start_logits and end_logits
Params:
`config`: either
- a BertConfig class instance with the configuration to build a new model, or
- a str with the name of a pre-trained model to load selected in the list of:
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-base-multilingual`
. `bert-base-chinese`
The pre-trained model will be downloaded and cached if needed.
`config`: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
@ -914,17 +1152,17 @@ class BertForQuestionAnswering(PreTrainedBertModel):
Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
if `start_positions` or `end_positions` is `None`:
Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
position tokens.
position tokens of shape [batch_size, sequence_length].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size=32000, hidden_size=512,
num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForQuestionAnswering(config)
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)


@ -0,0 +1,684 @@
# coding=utf-8
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch OpenAI GPT-2 model."""
import collections
import copy
import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter
from .file_utils import cached_path
from .modeling import BertLayerNorm as LayerNorm
logger = logging.getLogger(__name__)
PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin"}
PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json"}
CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"
def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
""" Load tf checkpoints in a pytorch model
"""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
tf_path = os.path.abspath(gpt2_checkpoint_path)
print("Converting TensorFlow checkpoint from {}".format(tf_path))
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array.squeeze())
for name, array in zip(names, arrays):
name = name[6:] # skip "model/"
name = name.split('/')
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+\d+', m_name):
l = re.split(r'(\d+)', m_name)
else:
l = [m_name]
if l[0] == 'w' or l[0] == 'g':
pointer = getattr(pointer, 'weight')
elif l[0] == 'b':
pointer = getattr(pointer, 'bias')
elif l[0] == 'wpe' or l[0] == 'wte':
pointer = getattr(pointer, l[0])
pointer = getattr(pointer, 'weight')
else:
pointer = getattr(pointer, l[0])
if len(l) >= 2:
num = int(l[1])
pointer = pointer[num]
try:
assert pointer.shape == array.shape
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
print("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
return model
def gelu(x):
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
class GPT2Config(object):
"""Configuration class to store the configuration of a `GPT2Model`.
"""
def __init__(
self,
vocab_size_or_config_json_file=50257,
n_positions=1024,
n_ctx=1024,
n_embd=768,
n_layer=12,
n_head=12,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
):
"""Constructs GPT2Config.
Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
"""
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.vocab_size = vocab_size_or_config_json_file
self.n_ctx = n_ctx
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
else:
raise ValueError(
"First argument must be either a vocabulary size (int)"
"or the path to a pretrained model config file (str)"
)
@classmethod
def from_dict(cls, json_object):
"""Constructs a `GPT2Config` from a Python dictionary of parameters."""
config = GPT2Config(vocab_size_or_config_json_file=-1)
for key, value in json_object.items():
config.__dict__[key] = value
return config
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `GPT2Config` from a json file of parameters."""
with open(json_file, "r", encoding="utf-8") as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
def __repr__(self):
return str(self.to_json_string())
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
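A small round-trip sketch for the configuration class; the import path is an assumption based on the pytorch_pretrained_bert package layout.
```python
import json
from pytorch_pretrained_bert.modeling_gpt2 import GPT2Config  # assumed module path

config = GPT2Config(vocab_size_or_config_json_file=50257, n_embd=768, n_layer=12, n_head=12)
as_json = config.to_json_string()
rebuilt = GPT2Config.from_dict(json.loads(as_json))
print(rebuilt.n_embd, rebuilt.n_layer, rebuilt.n_head)   # 768 12 12
```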
class Conv1D(nn.Module):
def __init__(self, nf, nx):
super(Conv1D, self).__init__()
self.nf = nf
w = torch.empty(nx, nf)
nn.init.normal_(w, std=0.02)
self.weight = Parameter(w)
self.bias = Parameter(torch.zeros(nf))
def forward(self, x):
size_out = x.size()[:-1] + (self.nf,)
x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
x = x.view(*size_out)
return x
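Conv1D mirrors the 1x1 "convolution" of the original TF code: it is a linear projection whose weight is stored transposed relative to nn.Linear. A quick equivalence check, assuming the Conv1D class above is in scope:
```python
import torch
import torch.nn as nn

conv = Conv1D(6, 4)                         # nf=6 outputs, nx=4 inputs
linear = nn.Linear(4, 6)
with torch.no_grad():
    linear.weight.copy_(conv.weight.t())    # Conv1D stores the weight as (nx, nf)
    linear.bias.copy_(conv.bias)

x = torch.randn(2, 3, 4)
print(torch.allclose(conv(x), linear(x), atol=1e-6))   # True
```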
class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False):
super(Attention, self).__init__()
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
assert n_state % config.n_head == 0
self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
self.c_attn = Conv1D(n_state * 3, nx)
self.c_proj = Conv1D(n_state, nx)
def _attn(self, q, k, v):
w = torch.matmul(q, k)
if self.scale:
w = w / math.sqrt(v.size(-1))
nd, ns = w.size(-2), w.size(-1)
b = self.bias[:, :, ns-nd:ns, :ns]
w = w * b - 1e10 * (1 - b)
w = nn.Softmax(dim=-1)(w)
return torch.matmul(w, v)
def merge_heads(self, x):
x = x.permute(0, 2, 1, 3).contiguous()
new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
def split_heads(self, x, k=False):
new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
if k:
return x.permute(0, 2, 3, 1) # (batch, head, head_features, seq_length)
else:
return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
def forward(self, x, layer_past=None):
x = self.c_attn(x)
query, key, value = x.split(self.split_size, dim=2)
query = self.split_heads(query)
key = self.split_heads(key, k=True)
value = self.split_heads(value)
if layer_past is not None:
past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1] # transpose back cf below
key = torch.cat((past_key, key), dim=-1)
value = torch.cat((past_value, value), dim=-2)
present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking
a = self._attn(query, key, value)
a = self.merge_heads(a)
a = self.c_proj(a)
return a, present
class MLP(nn.Module):
def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
super(MLP, self).__init__()
nx = config.n_embd
self.c_fc = Conv1D(n_state, nx)
self.c_proj = Conv1D(nx, n_state)
self.act = gelu
def forward(self, x):
h = self.act(self.c_fc(x))
h2 = self.c_proj(h)
return h2
class Block(nn.Module):
def __init__(self, n_ctx, config, scale=False):
super(Block, self).__init__()
nx = config.n_embd
self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
self.attn = Attention(nx, n_ctx, config, scale)
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
self.mlp = MLP(4 * nx, config)
def forward(self, x, layer_past=None):
a, present = self.attn(self.ln_1(x), layer_past=layer_past)
x = x + a
m = self.mlp(self.ln_2(x))
x = x + m
return x, present
class GPT2LMHead(nn.Module):
""" Language Model Head for the transformer """
def __init__(self, model_embeddings_weights, config):
super(GPT2LMHead, self).__init__()
self.n_embd = config.n_embd
self.set_embeddings_weights(model_embeddings_weights)
def set_embeddings_weights(self, model_embeddings_weights):
embed_shape = model_embeddings_weights.shape
self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
self.decoder.weight = model_embeddings_weights # Tied weights
def forward(self, hidden_state):
# Truncated Language modeling logits (we remove the last token)
# h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
lm_logits = self.decoder(hidden_state)
return lm_logits
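The head's decoder reuses the token embedding matrix as its weight, which is why the loader calls set_tied() after loading a state dict. A toy sketch of that sharing; `_Cfg` is a hypothetical minimal stand-in for GPT2Config, and the GPT2LMHead class above is assumed to be in scope:
```python
import torch
import torch.nn as nn

wte = nn.Embedding(1000, 32)                   # toy token embedding (vocab 1000, n_embd 32)
class _Cfg: n_embd = 32                        # hypothetical minimal stand-in for GPT2Config
lm_head = GPT2LMHead(wte.weight, _Cfg())
print(lm_head.decoder.weight is wte.weight)    # True: the logits layer reuses the embedding matrix
hidden = torch.randn(2, 5, 32)
print(lm_head(hidden).shape)                   # torch.Size([2, 5, 1000])
```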
class GPT2MultipleChoiceHead(nn.Module):
""" Classifier Head for the transformer """
def __init__(self, config):
super(GPT2MultipleChoiceHead, self).__init__()
self.n_embd = config.n_embd
self.linear = nn.Linear(config.n_embd, 1)
nn.init.normal_(self.linear.weight, std=0.02)
nn.init.normal_(self.linear.bias, 0)
def forward(self, hidden_states, mc_token_ids):
# Classification logits
# hidden_state (bsz, num_choices, seq_length, hidden_size)
# mc_token_ids (bsz, num_choices)
mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
# (bsz, num_choices, 1, hidden_size)
multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
# (bsz, num_choices, hidden_size)
multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
# (bsz, num_choices)
return multiple_choice_logits
class GPT2PreTrainedModel(nn.Module):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super(GPT2PreTrainedModel, self).__init__()
if not isinstance(config, GPT2Config):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `GPT2Config`. "
"To create a model from a pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__
)
)
self.config = config
def set_tied(self):
pass
def init_weights(self, module):
""" Initialize the weights.
"""
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
@classmethod
def from_pretrained(
cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs
):
"""
Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `gpt2`
- a path or url to a pretrained model archive containing:
. `config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
- a path or url to a pretrained model archive containing:
. `config.json` a configuration file for the model
. a TensorFlow checkpoint with trained weights
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific GPT-2 class
(ex: num_labels for BertForSequenceClassification)
"""
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
archive_file, config_file
)
)
return None
if resolved_archive_file == archive_file and resolved_config_file == config_file:
logger.info("loading weights file {}".format(archive_file))
logger.info("loading configuration file {}".format(config_file))
else:
logger.info("loading weights file {} from cache at {}".format(
archive_file, resolved_archive_file))
logger.info("loading configuration file {} from cache at {}".format(
config_file, resolved_config_file))
# Load config
config = GPT2Config.from_json_file(resolved_config_file)
logger.info("Model config {}".format(config))
# Instantiate model.
model = cls(config, *inputs, **kwargs)
if state_dict is None and not from_tf:
state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None)
if from_tf:
# Directly load from a TensorFlow checkpoint (stored as NumPy array)
return load_tf_weights_in_gpt2(model, resolved_archive_file)
old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if key.endswith(".g"):
new_key = key[:-2] + ".weight"
elif key.endswith(".b"):
new_key = key[:-2] + ".bias"
elif key.endswith(".w"):
new_key = key[:-2] + ".weight"
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
missing_keys = []
unexpected_keys = []
error_msgs = []
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, "_metadata", None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
def load(module, prefix=""):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + ".")
start_model = model
if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()):
start_model = model.transformer
load(start_model, prefix="")
if len(missing_keys) > 0:
logger.info(
"Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
)
if len(unexpected_keys) > 0:
logger.info(
"Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
)
if len(error_msgs) > 0:
raise RuntimeError(
"Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
)
# Make sure we are still sharing the output and input embeddings after loading weights
model.set_tied()
return model
class GPT2Model(GPT2PreTrainedModel):
"""OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").
Params:
config: a GPT2Config class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
Outputs:
`hidden_states`: the encoded-hidden-states at the top of the model
as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
(or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
config = modeling_gpt2.GPT2Config()
model = modeling_gpt2.GPT2Model(config)
hidden_states = model(input_ids)
```
"""
def __init__(self, config):
super(GPT2Model, self).__init__(config)
self.wte = nn.Embedding(config.vocab_size, config.n_embd)
self.wpe = nn.Embedding(config.n_positions, config.n_embd)
block = Block(config.n_ctx, config, scale=True)
self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
self.apply(self.init_weights)
def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
if past is None:
past_length = 0
past = [None] * len(self.h)
else:
past_length = past[0][0].size(-2)
if position_ids is None:
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_ids.size(-1))
position_ids = position_ids.view(-1, position_ids.size(-1))
inputs_embeds = self.wte(input_ids)
position_embeds = self.wpe(position_ids)
if token_type_ids is not None:
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
token_type_embeds = self.wte(token_type_ids)
else:
token_type_embeds = 0
hidden_states = inputs_embeds + position_embeds + token_type_embeds
presents = []
for block, layer_past in zip(self.h, past):
hidden_states, present = block(hidden_states, layer_past)
presents.append(present)
hidden_states = self.ln_f(hidden_states)
output_shape = input_shape + (hidden_states.size(-1),)
return hidden_states.view(*output_shape), presents
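The new past/presents pair caches each block's keys and values, so a follow-up call only has to process the new tokens. A sketch with a tiny randomly initialized model; the hyperparameters and import path are chosen only for illustration.
```python
import torch
from pytorch_pretrained_bert.modeling_gpt2 import GPT2Config, GPT2Model  # assumed module path

config = GPT2Config(vocab_size_or_config_json_file=1000, n_positions=64, n_ctx=64,
                    n_embd=32, n_layer=2, n_head=2)
model = GPT2Model(config)
model.eval()

with torch.no_grad():
    prompt = torch.randint(0, 1000, (1, 5))
    hidden, presents = model(prompt)                           # full prompt, cache returned in `presents`
    next_token = torch.randint(0, 1000, (1, 1))
    hidden_step, presents = model(next_token, past=presents)   # only the new token is processed
print(hidden.shape, hidden_step.shape)   # torch.Size([1, 5, 32]) torch.Size([1, 1, 32])
```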
class GPT2LMHeadModel(GPT2PreTrainedModel):
"""OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners").
Params:
config: a GPT2Config class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
Outputs:
if `lm_labels` is not `None`:
Outputs the language modeling loss.
else:
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, config.vocab_size]
(or more generally [d_1, ..., d_n, config.vocab_size] where d_1 ... d_n are the dimensions of input_ids)
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
config = modeling_gpt2.GPT2Config()
model = modeling_gpt2.GPT2LMHeadModel(config)
lm_logits = model(input_ids)
```
"""
def __init__(self, config):
super(GPT2LMHeadModel, self).__init__(config)
self.transformer = GPT2Model(config)
self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
self.apply(self.init_weights)
def set_tied(self):
""" Make sure we are sharing the embeddings
"""
self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):
hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
lm_logits = self.lm_head(hidden_states)
if lm_labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
return loss
return lm_logits, presents
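# Illustrative sketch (not part of the original file): how the `presents` output can be fed
# back as `past` for stepwise decoding, so cached keys/values are reused and only the new
# token is processed. Assumes `model` is a GPT2LMHeadModel and `input_ids` a LongTensor of
# BPE token ids of shape (batch_size, sequence_length).
def _example_stepwise_decoding(model, input_ids):
    with torch.no_grad():
        lm_logits, presents = model(input_ids)                     # encode the full prompt (past=None)
        next_token = lm_logits[:, -1, :].argmax(-1, keepdim=True)  # greedy pick of the next token
        lm_logits, presents = model(next_token, past=presents)     # feed only the new token plus the cache
    return next_token, presents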
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
"""OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners").
Params:
config: a GPT2Config class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
indices selected in the range [0, config.vocab_size[
`mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., config.vocab_size]
`multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
Outputs:
if `lm_labels` and `multiple_choice_labels` are not `None`:
Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
else: a tuple with
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, config.vocab_size]
`multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
`presents`: a list of torch.FloatTensor containing the pre-computed key/value attention states of each block, to be fed back as `past`
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choices, seq length)
mc_token_ids = torch.LongTensor([[2, 1]])  # (bsz, number of choices)
config = modeling_gpt2.GPT2Config()
model = modeling_gpt2.GPT2DoubleHeadsModel(config)
lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)
```
"""
def __init__(self, config):
super(GPT2DoubleHeadsModel, self).__init__(config)
self.transformer = GPT2Model(config)
self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
self.multiple_choice_head = GPT2MultipleChoiceHead(config)
self.apply(self.init_weights)
def set_tied(self):
""" Make sure we are sharing the embeddings
"""
self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None):
hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
lm_logits = self.lm_head(hidden_states)
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
losses = []
if lm_labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
if mc_labels is not None:
loss_fct = CrossEntropyLoss()
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
if losses:
return losses
return lm_logits, mc_logits, presents
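# Illustrative sketch (not part of the original file): combining the two losses returned by
# GPT2DoubleHeadsModel during fine-tuning. `lm_coef` is an assumed weighting hyper-parameter,
# not something defined in this file.
def _example_double_heads_loss(model, input_ids, mc_token_ids, lm_labels, mc_labels, lm_coef=0.5):
    lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
    return lm_coef * lm_loss + mc_loss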

View File

@ -0,0 +1,810 @@
# coding=utf-8
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch OpenAI GPT model."""
import collections
import copy
import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter
from .file_utils import cached_path
from .modeling import BertLayerNorm as LayerNorm
logger = logging.getLogger(__name__)
PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"
def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
""" Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
"""
import re
import numpy as np
print("Loading weights...")
names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
offsets = np.cumsum([np.prod(shape) for shape in shapes])
init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
# This was used when we had a single embedding matrix for positions and tokens
# init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
# del init_params[1]
init_params = [arr.squeeze() for arr in init_params]
try:
assert model.tokens_embed.weight.shape == init_params[1].shape
assert model.positions_embed.weight.shape == init_params[0].shape
except AssertionError as e:
e.args += (model.tokens_embed.weight.shape, init_params[1].shape)
e.args += (model.positions_embed.weight.shape, init_params[0].shape)
raise
model.tokens_embed.weight.data = torch.from_numpy(init_params[1])
model.positions_embed.weight.data = torch.from_numpy(init_params[0])
names.pop(0)
# Pop position and token embedding arrays
init_params.pop(0)
init_params.pop(0)
for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]):
name = name[6:] # skip "model/"
assert name[-2:] == ":0"
name = name[:-2]
name = name.split('/')
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+\d+', m_name):
l = re.split(r'(\d+)', m_name)
else:
l = [m_name]
if l[0] == 'g':
pointer = getattr(pointer, 'weight')
elif l[0] == 'b':
pointer = getattr(pointer, 'bias')
elif l[0] == 'w':
pointer = getattr(pointer, 'weight')
else:
pointer = getattr(pointer, l[0])
if len(l) >= 2:
num = int(l[1])
pointer = pointer[num]
try:
assert pointer.shape == array.shape
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
print("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
return model
def gelu(x):
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
def swish(x):
return x * torch.sigmoid(x)
ACT_FNS = {"relu": nn.functional.relu, "swish": swish, "gelu": gelu}  # use the functional relu so every entry is a callable on tensors
class OpenAIGPTConfig(object):
"""Configuration class to store the configuration of a `OpenAIGPTModel`.
"""
def __init__(
self,
vocab_size_or_config_json_file=40478,
n_special=0,
n_positions=512,
n_ctx=512,
n_embd=768,
n_layer=12,
n_head=12,
afn="gelu",
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
):
"""Constructs OpenAIGPTConfig.
Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLS]', ...)
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
afn: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
resid_pdrop: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
layer_norm_epsilon: epsilon to use in the layer norm layers
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
"""
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.vocab_size = vocab_size_or_config_json_file
self.n_special = n_special
self.n_ctx = n_ctx
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.afn = afn
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
else:
raise ValueError(
"First argument must be either a vocabulary size (int)"
"or the path to a pretrained model config file (str)"
)
@property
def total_tokens_embeddings(self):
return self.vocab_size + self.n_special
@classmethod
def from_dict(cls, json_object):
"""Constructs a `OpenAIGPTConfig` from a Python dictionary of parameters."""
config = OpenAIGPTConfig(vocab_size_or_config_json_file=-1)
for key, value in json_object.items():
config.__dict__[key] = value
return config
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `OpenAIGPTConfig` from a json file of parameters."""
with open(json_file, "r", encoding="utf-8") as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
def __repr__(self):
return str(self.to_json_string())
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
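# Illustrative sketch (not part of the original file): building a config and round-tripping it
# through its JSON representation; `n_special=2` is an arbitrary example value.
def _example_config_roundtrip():
    config = OpenAIGPTConfig(vocab_size_or_config_json_file=40478, n_special=2)
    same_config = OpenAIGPTConfig.from_dict(json.loads(config.to_json_string()))
    assert same_config.total_tokens_embeddings == 40478 + 2
    return same_config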
class Conv1D(nn.Module):
def __init__(self, nf, rf, nx):
super(Conv1D, self).__init__()
self.rf = rf
self.nf = nf
if rf == 1: # faster 1x1 conv
w = torch.empty(nx, nf)
nn.init.normal_(w, std=0.02)
self.weight = Parameter(w)
self.bias = Parameter(torch.zeros(nf))
else: # was used to train LM
raise NotImplementedError
def forward(self, x):
if self.rf == 1:
size_out = x.size()[:-1] + (self.nf,)
x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
x = x.view(*size_out)
else:
raise NotImplementedError
return x
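# Illustrative sketch (not part of the original file): with rf=1, Conv1D is simply a linear
# projection of the last dimension (the weight is stored transposed w.r.t. nn.Linear).
def _example_conv1d_shapes():
    conv = Conv1D(nf=2304, rf=1, nx=768)   # e.g. the fused qkv projection: 768 -> 3 * 768
    x = torch.rand(2, 5, 768)              # (batch, seq, n_embd)
    return conv(x).size()                  # torch.Size([2, 5, 2304])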
class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False):
super(Attention, self).__init__()
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
assert n_state % config.n_head == 0
self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
self.c_attn = Conv1D(n_state * 3, 1, nx)
self.c_proj = Conv1D(n_state, 1, nx)
self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.resid_dropout = nn.Dropout(config.resid_pdrop)
def _attn(self, q, k, v):
w = torch.matmul(q, k)
if self.scale:
w = w / math.sqrt(v.size(-1))
# w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights
# XD: self.b may be larger than w, so we need to crop it
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
w = w * b + -1e9 * (1 - b)
w = nn.Softmax(dim=-1)(w)
w = self.attn_dropout(w)
return torch.matmul(w, v)
def merge_heads(self, x):
x = x.permute(0, 2, 1, 3).contiguous()
new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
def split_heads(self, x, k=False):
new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
if k:
return x.permute(0, 2, 3, 1)
else:
return x.permute(0, 2, 1, 3)
def forward(self, x):
x = self.c_attn(x)
query, key, value = x.split(self.split_size, dim=2)
query = self.split_heads(query)
key = self.split_heads(key, k=True)
value = self.split_heads(value)
a = self._attn(query, key, value)
a = self.merge_heads(a)
a = self.c_proj(a)
a = self.resid_dropout(a)
return a
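# Illustrative sketch (not part of the original file): the registered `bias` buffer is a
# lower-triangular causal mask, so position i can only attend to positions <= i.
def _example_causal_mask(n_ctx=4):
    # For n_ctx=4 this returns:
    # [[1, 0, 0, 0],
    #  [1, 1, 0, 0],
    #  [1, 1, 1, 0],
    #  [1, 1, 1, 1]]
    return torch.tril(torch.ones(n_ctx, n_ctx))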
class MLP(nn.Module):
def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
super(MLP, self).__init__()
nx = config.n_embd
self.c_fc = Conv1D(n_state, 1, nx)
self.c_proj = Conv1D(nx, 1, n_state)
self.act = ACT_FNS[config.afn]
self.dropout = nn.Dropout(config.resid_pdrop)
def forward(self, x):
h = self.act(self.c_fc(x))
h2 = self.c_proj(h)
return self.dropout(h2)
class Block(nn.Module):
def __init__(self, n_ctx, config, scale=False):
super(Block, self).__init__()
nx = config.n_embd
self.attn = Attention(nx, n_ctx, config, scale)
self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
self.mlp = MLP(4 * nx, config)
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
def forward(self, x):
a = self.attn(x)
n = self.ln_1(x + a)
m = self.mlp(n)
h = self.ln_2(n + m)
return h
class OpenAIGPTLMHead(nn.Module):
""" Language Model Head for the transformer """
def __init__(self, model_embeddings_weights, config):
super(OpenAIGPTLMHead, self).__init__()
self.n_embd = config.n_embd
self.set_embeddings_weights(model_embeddings_weights)
def set_embeddings_weights(self, model_embeddings_weights):
embed_shape = model_embeddings_weights.shape
self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
self.decoder.weight = model_embeddings_weights # Tied weights
def forward(self, hidden_state):
# Truncated Language modeling logits (we remove the last token)
# h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
lm_logits = self.decoder(hidden_state)
return lm_logits
class OpenAIGPTMultipleChoiceHead(nn.Module):
""" Classifier Head for the transformer """
def __init__(self, config):
super(OpenAIGPTMultipleChoiceHead, self).__init__()
self.n_embd = config.n_embd
# self.multiple_choice_token = multiple_choice_token
self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation
self.linear = nn.Linear(config.n_embd, 1)
nn.init.normal_(self.linear.weight, std=0.02)
nn.init.normal_(self.linear.bias, 0)
def forward(self, hidden_states, mc_token_ids):
# Classification logits
# hidden_state (bsz, num_choices, seq_length, hidden_size)
# mc_token_ids (bsz, num_choices)
mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
# (bsz, num_choices, 1, hidden_size)
multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
# (bsz, num_choices, hidden_size)
multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
# (bsz, num_choices)
return multiple_choice_logits
class OpenAIGPTPreTrainedModel(nn.Module):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super(OpenAIGPTPreTrainedModel, self).__init__()
if not isinstance(config, OpenAIGPTConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `OpenAIGPTConfig`. "
"To create a model from a pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__
)
)
self.config = config
def init_weights(self, module):
""" Initialize the weights.
"""
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
def set_num_special_tokens(self, num_special_tokens):
pass
@classmethod
def from_pretrained(
cls, pretrained_model_name_or_path, num_special_tokens=None, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs
):
"""
Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `openai-gpt`
- a path or url to a pretrained model archive containing:
. `config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
- a path or url to a pretrained model archive containing:
. `config.json` a configuration file for the model
. a series of NumPy files containing OpenAI TensorFlow trained weights
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of the pre-trained weights
*inputs, **kwargs: additional inputs for the specific OpenAI-GPT class
"""
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
archive_file, config_file
)
)
return None
if resolved_archive_file == archive_file and resolved_config_file == config_file:
logger.info("loading weights file {}".format(archive_file))
logger.info("loading configuration file {}".format(config_file))
else:
logger.info("loading weights file {} from cache at {}".format(
archive_file, resolved_archive_file))
logger.info("loading configuration file {} from cache at {}".format(
config_file, resolved_config_file))
# Load config
config = OpenAIGPTConfig.from_json_file(resolved_config_file)
logger.info("Model config {}".format(config))
# Instantiate model.
model = cls(config, *inputs, **kwargs)
if state_dict is None and not from_tf:
state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None)
if from_tf:
# Directly load from a TensorFlow checkpoint (stored as NumPy array)
return load_tf_weights_in_openai_gpt(model, resolved_archive_file)
old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if key.endswith(".g"):
new_key = key[:-2] + ".weight"
elif key.endswith(".b"):
new_key = key[:-2] + ".bias"
elif key.endswith(".w"):
new_key = key[:-2] + ".weight"
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
missing_keys = []
unexpected_keys = []
error_msgs = []
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, "_metadata", None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
def load(module, prefix=""):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + ".")
start_model = model
if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()):
start_model = model.transformer
load(start_model, prefix="")
if len(missing_keys) > 0:
logger.info(
"Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
)
if len(unexpected_keys) > 0:
logger.info(
"Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
)
if len(error_msgs) > 0:
raise RuntimeError(
"Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
)
# Add additional embeddings for special tokens if needed
# This step also make sure we are still sharing the output and input embeddings after loading weights
model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special)
return model
class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
"""OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
Special token embeddings are additional embeddings that are not pre-trained: [SEP], [CLS]...
Special tokens need to be trained during the fine-tuning if you use them.
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
The embeddings are ordered as follows in the token embedding matrix:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
config.vocab_size,
... -> special embeddings
config.vocab_size + config.n_special - 1] ______________________
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
total_tokens_embeddings = config.vocab_size + config.n_special
You should use the associated indices to index the embeddings.
Params:
config: a OpenAIGPTConfig class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
Outputs:
`hidden_states`: the encoded-hidden-states at the top of the model
as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
(or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
config = modeling_openai.OpenAIGPTConfig()
model = modeling_openai.OpenAIGPTModel(config)
hidden_states = model(input_ids)
```
"""
def __init__(self, config):
super(OpenAIGPTModel, self).__init__(config)
num_tokens = config.vocab_size + config.n_special
self.tokens_embed = nn.Embedding(num_tokens, config.n_embd)
self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
self.drop = nn.Dropout(config.embd_pdrop)
block = Block(config.n_ctx, config, scale=True)
self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
self.apply(self.init_weights)
# nn.init.normal_(self.embed.weight, std=0.02)
def set_num_special_tokens(self, num_special_tokens):
" Update input embeddings with new embedding matrice if needed "
if self.config.n_special == num_special_tokens:
return
# Update config
self.config.n_special = num_special_tokens
# Build new embeddings and initialize them
old_embed = self.tokens_embed
self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
# Initialize all new embeddings (in particular the special tokens)
self.init_weights(self.tokens_embed)
# Copy word and positional embeddings from the previous weights
self.tokens_embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :]
self.tokens_embed.weight.data[-self.config.n_positions :, :] = old_embed.weight.data[-self.config.n_positions :, :]
def forward(self, input_ids, position_ids=None, token_type_ids=None):
if position_ids is None:
# This was used when we had a single embedding matrix for position and token embeddings
# start = self.config.vocab_size + self.config.n_special
# end = start + input_ids.size(-1)
# position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_ids.size(-1))
position_ids = position_ids.view(-1, position_ids.size(-1))
inputs_embeds = self.tokens_embed(input_ids)
position_embeds = self.positions_embed(position_ids)
if token_type_ids is not None:
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
token_type_embeds = self.tokens_embed(token_type_ids)
else:
token_type_embeds = 0
# Add the position information to the input embeddings
# h = e.sum(dim=2)
hidden_states = inputs_embeds + position_embeds + token_type_embeds
for block in self.h:
hidden_states = block(hidden_states)
output_shape = input_shape + (hidden_states.size(-1),)
return hidden_states.view(*output_shape)
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
"""OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training").
OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
Special token embeddings are additional embeddings that are not pre-trained: [SEP], [CLS]...
Special tokens need to be trained during the fine-tuning if you use them.
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
The embeddings are ordered as follows in the token embedding matrix:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
config.vocab_size,
... -> special embeddings
config.vocab_size + config.n_special - 1] ______________________
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
total_tokens_embeddings = config.vocab_size + config.n_special
You should use the associated indices to index the embeddings.
Params:
config: a OpenAIGPTConfig class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
Outputs:
if `lm_labels` is not `None`:
Outputs the language modeling loss.
else:
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings]
(or more generally [d_1, ..., d_n, total_tokens_embeddings] where d_1 ... d_n are the dimensions of input_ids)
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
config = modeling_openai.OpenAIGPTConfig()
model = modeling_openai.OpenAIGPTLMHeadModel(config)
lm_logits = model(input_ids)
```
"""
def __init__(self, config):
super(OpenAIGPTLMHeadModel, self).__init__(config)
self.transformer = OpenAIGPTModel(config)
self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens):
""" Update input and output embeddings with new embedding matrice
Make sure we are sharing the embeddings
"""
self.transformer.set_num_special_tokens(num_special_tokens)
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None):
hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
lm_logits = self.lm_head(hidden_states)
if lm_labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
return loss
return lm_logits
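# Illustrative sketch (not part of the original file): loading the pretrained weights, reserving
# two special-token embeddings, and computing the LM loss. In this version the caller is expected
# to pass `lm_labels` already aligned with the logits (next-token targets), with -1 at positions
# to ignore.
def _example_lm_finetune_step(input_ids, lm_labels):
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt', num_special_tokens=2)
    loss = model(input_ids, lm_labels=lm_labels)
    loss.backward()
    return loss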
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
"""OpenAI GPT model with a Language Modeling and a Multiple Choice head ("Improving Language Understanding by Generative Pre-Training").
OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
Special token embeddings are additional embeddings that are not pre-trained: [SEP], [CLS]...
Special tokens need to be trained during the fine-tuning if you use them.
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
The embeddings are ordered as follows in the token embedding matrix:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
config.vocab_size,
... -> special embeddings
config.vocab_size + config.n_special - 1] ______________________
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
total_tokens_embeddings = config.vocab_size + config.n_special
You should use the associated indices to index the embeddings.
Params:
config: a OpenAIGPTConfig class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
indices selected in the range [0, total_tokens_embeddings[
`mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [0, config.n_positions - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., total_tokens_embeddings]
`multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
Outputs:
if `lm_labels` and `multiple_choice_labels` are not `None`:
Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
else: a tuple with
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings]
`multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choices, seq length)
mc_token_ids = torch.LongTensor([[2, 1]])  # (bsz, number of choices)
config = modeling_openai.OpenAIGPTConfig()
model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)
lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
```
"""
def __init__(self, config):
super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
self.transformer = OpenAIGPTModel(config)
self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config)
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens):
""" Update input and output embeddings with new embedding matrice
Make sure we are sharing the embeddings
"""
self.transformer.set_num_special_tokens(num_special_tokens)
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None):
hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
lm_logits = self.lm_head(hidden_states)
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
losses = []
if lm_labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
if mc_labels is not None:
loss_fct = CrossEntropyLoss()
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
if losses:
return losses
return lm_logits, mc_logits

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,402 @@
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Utilities for PyTorch Transformer XL model.
Directly adapted from https://github.com/kimiyoung/transformer-xl.
"""
from collections import defaultdict
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
# CUDA_MAJOR = int(torch.version.cuda.split('.')[0])
# CUDA_MINOR = int(torch.version.cuda.split('.')[1])
class ProjectedAdaptiveLogSoftmax(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
keep_order=False):
super(ProjectedAdaptiveLogSoftmax, self).__init__()
self.n_token = n_token
self.d_embed = d_embed
self.d_proj = d_proj
self.cutoffs = cutoffs + [n_token]
self.cutoff_ends = [0] + self.cutoffs
self.div_val = div_val
self.shortlist_size = self.cutoffs[0]
self.n_clusters = len(self.cutoffs) - 1
self.head_size = self.shortlist_size + self.n_clusters
if self.n_clusters > 0:
self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
self.out_layers = nn.ModuleList()
self.out_projs = nn.ParameterList()
if div_val == 1:
for i in range(len(self.cutoffs)):
if d_proj != d_embed:
self.out_projs.append(
nn.Parameter(torch.Tensor(d_proj, d_embed))
)
else:
self.out_projs.append(None)
self.out_layers.append(nn.Linear(d_embed, n_token))
else:
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
d_emb_i = d_embed // (div_val ** i)
self.out_projs.append(
nn.Parameter(torch.Tensor(d_proj, d_emb_i))
)
self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx))
self.keep_order = keep_order
def _compute_logit(self, hidden, weight, bias, proj):
if proj is None:
logit = F.linear(hidden, weight, bias=bias)
else:
# if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1:
proj_hid = F.linear(hidden, proj.t().contiguous())
logit = F.linear(proj_hid, weight, bias=bias)
# else:
# logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))
# if bias is not None:
# logit = logit + bias
return logit
def forward(self, hidden, target=None, keep_order=False):
'''
Params:
hidden :: [len*bsz x d_proj]
target :: [len*bsz]
Return:
if target is None:
out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
else:
out :: [len*bsz] Negative log likelihood
We could replace this implementation with the native PyTorch one
if it had an option to set the bias on all clusters.
here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
'''
if target is not None:
target = target.view(-1)
if hidden.size(0) != target.size(0):
raise RuntimeError('Input and target should have the same size '
'in the batch dimension.')
if self.n_clusters == 0:
logit = self._compute_logit(hidden, self.out_layers[0].weight,
self.out_layers[0].bias, self.out_projs[0])
if target is not None:
output = -F.log_softmax(logit, dim=-1) \
.gather(1, target.unsqueeze(1)).squeeze(1)
else:
output = F.log_softmax(logit, dim=-1)
else:
# construct weights and biases
weights, biases = [], []
for i in range(len(self.cutoffs)):
if self.div_val == 1:
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
weight_i = self.out_layers[0].weight[l_idx:r_idx]
bias_i = self.out_layers[0].bias[l_idx:r_idx]
else:
weight_i = self.out_layers[i].weight
bias_i = self.out_layers[i].bias
if i == 0:
weight_i = torch.cat(
[weight_i, self.cluster_weight], dim=0)
bias_i = torch.cat(
[bias_i, self.cluster_bias], dim=0)
weights.append(weight_i)
biases.append(bias_i)
head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
head_logprob = F.log_softmax(head_logit, dim=1)
if target is None:
out = hidden.new_empty((head_logit.size(0), self.n_token))
else:
out = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device)
offset = 0
cutoff_values = [0] + self.cutoffs
for i in range(len(cutoff_values) - 1):
l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]
if target is not None:
mask_i = (target >= l_idx) & (target < r_idx)
indices_i = mask_i.nonzero().squeeze()
if indices_i.numel() == 0:
continue
target_i = target.index_select(0, indices_i) - l_idx
head_logprob_i = head_logprob.index_select(0, indices_i)
hidden_i = hidden.index_select(0, indices_i)
else:
hidden_i = hidden
if i == 0:
if target is not None:
logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)
else:
out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
else:
weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster
if target is not None:
logprob_i = head_logprob_i[:, cluster_prob_idx] \
+ tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)
else:
logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
out[:, l_idx:r_idx] = logprob_i
if target is not None:
if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
out.index_copy_(0, indices_i, -logprob_i)
else:
out[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
offset += logprob_i.size(0)
return out
def log_prob(self, hidden):
r""" Computes log probabilities for all :math:`n\_classes`
From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
Args:
hidden (Tensor): a minibatch of examples
Returns:
log-probabilities of for each class :math:`c`
in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a
parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor.
Shape:
- Input: :math:`(N, in\_features)`
- Output: :math:`(N, n\_classes)`
"""
if self.n_clusters == 0:
logit = self._compute_logit(hidden, self.out_layers[0].weight,
self.out_layers[0].bias, self.out_projs[0])
return F.log_softmax(logit, dim=-1)
else:
# construct weights and biases
weights, biases = [], []
for i in range(len(self.cutoffs)):
if self.div_val == 1:
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
weight_i = self.out_layers[0].weight[l_idx:r_idx]
bias_i = self.out_layers[0].bias[l_idx:r_idx]
else:
weight_i = self.out_layers[i].weight
bias_i = self.out_layers[i].bias
if i == 0:
weight_i = torch.cat(
[weight_i, self.cluster_weight], dim=0)
bias_i = torch.cat(
[bias_i, self.cluster_bias], dim=0)
weights.append(weight_i)
biases.append(bias_i)
head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
out = hidden.new_empty((head_logit.size(0), self.n_token))
head_logprob = F.log_softmax(head_logit, dim=1)
cutoff_values = [0] + self.cutoffs
for i in range(len(cutoff_values) - 1):
start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]
if i == 0:
out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
else:
weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
logprob_i = head_logprob[:, self.cutoffs[0] + i - 1, None] + tail_logprob_i  # same cluster index convention as in forward()
out[:, start_idx:stop_idx] = logprob_i
return out
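# Illustrative sketch (not part of the original file): a small adaptive softmax with two tail
# clusters. `hidden` is the flattened model output and `target` the flattened token ids; the
# sizes below are arbitrary example values.
def _example_adaptive_softmax():
    crit = ProjectedAdaptiveLogSoftmax(n_token=1000, d_embed=32, d_proj=32,
                                       cutoffs=[100, 500], div_val=1)
    hidden = torch.rand(6, 32)                          # [len*bsz, d_proj]
    target = torch.tensor([0, 5, 150, 250, 600, 900])   # [len*bsz], ids spread over the clusters
    nll = crit(hidden, target)                          # [len*bsz] negative log likelihood
    logprobs = crit(hidden)                             # [len*bsz, n_token] log probabilities
    return nll, logprobs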
class LogUniformSampler(object):
def __init__(self, range_max, n_sample):
"""
Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
`P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
expected count can be approximated by 1 - (1 - p)^n
and we use a numerically stable version -expm1(num_tries * log1p(-p))
Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run
"""
with torch.no_grad():
self.range_max = range_max
log_indices = torch.arange(1., range_max+2., 1.).log_()
self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
# print('P', self.dist.numpy().tolist()[-30:])
self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
self.n_sample = n_sample
def sample(self, labels):
"""
labels: [b1, b2]
Return
true_log_probs: [b1, b2]
samp_log_probs: [n_sample]
neg_samples: [n_sample]
"""
# neg_samples = torch.empty(0).long()
n_sample = self.n_sample
n_tries = 2 * n_sample
with torch.no_grad():
neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique()
device = labels.device
neg_samples = neg_samples.to(device)
true_log_probs = self.log_q[labels].to(device)
samp_log_probs = self.log_q[neg_samples].to(device)
return true_log_probs, samp_log_probs, neg_samples
def sample_logits(embedding, bias, labels, inputs, sampler):
"""
embedding: an nn.Embedding layer
bias: [n_vocab]
labels: [b1, b2]
inputs: [b1, b2, n_emb]
sampler: you may use a LogUniformSampler
Return
logits: [b1, b2, 1 + n_sample]
"""
true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels)
n_sample = neg_samples.size(0)
b1, b2 = labels.size(0), labels.size(1)
all_ids = torch.cat([labels.view(-1), neg_samples])
all_w = embedding(all_ids)
true_w = all_w[: -n_sample].view(b1, b2, -1)
sample_w = all_w[- n_sample:].view(n_sample, -1)
all_b = bias[all_ids]
true_b = all_b[: -n_sample].view(b1, b2)
sample_b = all_b[- n_sample:]
hit = (labels[:, :, None] == neg_samples).detach()
true_logits = torch.einsum('ijk,ijk->ij',
[true_w, inputs]) + true_b - true_log_probs
sample_logits = torch.einsum('lk,ijk->ijl',
[sample_w, inputs]) + sample_b - samp_log_probs
sample_logits.masked_fill_(hit, -1e30)
logits = torch.cat([true_logits[:, :, None], sample_logits], -1)
return logits
# class LogUniformSampler(object):
# def __init__(self, range_max, unique=False):
# """
# Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
# `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
# """
# self.range_max = range_max
# log_indices = torch.arange(1., range_max+2., 1.).log_()
# self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
# self.unique = unique
# if self.unique:
# self.exclude_mask = torch.ByteTensor(range_max).fill_(0)
# def sample(self, n_sample, labels):
# pos_sample, new_labels = labels.unique(return_inverse=True)
# n_pos_sample = pos_sample.size(0)
# n_neg_sample = n_sample - n_pos_sample
# if self.unique:
# self.exclude_mask.index_fill_(0, pos_sample, 1)
# sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0)
# self.exclude_mask.index_fill_(0, pos_sample, 0)
# else:
# sample_dist = self.dist
# neg_sample = torch.multinomial(sample_dist, n_neg_sample)
# sample = torch.cat([pos_sample, neg_sample])
# sample_prob = self.dist[sample]
# return new_labels, sample, sample_prob
if __name__ == '__main__':
S, B = 3, 4
n_vocab = 10000
n_sample = 5
H = 32
labels = torch.LongTensor(S, B).random_(0, n_vocab)
# sampler = LogUniformSampler(n_vocab, unique=False)
# new_labels, sample, sample_prob = sampler.sample(n_sample, labels)
sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True)
# true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels)
# print('true_probs', true_probs.numpy().tolist())
# print('samp_probs', samp_probs.numpy().tolist())
# print('neg_samples', neg_samples.numpy().tolist())
# print('sum', torch.sum(sampler.dist).item())
# assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item()
embedding = nn.Embedding(n_vocab, H)
bias = torch.zeros(n_vocab)
inputs = torch.Tensor(S, B, H).normal_()
logits = sample_logits(embedding, bias, labels, inputs, sampler)
print('logits', logits.detach().numpy().tolist())
print('logits shape', logits.size())

View File

@ -17,6 +17,7 @@
import math
import torch
from torch.optim import Optimizer
from torch.optim.optimizer import required
from torch.nn.utils import clip_grad_norm_
def warmup_cosine(x, warmup=0.002):
@ -52,13 +53,13 @@ class BertAdam(Optimizer):
b1: Adams b1. Default: 0.9
b2: Adams b2. Default: 0.999
e: Adams epsilon. Default: 1e-6
weight_decay_rate: Weight decay. Default: 0.01
weight_decay: Weight decay. Default: 0.01
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
"""
def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear',
b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01,
def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
max_grad_norm=1.0):
if not lr >= 0.0:
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
@ -71,7 +72,7 @@ class BertAdam(Optimizer):
if not e >= 0.0:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate,
b1=b1, b2=b2, e=e, weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
super(BertAdam, self).__init__(params, defaults)
@ -139,8 +140,8 @@ class BertAdam(Optimizer):
# Instead we want to decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if group['weight_decay_rate'] > 0.0:
update += group['weight_decay_rate'] * p.data
if group['weight_decay'] > 0.0:
update += group['weight_decay'] * p.data
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]

View File

@ -0,0 +1,140 @@
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for OpenAI GPT model."""
import math
import torch
from torch.optim import Optimizer
from torch.optim.optimizer import required
from torch.nn.utils import clip_grad_norm_
def warmup_cosine(x, warmup=0.002):
s = 1 if x <= warmup else 0
return s*(x/warmup) + (1-s)*(0.5 * (1 + math.cos(math.pi * x)))  # x is a Python float, so use math.cos rather than torch.cos
def warmup_constant(x, warmup=0.002):
s = 1 if x <= warmup else 0
return s*(x/warmup) + (1-s)*1
def warmup_linear(x, warmup=0.002):
s = 1 if x <= warmup else 0
return (s*(x/warmup) + (1-s))*(1-x)
SCHEDULES = {
'warmup_cosine':warmup_cosine,
'warmup_constant':warmup_constant,
'warmup_linear':warmup_linear,
}
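# Illustrative sketch (not part of the original file): with warmup=0.002 the multiplier ramps up
# linearly over the first 0.2% of the training steps and then decays roughly linearly to 0.
def _example_schedule_values():
    assert abs(warmup_linear(0.001, warmup=0.002) - 0.4995) < 1e-6  # halfway through warmup
    assert abs(warmup_linear(0.5, warmup=0.002) - 0.5) < 1e-6       # halfway through training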
class OpenAIAdam(Optimizer):
"""Implements Open AI version of Adam algorithm with weight decay fix.
"""
def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1,
b1=0.9, b2=0.999, e=1e-8, weight_decay=0,
vector_l2=False, max_grad_norm=-1, **kwargs):
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0.0 <= warmup < 1.0 and not warmup == -1:
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {}".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {}".format(b2))
if not e >= 0.0:
raise ValueError("Invalid epsilon value: {}".format(e))
defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
max_grad_norm=max_grad_norm)
super(OpenAIAdam, self).__init__(params, defaults)
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
return [0]
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
lr.append(lr_scheduled)
return lr
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['b1'], group['b2']
state['step'] += 1
# Add grad clipping
if group['max_grad_norm'] > 0:
clip_grad_norm_(p, group['max_grad_norm'])
# Decay the first and second moment running average coefficient
exp_avg.mul_(beta1).add_(1 - beta1, grad)
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
denom = exp_avg_sq.sqrt().add_(group['e'])
bias_correction1 = 1 - beta1 ** state['step']
bias_correction2 = 1 - beta2 ** state['step']
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
p.data.addcdiv_(-step_size, exp_avg, denom)
# Add weight decay at the end (fixed version)
if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0:
p.data.add_(-lr_scheduled * group['weight_decay'], p.data)
return loss
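# Illustrative sketch (not part of the original file): typical construction of the optimizer with
# a linear warmup schedule over the whole fine-tuning run. The numeric values are placeholders,
# not prescriptions.
def _example_build_optimizer(model, num_train_steps):
    return OpenAIAdam(model.parameters(),
                      lr=6.25e-5,
                      schedule='warmup_linear',
                      warmup=0.002,
                      t_total=num_train_steps,
                      weight_decay=0.01,
                      max_grad_norm=1.0)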

View File

@ -14,41 +14,37 @@
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import unicodedata
import os
import logging
import os
import unicodedata
from io import open
from .file_utils import cached_path
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
'bert-base-multilingual': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-vocab.txt",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-uncased': 512,
'bert-large-uncased': 512,
'bert-base-cased': 512,
'bert-large-cased': 512,
'bert-base-multilingual-uncased': 512,
'bert-base-multilingual-cased': 512,
'bert-base-chinese': 512,
}
VOCAB_NAME = 'vocab.txt'
def load_vocab(vocab_file):
@ -77,7 +73,9 @@ def whitespace_tokenize(text):
class BertTokenizer(object):
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""
def __init__(self, vocab_file, do_lower_case=True):
def __init__(self, vocab_file, do_lower_case=True, max_len=None,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
@ -85,8 +83,10 @@ class BertTokenizer(object):
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
never_split=never_split)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)
def tokenize(self, text):
split_tokens = []
@ -100,6 +100,12 @@ class BertTokenizer(object):
ids = []
for token in tokens:
ids.append(self.vocab[token])
if len(ids) > self.max_len:
raise ValueError(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this BERT model ({} > {}). Running this"
" sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids):
@ -110,47 +116,57 @@ class BertTokenizer(object):
return tokens
@classmethod
def from_pretrained(cls, pretrained_model_name, do_lower_case=True):
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a BertTokenizer from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = pretrained_model_name
vocab_file = pretrained_model_name_or_path
if os.path.isdir(vocab_file):
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file)
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, do_lower_case)
except FileNotFoundError:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name,
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name))
tokenizer = None
vocab_file))
return None
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer wont index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
return tokenizer
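# Editor's illustrative sketch (not part of this diff): typical use of the extended
# from_pretrained() with the new max_len / never_split behaviour. 'bert-base-uncased'
# is a key of PRETRAINED_VOCAB_ARCHIVE_MAP above, so max_len is capped at 512.
# Assumes the package is installed and the vocabulary file can be downloaded.
from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokens = tokenizer.tokenize(u"Who was Jim Henson ? Jim [MASK] was a puppeteer")
ids = tokenizer.convert_tokens_to_ids(tokens)
# convert_tokens_to_ids() now raises ValueError if len(ids) > tokenizer.max_len (512 here)
print(tokens)
print(ids)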
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
def __init__(self,
do_lower_case=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split
def tokenize(self, text):
"""Tokenizes a piece of text."""
@ -165,7 +181,7 @@ class BasicTokenizer(object):
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
if self.do_lower_case and token not in self.never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
@ -186,6 +202,8 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
if text in self.never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
@ -203,7 +221,7 @@ class BasicTokenizer(object):
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
@ -228,17 +246,17 @@ class BasicTokenizer(object):
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
@ -273,7 +291,7 @@ class WordpieceTokenizer(object):
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer.
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.

View File

@ -0,0 +1,206 @@
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import json
import logging
import os
import regex as re
from io import open
try:
from functools import lru_cache
except ImportError:
# Just a dummy decorator to get the checks to run on python2
# because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
def lru_cache():
return lambda func: func
from .file_utils import cached_path
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
}
PRETRAINED_MERGES_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'gpt2': 1024,
}
VOCAB_NAME = 'vocab.json'
MERGES_NAME = 'merges.txt'
@lru_cache()
def bytes_to_unicode():
"""
Returns a dict mapping utf-8 bytes to unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
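# Editor's sketch (not part of this diff): a quick check of the property described in the
# docstring above - every byte value 0..255 gets a distinct printable unicode character,
# so any utf-8 string can be rewritten into the BPE alphabet and restored losslessly.
# Python 3 assumed (iterating over bytes yields ints).
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
assert len(byte_encoder) == 256 and len(byte_decoder) == 256
raw = u"héllo GPT-2".encode('utf-8')
mapped = ''.join(byte_encoder[b] for b in raw)
restored = bytearray(byte_decoder[c] for c in mapped).decode('utf-8')
assert restored == u"héllo GPT-2"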
def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class GPT2Tokenizer(object):
"""
GPT-2 BPE tokenizer. Peculiarities:
- Byte-level BPE
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a GPT2Tokenizer from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file, merges_file))
return None
if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
logger.info("loading vocabulary file {}".format(vocab_file))
logger.info("loading merges file {}".format(merges_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
logger.info("loading merges file {} from cache at {}".format(
merges_file, resolved_merges_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer wont index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
return tokenizer
def __init__(self, vocab_file, merges_file, errors='replace', max_len=None):
self.max_len = max_len if max_len is not None else int(1e12)
self.encoder = json.load(open(vocab_file))
self.decoder = {v:k for k,v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
def __len__(self):
return len(self.encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word)-1 and word[i+1] == second:
new_word.append(first+second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word
def encode(self, text):
bpe_tokens = []
for token in re.findall(self.pat, text):
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
if len(bpe_tokens) > self.max_len:
raise ValueError(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this OpenAI GPT-2 model ({} > {}). Running this"
" sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len)
)
return bpe_tokens
def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
return text
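# Editor's illustrative sketch (not part of this diff): byte-level BPE round trip with the
# class above, assuming GPT2Tokenizer is exported from the package __init__ and the
# 'gpt2' vocab/merges files can be downloaded.
from pytorch_pretrained_bert import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
ids = tokenizer.encode(u"Hello world, byte-level BPE round trips café exactly.")
assert tokenizer.decode(ids) == u"Hello world, byte-level BPE round trips café exactly."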

View File

@ -0,0 +1,263 @@
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import json
import logging
import os
import re
import sys
from io import open
from tqdm import tqdm
from .file_utils import cached_path
from .tokenization import BasicTokenizer
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
}
PRETRAINED_MERGES_ARCHIVE_MAP = {
'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'openai-gpt': 512,
}
VOCAB_NAME = 'vocab.json'
MERGES_NAME = 'merges.txt'
def get_pairs(word):
"""
Return set of symbol pairs in a word.
word is represented as tuple of symbols (symbols being variable-length strings)
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
def text_standardize(text):
"""
fixes some issues the spacy tokenizer had on books corpus
also does some whitespace standardization
"""
text = text.replace('—', '-')
text = text.replace('–', '-')
text = text.replace('―', '-')
text = text.replace('…', '...')
text = text.replace('´', "'")
text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
text = re.sub(r'\s*\n\s*', ' \n ', text)
text = re.sub(r'[^\S\n]+', ' ', text)
return text.strip()
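# Editor's sketch (not part of this diff) of the standardization above; \u2014 is an
# em dash and \u2026 an ellipsis, i.e. characters replaced at the top of the function.
assert text_standardize(u"caf\u00e9\u2014style  coffee\u2026") == u"caf\u00e9 - style coffee..."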
class OpenAIGPTTokenizer(object):
"""
BPE tokenizer. Peculiarities:
- lower-cases all inputs
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, falling back to BERT's BasicTokenizer if not.
- argument special_tokens and function set_special_tokens:
can be used to add additional symbols (ex: "__classify__") to a vocabulary.
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate an OpenAIGPTTokenizer from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file, merges_file))
return None
if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
logger.info("loading vocabulary file {}".format(vocab_file))
logger.info("loading merges file {}".format(merges_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
logger.info("loading merges file {} from cache at {}".format(
merges_file, resolved_merges_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer wont index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
return tokenizer
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
try:
import ftfy
import spacy
self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
self.fix_text = ftfy.fix_text
except ImportError:
logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
self.nlp = BasicTokenizer(do_lower_case=True,
never_split=special_tokens if special_tokens is not None else [])
self.fix_text = None
self.max_len = max_len if max_len is not None else int(1e12)
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
self.decoder = {v:k for k,v in self.encoder.items()}
merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
self.set_special_tokens(special_tokens)
def __len__(self):
return len(self.encoder) + len(self.special_tokens)
def set_special_tokens(self, special_tokens):
""" Add a list of additional tokens to the encoder.
The additional tokens are indexed starting from the last index of the
current vocabulary in the order of the `special_tokens` list.
"""
if not special_tokens:
self.special_tokens = {}
self.special_tokens_decoder = {}
return
self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
if self.fix_text is None:
# Using BERT's BasicTokenizer: we can update the tokenizer
self.nlp.never_split = special_tokens
logger.info("Special tokens {}".format(self.special_tokens))
def bpe(self, token):
word = tuple(token[:-1]) + (token[-1] + '</w>',)
if token in self.cache:
return self.cache[token]
pairs = get_pairs(word)
if not pairs:
return token+'</w>'
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word)-1 and word[i+1] == second:
new_word.append(first+second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
if word == '\n </w>':
word = '\n</w>'
self.cache[token] = word
return word
def tokenize(self, text):
""" Tokenize a string. """
split_tokens = []
if self.fix_text is None:
# Using BERT's BasicTokenizer
text = self.nlp.tokenize(text)
for token in text:
split_tokens.extend([t for t in self.bpe(token).split(' ')])
else:
# Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
text = self.nlp(text_standardize(self.fix_text(text)))
for token in text:
split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
return split_tokens
def convert_tokens_to_ids(self, tokens):
""" Converts a sequence of tokens into ids using the vocab. """
ids = []
if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
if tokens in self.special_tokens:
return self.special_tokens[tokens]
else:
return self.encoder.get(tokens, 0)
for token in tokens:
if token in self.special_tokens:
ids.append(self.special_tokens[token])
else:
ids.append(self.encoder.get(token, 0))
if len(ids) > self.max_len:
raise ValueError(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this OpenAI GPT model ({} > {}). Running this"
" sequence through the model will result in indexing errors".format(len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
"""Converts a sequence of ids in BPE tokens using the vocab."""
tokens = []
for i in ids:
if i in self.special_tokens_decoder:
if not skip_special_tokens:
tokens.append(self.special_tokens_decoder[i])
else:
tokens.append(self.decoder[i])
return tokens
def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False):
"""Converts a sequence of ids in a string."""
tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
out_string = ''.join(tokens).replace('</w>', ' ').strip()
if clean_up_tokenization_spaces:
out_string = out_string.replace('<unk>', '')
out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ','
).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't"
).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m "
).replace(" 've", "'ve")
return out_string
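# Editor's illustrative sketch (not part of this diff): adding task symbols with
# special_tokens and round-tripping through the vocabulary. Assumes the 'openai-gpt'
# files can be downloaded; spacy/ftfy are optional (see the constructor above).
from pytorch_pretrained_bert import OpenAIGPTTokenizer

tokenizer = OpenAIGPTTokenizer.from_pretrained(
    'openai-gpt', special_tokens=['_start_', '_delimiter_', '_classify_'])
tokens = tokenizer.tokenize(u"Jim Henson was a puppeteer")
ids = tokenizer.convert_tokens_to_ids(['_start_'] + tokens + ['_classify_'])
print(tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True))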

View File

@ -0,0 +1,672 @@
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for Transformer XL model.
Adapted from https://github.com/kimiyoung/transformer-xl.
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import glob
import logging
import os
import sys
from collections import Counter, OrderedDict
from io import open
import unicodedata
import torch
import numpy as np
from .file_utils import cached_path
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
}
VOCAB_NAME = 'vocab.bin'
PRETRAINED_CORPUS_ARCHIVE_MAP = {
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
}
CORPUS_NAME = 'corpus.bin'
class TransfoXLTokenizer(object):
"""
Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a TransfoXLTokenizer.
Download and cache the pre-trained vocabulary file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file))
return None
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
# Instantiate tokenizer.
tokenizer = cls(*inputs, **kwargs)
vocab_dict = torch.load(resolved_vocab_file)
for key, value in vocab_dict.items():
tokenizer.__dict__[key] = value
return tokenizer
def __init__(self, special=[], min_freq=0, max_size=None, lower_case=False,
delimiter=None, vocab_file=None, never_split=("<unk>", "<eos>", "<formula>")):
self.counter = Counter()
self.special = special
self.min_freq = min_freq
self.max_size = max_size
self.lower_case = lower_case
self.delimiter = delimiter
self.vocab_file = vocab_file
self.never_split = never_split
def count_file(self, path, verbose=False, add_eos=False):
if verbose: print('counting file {} ...'.format(path))
assert os.path.exists(path)
sents = []
with open(path, 'r', encoding='utf-8') as f:
for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx))
symbols = self.tokenize(line, add_eos=add_eos)
self.counter.update(symbols)
sents.append(symbols)
return sents
def count_sents(self, sents, verbose=False):
"""
sents : a list of sentences, each a list of tokenized symbols
"""
if verbose: print('counting {} sents ...'.format(len(sents)))
for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx))
self.counter.update(symbols)
def _build_from_file(self, vocab_file):
self.idx2sym = []
self.sym2idx = OrderedDict()
with open(vocab_file, 'r', encoding='utf-8') as f:
for line in f:
symb = line.strip().split()[0]
self.add_symbol(symb)
if '<UNK>' in self.sym2idx:
self.unk_idx = self.sym2idx['<UNK>']
elif '<unk>' in self.sym2idx:
self.unk_idx = self.sym2idx['<unk>']
else:
raise ValueError('No <unknown> token in vocabulary')
def build_vocab(self):
if self.vocab_file:
print('building vocab from {}'.format(self.vocab_file))
self._build_from_file(self.vocab_file)
print('final vocab size {}'.format(len(self)))
else:
print('building vocab with min_freq={}, max_size={}'.format(
self.min_freq, self.max_size))
self.idx2sym = []
self.sym2idx = OrderedDict()
for sym in self.special:
self.add_special(sym)
for sym, cnt in self.counter.most_common(self.max_size):
if cnt < self.min_freq: break
self.add_symbol(sym)
print('final vocab size {} from {} unique tokens'.format(
len(self), len(self.counter)))
def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
add_double_eos=False):
if verbose: print('encoding file {} ...'.format(path))
assert os.path.exists(path)
encoded = []
with open(path, 'r', encoding='utf-8') as f:
for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx))
symbols = self.tokenize(line, add_eos=add_eos,
add_double_eos=add_double_eos)
encoded.append(self.convert_to_tensor(symbols))
if ordered:
encoded = torch.cat(encoded)
return encoded
def encode_sents(self, sents, ordered=False, verbose=False):
if verbose: print('encoding {} sents ...'.format(len(sents)))
encoded = []
for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0:
print(' line {}'.format(idx))
encoded.append(self.convert_to_tensor(symbols))
if ordered:
encoded = torch.cat(encoded)
return encoded
def add_special(self, sym):
if sym not in self.sym2idx:
self.idx2sym.append(sym)
self.sym2idx[sym] = len(self.idx2sym) - 1
setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
def add_symbol(self, sym):
if sym not in self.sym2idx:
self.idx2sym.append(sym)
self.sym2idx[sym] = len(self.idx2sym) - 1
def get_sym(self, idx):
assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)
return self.idx2sym[idx]
def get_idx(self, sym):
if sym in self.sym2idx:
return self.sym2idx[sym]
else:
# print('encounter unk {}'.format(sym))
# assert '<eos>' not in sym
if hasattr(self, 'unk_idx'):
return self.sym2idx.get(sym, self.unk_idx)
# Backward compatibility with pre-trained models
elif '<unk>' in self.sym2idx:
return self.sym2idx['<unk>']
elif '<UNK>' in self.sym2idx:
return self.sym2idx['<UNK>']
else:
raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')
def convert_ids_to_tokens(self, indices):
"""Converts a sequence of indices in symbols using the vocab."""
return [self.get_sym(idx) for idx in indices]
def convert_tokens_to_ids(self, symbols):
"""Converts a sequence of symbols into ids using the vocab."""
return [self.get_idx(sym) for sym in symbols]
def convert_to_tensor(self, symbols):
return torch.LongTensor(self.convert_tokens_to_ids(symbols))
def decode(self, indices, exclude=None):
"""Converts a sequence of indices in a string."""
if exclude is None:
return ' '.join([self.get_sym(idx) for idx in indices])
else:
return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
def __len__(self):
return len(self.idx2sym)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
if text in self.never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
def whitespace_tokenize(self, text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
if self.delimiter == '':
tokens = text
else:
tokens = text.split(self.delimiter)
return tokens
def tokenize(self, line, add_eos=False, add_double_eos=False):
line = self._clean_text(line)
line = line.strip()
symbols = self.whitespace_tokenize(line)
split_symbols = []
for symbol in symbols:
if self.lower_case and symbol not in self.never_split:
symbol = symbol.lower()
symbol = self._run_strip_accents(symbol)
split_symbols.extend(self._run_split_on_punc(symbol))
if add_double_eos: # lm1b
return ['<S>'] + split_symbols + ['<S>']
elif add_eos:
return split_symbols + ['<eos>']
else:
return split_symbols
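# Editor's illustrative sketch (not part of this diff): building a small in-memory
# vocabulary with this class and encoding a line with it.
tok = TransfoXLTokenizer(special=['<unk>', '<eos>'], lower_case=True)
tok.count_sents([tok.tokenize(u"The Transformer-XL paper"),
                 tok.tokenize(u"was released in January 2019")])
tok.build_vocab()
ids = tok.convert_tokens_to_ids(tok.tokenize(u"The paper was released", add_eos=True))
print(ids)
print(tok.decode(ids))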
class LMOrderedIterator(object):
def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
"""
data -- LongTensor -- the LongTensor is strictly ordered
"""
self.bsz = bsz
self.bptt = bptt
self.ext_len = ext_len if ext_len is not None else 0
self.device = device
# Work out how cleanly we can divide the dataset into bsz parts.
self.n_step = data.size(0) // bsz
# Trim off any extra elements that wouldn't cleanly fit (remainders).
data = data.narrow(0, 0, self.n_step * bsz)
# Evenly divide the data across the bsz batches.
self.data = data.view(bsz, -1).t().contiguous().to(device)
# Number of mini-batches
self.n_batch = (self.n_step + self.bptt - 1) // self.bptt
def get_batch(self, i, bptt=None):
if bptt is None: bptt = self.bptt
seq_len = min(bptt, self.data.size(0) - 1 - i)
end_idx = i + seq_len
beg_idx = max(0, i - self.ext_len)
data = self.data[beg_idx:end_idx]
target = self.data[i+1:i+1+seq_len]
data_out = data.transpose(0, 1).contiguous().to(self.device)
target_out = target.transpose(0, 1).contiguous().to(self.device)
return data_out, target_out, seq_len
def get_fixlen_iter(self, start=0):
for i in range(start, self.data.size(0) - 1, self.bptt):
yield self.get_batch(i)
def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
max_len = self.bptt + max_deviation * std
i = start
while True:
bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
data, target, seq_len = self.get_batch(i, bptt)
i += seq_len
yield data, target, seq_len
if i >= self.data.size(0) - 2:
break
def __iter__(self):
return self.get_fixlen_iter()
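# Editor's sketch (not part of this diff): how the ordered iterator reshapes a flat
# stream of token ids into [bsz x seq_len] input/target batches (torch is imported above).
toy_stream = torch.arange(0, 50, dtype=torch.long)   # 50 ordered "token ids"
it = LMOrderedIterator(toy_stream, bsz=4, bptt=5)
data, target, seq_len = it.get_batch(0)
print(data.size(), target.size(), seq_len)            # torch.Size([4, 5]) torch.Size([4, 5]) 5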
class LMShuffledIterator(object):
def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False):
"""
data -- list[LongTensor] -- there is no order among the LongTensors
"""
self.data = data
self.bsz = bsz
self.bptt = bptt
self.ext_len = ext_len if ext_len is not None else 0
self.device = device
self.shuffle = shuffle
def get_sent_stream(self):
# index iterator
epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \
else np.array(range(len(self.data)))
# sentence iterator
for idx in epoch_indices:
yield self.data[idx]
def stream_iterator(self, sent_stream):
# streams for each data in the batch
streams = [None] * self.bsz
data = torch.LongTensor(self.bptt, self.bsz)
target = torch.LongTensor(self.bptt, self.bsz)
n_retain = 0
while True:
# data : [n_retain+bptt x bsz]
# target : [bptt x bsz]
data[n_retain:].fill_(-1)
target.fill_(-1)
valid_batch = True
for i in range(self.bsz):
n_filled = 0
try:
while n_filled < self.bptt:
if streams[i] is None or len(streams[i]) <= 1:
streams[i] = next(sent_stream)
# number of new tokens to fill in
n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
# first n_retain tokens are retained from last batch
data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \
streams[i][:n_new]
target[n_filled:n_filled+n_new, i] = \
streams[i][1:n_new+1]
streams[i] = streams[i][n_new:]
n_filled += n_new
except StopIteration:
valid_batch = False
break
if not valid_batch:
return
data_out = data.transpose(0, 1).contiguous().to(self.device)
target_out = target.transpose(0, 1).contiguous().to(self.device)
yield data_out, target_out, self.bptt
n_retain = min(data.size(0), self.ext_len)
if n_retain > 0:
data[:n_retain] = data[-n_retain:]
data.resize_(n_retain + self.bptt, data.size(1))
def __iter__(self):
# sent_stream is an iterator
sent_stream = self.get_sent_stream()
for batch in self.stream_iterator(sent_stream):
yield batch
class LMMultiFileIterator(LMShuffledIterator):
def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None,
shuffle=False):
self.paths = paths
self.vocab = vocab
self.bsz = bsz
self.bptt = bptt
self.ext_len = ext_len if ext_len is not None else 0
self.device = device
self.shuffle = shuffle
def get_sent_stream(self, path):
sents = self.vocab.encode_file(path, add_double_eos=True)
if self.shuffle:
np.random.shuffle(sents)
sent_stream = iter(sents)
return sent_stream
def __iter__(self):
if self.shuffle:
np.random.shuffle(self.paths)
for path in self.paths:
# sent_stream is an iterator
sent_stream = self.get_sent_stream(path)
for batch in self.stream_iterator(sent_stream):
yield batch
class TransfoXLCorpus(object):
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a pre-processed corpus.
"""
vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP:
corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME)
# redirect to the cache, if necessary
try:
resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Corpus '{}' was not found in corpus list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
corpus_file))
return None
if resolved_corpus_file == corpus_file:
logger.info("loading corpus file {}".format(corpus_file))
else:
logger.info("loading corpus file {} from cache at {}".format(
corpus_file, resolved_corpus_file))
# Instantiate corpus.
corpus = cls(*inputs, **kwargs)
corpus_dict = torch.load(resolved_corpus_file)
for key, value in corpus_dict.items():
corpus.__dict__[key] = value
corpus.vocab = vocab
if corpus.train is not None:
corpus.train = torch.tensor(corpus.train, dtype=torch.long)
if corpus.valid is not None:
corpus.valid = torch.tensor(corpus.valid, dtype=torch.long)
if corpus.test is not None:
corpus.test = torch.tensor(corpus.test, dtype=torch.long)
return corpus
def __init__(self, *args, **kwargs):
self.vocab = TransfoXLTokenizer(*args, **kwargs)
self.dataset = None
self.train = None
self.valid = None
self.test = None
def build_corpus(self, path, dataset):
self.dataset = dataset
if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:
self.vocab.count_file(os.path.join(path, 'train.txt'))
self.vocab.count_file(os.path.join(path, 'valid.txt'))
self.vocab.count_file(os.path.join(path, 'test.txt'))
elif self.dataset == 'wt103':
self.vocab.count_file(os.path.join(path, 'train.txt'))
elif self.dataset == 'lm1b':
train_path_pattern = os.path.join(
path, '1-billion-word-language-modeling-benchmark-r13output',
'training-monolingual.tokenized.shuffled', 'news.en-*')
train_paths = glob.glob(train_path_pattern)
# the vocab will load from file when build_vocab() is called
self.vocab.build_vocab()
if self.dataset in ['ptb', 'wt2', 'wt103']:
self.train = self.vocab.encode_file(
os.path.join(path, 'train.txt'), ordered=True)
self.valid = self.vocab.encode_file(
os.path.join(path, 'valid.txt'), ordered=True)
self.test = self.vocab.encode_file(
os.path.join(path, 'test.txt'), ordered=True)
elif self.dataset in ['enwik8', 'text8']:
self.train = self.vocab.encode_file(
os.path.join(path, 'train.txt'), ordered=True, add_eos=False)
self.valid = self.vocab.encode_file(
os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)
self.test = self.vocab.encode_file(
os.path.join(path, 'test.txt'), ordered=True, add_eos=False)
elif self.dataset == 'lm1b':
self.train = train_paths
self.valid = self.vocab.encode_file(
os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True)
self.test = self.vocab.encode_file(
os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)
def get_iterator(self, split, *args, **kwargs):
if split == 'train':
if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
data_iter = LMOrderedIterator(self.train, *args, **kwargs)
elif self.dataset == 'lm1b':
kwargs['shuffle'] = True
data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
elif split in ['valid', 'test']:
data = self.valid if split == 'valid' else self.test
if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
data_iter = LMOrderedIterator(data, *args, **kwargs)
elif self.dataset == 'lm1b':
data_iter = LMShuffledIterator(data, *args, **kwargs)
return data_iter
def get_lm_corpus(datadir, dataset):
fn = os.path.join(datadir, 'cache.pt')
fn_pickle = os.path.join(datadir, 'cache.pkl')
if os.path.exists(fn):
print('Loading cached dataset...')
corpus = torch.load(fn)
elif os.path.exists(fn_pickle):
print('Loading cached dataset from pickle...')
with open(fn_pickle, "rb") as fp:
corpus = pickle.load(fp)
else:
print('Producing dataset {}...'.format(dataset))
kwargs = {}
if dataset in ['wt103', 'wt2']:
kwargs['special'] = ['<eos>']
kwargs['lower_case'] = False
elif dataset == 'ptb':
kwargs['special'] = ['<eos>']
kwargs['lower_case'] = True
elif dataset == 'lm1b':
kwargs['special'] = []
kwargs['lower_case'] = False
kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')
elif dataset in ['enwik8', 'text8']:
pass
corpus = TransfoXLCorpus(datadir, dataset, **kwargs)
torch.save(corpus, fn)
return corpus
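# Editor's illustrative sketch (not part of this diff): loading the pre-processed
# WikiText-103 corpus published alongside the Transformer-XL checkpoint and iterating
# over its validation split. Note this downloads a large corpus file.
corpus = TransfoXLCorpus.from_pretrained('transfo-xl-wt103')
va_iter = corpus.get_iterator('valid', bsz=10, bptt=128)
for data, target, seq_len in va_iter:
    break  # data and target are [bsz x bptt] LongTensors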
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False

View File

@ -1,6 +1,5 @@
# This installs Pytorch for CUDA 8 only. If you are using a newer version,
# please visit http://pytorch.org/ and install the relevant version.
torch>=0.4.1,<0.5.0
# PyTorch
torch>=0.4.1
# progress bars in model download and training scripts
tqdm
# Accessing files from S3 directly.

View File

@ -1,12 +1,48 @@
"""
Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py
To create the package for pypi.
1. Change the version in __init__.py and setup.py.
2. Commit these changes with the message: "Release: VERSION"
3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' "
Push the tag to git: git push --tags origin master
4. Build both the sources and the wheel. Do not change anything in setup.py between
creating the wheel and the source distribution (obviously).
For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory.
(this will build a wheel for the python version you use to build it - make sure you use python 3.x).
For the sources, run: "python setup.py sdist"
You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp.
5. Check that everything looks correct by uploading the package to the pypi test server:
twine upload dist/* -r pypitest
(pypi suggests using twine as other methods upload files via plaintext.)
Check that you can install it in a virtualenv by running:
pip install -i https://testpypi.python.org/pypi allennlp
6. Upload the final version to actual pypi:
twine upload dist/* -r pypi
7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
"""
from io import open
from setuptools import find_packages, setup
setup(
name="pytorch_pretrained_bert",
version="0.2.0",
author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors",
version="0.6.0",
author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
author_email="thomas@huggingface.co",
description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
long_description=open("README.md", "r").read(),
long_description=open("README.md", "r", encoding='utf-8').read(),
long_description_content_type="text/markdown",
keywords='BERT NLP deep learning google',
license='Apache',
@ -18,8 +54,12 @@ setup(
'boto3',
'requests',
'tqdm'],
scripts=["bin/pytorch_pretrained_bert"],
python_requires='>=3.5.0',
entry_points={
'console_scripts': [
"pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
]
},
# python_requires='>=3.5.0',
tests_require=['pytest'],
classifiers=[
'Intended Audience :: Science/Research',

tests/modeling_gpt2_test.py
View File

@ -0,0 +1,210 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import json
import random
import torch
from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
GPT2LMHeadModel, GPT2DoubleHeadsModel)
class GPT2ModelTest(unittest.TestCase):
class GPT2ModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_position_ids=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
n_positions=33,
n_embd=32,
n_layer=5,
n_head=4,
n_choices=3,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
scope=None):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_position_ids = use_position_ids
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.n_choices = n_choices
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)
position_ids = None
if self.use_position_ids:
position_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
token_type_ids = None
if self.use_token_type_ids:
total_voc = self.vocab_size
token_type_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
mc_labels = None
lm_labels = None
mc_token_ids = None
if self.use_labels:
mc_labels = GPT2ModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
lm_labels = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
mc_token_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)
config = GPT2Config(
vocab_size_or_config_json_file=self.vocab_size,
n_positions=self.n_positions,
n_embd=self.n_embd,
n_layer=self.n_layer,
n_head=self.n_head,
initializer_range=self.initializer_range)
return (config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids)
def create_gpt2_model(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = GPT2Model(config)
model.eval()
hidden_states, presents = model(input_ids, position_ids, token_type_ids)
outputs = {
"hidden_states": hidden_states,
"presents": presents,
}
return outputs
def check_gpt2_model_output(self, result):
self.parent.assertListEqual(
list(result["hidden_states"].size()),
[self.batch_size, self.n_choices, self.seq_length, self.n_embd])
def create_gpt2_lm_head(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = GPT2LMHeadModel(config)
model.eval()
loss = model(input_ids, position_ids, token_type_ids, lm_labels)
lm_logits, presents = model(input_ids, position_ids, token_type_ids)
outputs = {
"loss": loss,
"lm_logits": lm_logits,
"presents": presents,
}
return outputs
def check_gpt2_lm_head_output(self, result):
total_voc = self.vocab_size
self.parent.assertListEqual(
list(result["lm_logits"].size()),
[self.batch_size, self.n_choices, self.seq_length, total_voc])
def check_gpt2_lm_head_loss_output(self, result):
self.parent.assertListEqual(
list(result["loss"].size()),
[])
def create_gpt2_double_heads(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = GPT2DoubleHeadsModel(config)
model.eval()
loss = model(input_ids, mc_token_ids,
lm_labels=lm_labels, mc_labels=mc_labels,
token_type_ids=token_type_ids, position_ids=position_ids)
lm_logits, mc_logits, presents = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)
outputs = {
"loss": loss,
"lm_logits": lm_logits,
"mc_logits": mc_logits,
"presents": presents,
}
return outputs
def check_gpt2_double_heads_output(self, result):
total_voc = self.vocab_size
self.parent.assertListEqual(
list(result["lm_logits"].size()),
[self.batch_size, self.n_choices, self.seq_length, total_voc])
self.parent.assertListEqual(
list(result["mc_logits"].size()),
[self.batch_size, self.n_choices])
def check_gpt2_double_heads_loss_output(self, result):
self.parent.assertListEqual(
[list(l.size()) for l in result["loss"]],
[[], []])
def test_default(self):
self.run_tester(GPT2ModelTest.GPT2ModelTester(self))
def test_config_to_json_string(self):
config = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)
obj = json.loads(config.to_json_string())
self.assertEqual(obj["vocab_size"], 99)
self.assertEqual(obj["n_embd"], 37)
def run_tester(self, tester):
config_and_inputs = tester.prepare_config_and_inputs()
output_result = tester.create_gpt2_model(*config_and_inputs)
tester.check_gpt2_model_output(output_result)
output_result = tester.create_gpt2_lm_head(*config_and_inputs)
tester.check_gpt2_lm_head_output(output_result)
tester.check_gpt2_lm_head_loss_output(output_result)
output_result = tester.create_gpt2_double_heads(*config_and_inputs)
tester.check_gpt2_double_heads_output(output_result)
tester.check_gpt2_double_heads_loss_output(output_result)
@classmethod
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
"""Creates a random int32 tensor of the shape within the vocab size."""
if rng is None:
rng = random.Random()
total_dims = 1
for dim in shape:
total_dims *= dim
values = []
for _ in range(total_dims):
values.append(rng.randint(0, vocab_size - 1))
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,222 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import json
import random
import torch
from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
class OpenAIGPTModelTest(unittest.TestCase):
class OpenAIGPTModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_position_ids=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
n_special=1,
n_positions=33,
n_embd=32,
n_layer=5,
n_head=4,
n_choices=3,
afn="gelu",
resid_pdrop=0.1,
attn_pdrop=0.1,
embd_pdrop=0.1,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
scope=None):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_position_ids = use_position_ids
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.n_special = n_special
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.afn = afn
self.n_choices = n_choices
self.resid_pdrop = resid_pdrop
self.attn_pdrop = attn_pdrop
self.embd_pdrop = embd_pdrop
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)
position_ids = None
if self.use_position_ids:
position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
token_type_ids = None
if self.use_token_type_ids:
total_voc = self.vocab_size + self.n_special
token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
mc_labels = None
lm_labels = None
mc_token_ids = None
if self.use_labels:
mc_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
mc_token_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)
config = OpenAIGPTConfig(
vocab_size_or_config_json_file=self.vocab_size,
n_positions=self.n_positions,
n_special=self.n_special,
n_embd=self.n_embd,
n_layer=self.n_layer,
n_head=self.n_head,
afn=self.afn,
resid_pdrop=self.resid_pdrop,
attn_pdrop=self.attn_pdrop,
embd_pdrop=self.embd_pdrop,
initializer_range=self.initializer_range)
return (config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids)
def create_openai_model(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = OpenAIGPTModel(config)
model.eval()
hidden_states = model(input_ids, position_ids, token_type_ids)
outputs = {
"hidden_states": hidden_states,
}
return outputs
def check_openai_model_output(self, result):
self.parent.assertListEqual(
list(result["hidden_states"].size()),
[self.batch_size, self.n_choices, self.seq_length, self.n_embd])
def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = OpenAIGPTLMHeadModel(config)
model.eval()
loss = model(input_ids, position_ids, token_type_ids, lm_labels)
lm_logits = model(input_ids, position_ids, token_type_ids)
outputs = {
"loss": loss,
"lm_logits": lm_logits,
}
return outputs
def check_openai_lm_head_output(self, result):
total_voc = self.n_special + self.vocab_size
self.parent.assertListEqual(
list(result["lm_logits"].size()),
[self.batch_size, self.n_choices, self.seq_length, total_voc])
def check_openai_lm_head_loss_output(self, result):
self.parent.assertListEqual(
list(result["loss"].size()),
[])
def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = OpenAIGPTDoubleHeadsModel(config)
model.eval()
loss = model(input_ids, mc_token_ids,
lm_labels=lm_labels, mc_labels=mc_labels,
token_type_ids=token_type_ids, position_ids=position_ids)
lm_logits, mc_logits = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)
outputs = {
"loss": loss,
"lm_logits": lm_logits,
"mc_logits": mc_logits,
}
return outputs
def check_openai_double_heads_output(self, result):
total_voc = self.n_special + self.vocab_size
self.parent.assertListEqual(
list(result["lm_logits"].size()),
[self.batch_size, self.n_choices, self.seq_length, total_voc])
self.parent.assertListEqual(
list(result["mc_logits"].size()),
[self.batch_size, self.n_choices])
def check_openai_double_heads_loss_output(self, result):
self.parent.assertListEqual(
[list(l.size()) for l in result["loss"]],
[[], []])
def test_default(self):
self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self))
def test_config_to_json_string(self):
config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)
obj = json.loads(config.to_json_string())
self.assertEqual(obj["vocab_size"], 99)
self.assertEqual(obj["n_embd"], 37)
def run_tester(self, tester):
config_and_inputs = tester.prepare_config_and_inputs()
output_result = tester.create_openai_model(*config_and_inputs)
tester.check_openai_model_output(output_result)
output_result = tester.create_openai_lm_head(*config_and_inputs)
tester.check_openai_lm_head_output(output_result)
tester.check_openai_lm_head_loss_output(output_result)
output_result = tester.create_openai_double_heads(*config_and_inputs)
tester.check_openai_double_heads_output(output_result)
tester.check_openai_double_heads_loss_output(output_result)
@classmethod
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
"""Creates a random int32 tensor of the shape within the vocab size."""
if rng is None:
rng = random.Random()
total_dims = 1
for dim in shape:
total_dims *= dim
values = []
for _ in range(total_dims):
values.append(rng.randint(0, vocab_size - 1))
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
if __name__ == "__main__":
unittest.main()

View File

@ -22,7 +22,10 @@ import random
import torch
from pytorch_pretrained_bert import BertConfig, BertModel
from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
BertForNextSentencePrediction, BertForPreTraining,
BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification)
class BertModelTest(unittest.TestCase):
@@ -35,6 +38,7 @@ class BertModelTest(unittest.TestCase):
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
@@ -45,7 +49,9 @@ class BertModelTest(unittest.TestCase):
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
scope=None):
self.parent = parent
self.batch_size = batch_size
@@ -53,6 +59,7 @@ class BertModelTest(unittest.TestCase):
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
@@ -63,10 +70,12 @@ class BertModelTest(unittest.TestCase):
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.scope = scope
def create_model(self):
def prepare_config_and_inputs(self):
input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
@@ -77,6 +86,12 @@ class BertModelTest(unittest.TestCase):
if self.use_token_type_ids:
token_type_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
if self.use_labels:
sequence_labels = BertModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.num_labels)
config = BertConfig(
vocab_size_or_config_json_file=self.vocab_size,
hidden_size=self.hidden_size,
@@ -90,10 +105,17 @@ class BertModelTest(unittest.TestCase):
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels
def check_loss_output(self, result):
self.parent.assertListEqual(
list(result["loss"].size()),
[])
def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
model = BertModel(config=config)
model.eval()
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
outputs = {
"sequence_output": all_encoder_layers[-1],
"pooled_output": pooled_output,
@@ -101,13 +123,125 @@ class BertModelTest(unittest.TestCase):
}
return outputs
def check_output(self, result):
def check_bert_model_output(self, result):
self.parent.assertListEqual(
[size for layer in result["all_encoder_layers"] for size in layer.size()],
[self.batch_size, self.seq_length, self.hidden_size] * self.num_hidden_layers)
self.parent.assertListEqual(
list(result["sequence_output"].size()),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
model = BertForMaskedLM(config=config)
model.eval()
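# Passing token_labels as the masked LM labels yields the loss; omitting them yields the prediction scores.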
loss = model(input_ids, token_type_ids, input_mask, token_labels)
prediction_scores = model(input_ids, token_type_ids, input_mask)
outputs = {
"loss": loss,
"prediction_scores": prediction_scores,
}
return outputs
def check_bert_for_masked_lm_output(self, result):
self.parent.assertListEqual(
list(result["prediction_scores"].size()),
[self.batch_size, self.seq_length, self.vocab_size])
def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
model = BertForNextSentencePrediction(config=config)
model.eval()
loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
seq_relationship_score = model(input_ids, token_type_ids, input_mask)
outputs = {
"loss": loss,
"seq_relationship_score": seq_relationship_score,
}
return outputs
def check_bert_for_next_sequence_prediction_output(self, result):
self.parent.assertListEqual(
list(result["seq_relationship_score"].size()),
[self.batch_size, 2])
def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
model = BertForPreTraining(config=config)
model.eval()
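# The pretraining head takes both the masked LM labels and the next-sentence label
# and returns their combined loss; without labels it returns both sets of scores.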
loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask)
outputs = {
"loss": loss,
"prediction_scores": prediction_scores,
"seq_relationship_score": seq_relationship_score,
}
return outputs
def check_bert_for_pretraining_output(self, result):
self.parent.assertListEqual(
list(result["prediction_scores"].size()),
[self.batch_size, self.seq_length, self.vocab_size])
self.parent.assertListEqual(
list(result["seq_relationship_score"].size()),
[self.batch_size, 2])
def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
model = BertForQuestionAnswering(config=config)
model.eval()
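# sequence_labels is reused here as both the start and end positions for the span loss.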
loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
outputs = {
"loss": loss,
"start_logits": start_logits,
"end_logits": end_logits,
}
return outputs
def check_bert_for_question_answering_output(self, result):
self.parent.assertListEqual(
list(result["start_logits"].size()),
[self.batch_size, self.seq_length])
self.parent.assertListEqual(
list(result["end_logits"].size()),
[self.batch_size, self.seq_length])
def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
model = BertForSequenceClassification(config=config, num_labels=self.num_labels)
model.eval()
loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
logits = model(input_ids, token_type_ids, input_mask)
outputs = {
"loss": loss,
"logits": logits,
}
return outputs
def check_bert_for_sequence_classification_output(self, result):
self.parent.assertListEqual(
list(result["logits"].size()),
[self.batch_size, self.num_labels])
def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
model = BertForTokenClassification(config=config, num_labels=self.num_labels)
model.eval()
loss = model(input_ids, token_type_ids, input_mask, token_labels)
logits = model(input_ids, token_type_ids, input_mask)
outputs = {
"loss": loss,
"logits": logits,
}
return outputs
def check_bert_for_token_classification_output(self, result):
self.parent.assertListEqual(
list(result["logits"].size()),
[self.batch_size, self.seq_length, self.num_labels])
def test_default(self):
self.run_tester(BertModelTest.BertModelTester(self))
@@ -118,8 +252,33 @@ class BertModelTest(unittest.TestCase):
self.assertEqual(obj["hidden_size"], 37)
def run_tester(self, tester):
output_result = tester.create_model()
tester.check_output(output_result)
config_and_inputs = tester.prepare_config_and_inputs()
output_result = tester.create_bert_model(*config_and_inputs)
tester.check_bert_model_output(output_result)
output_result = tester.create_bert_for_masked_lm(*config_and_inputs)
tester.check_bert_for_masked_lm_output(output_result)
tester.check_loss_output(output_result)
output_result = tester.create_bert_for_next_sequence_prediction(*config_and_inputs)
tester.check_bert_for_next_sequence_prediction_output(output_result)
tester.check_loss_output(output_result)
output_result = tester.create_bert_for_pretraining(*config_and_inputs)
tester.check_bert_for_pretraining_output(output_result)
tester.check_loss_output(output_result)
output_result = tester.create_bert_for_question_answering(*config_and_inputs)
tester.check_bert_for_question_answering_output(output_result)
tester.check_loss_output(output_result)
output_result = tester.create_bert_for_sequence_classification(*config_and_inputs)
tester.check_bert_for_sequence_classification_output(output_result)
tester.check_loss_output(output_result)
output_result = tester.create_bert_for_token_classification(*config_and_inputs)
tester.check_bert_for_token_classification_output(output_result)
tester.check_loss_output(output_result)
@classmethod
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):

View File

@@ -0,0 +1,218 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import json
import random
import torch
from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
class TransfoXLModelTest(unittest.TestCase):
class TransfoXLModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
mem_len=30,
clamp_len=15,
is_training=True,
use_labels=True,
vocab_size=99,
cutoffs=[10, 50, 80],
d_model=32,
d_embed=32,
n_head=4,
d_head=8,
d_inner=128,
div_val=2,
n_layer=5,
scope=None,
seed=1):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.mem_len = mem_len
self.clamp_len = clamp_len
self.is_training = is_training
self.use_labels = use_labels
self.vocab_size = vocab_size
self.cutoffs = cutoffs
self.d_model = d_model
self.d_embed = d_embed
self.n_head = n_head
self.d_head = d_head
self.d_inner = d_inner
self.div_val = div_val
self.n_layer = n_layer
self.scope = scope
self.seed = seed
def prepare_config_and_inputs(self):
input_ids_1 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_ids_2 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
lm_labels = None
if self.use_labels:
lm_labels = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
config = TransfoXLConfig(
vocab_size_or_config_json_file=self.vocab_size,
mem_len=self.mem_len,
clamp_len=self.clamp_len,
cutoffs=self.cutoffs,
d_model=self.d_model,
d_embed=self.d_embed,
n_head=self.n_head,
d_head=self.d_head,
d_inner=self.d_inner,
div_val=self.div_val,
n_layer=self.n_layer)
return (config, input_ids_1, input_ids_2, lm_labels)
def set_seed(self):
random.seed(self.seed)
torch.manual_seed(self.seed)
def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
model = TransfoXLModel(config)
model.eval()
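# The model returns (hidden_states, mems); the second call feeds the memories
# produced by the first to exercise the recurrence.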
hidden_states_1, mems_1 = model(input_ids_1)
hidden_states_2, mems_2 = model(input_ids_2, mems_1)
outputs = {
"hidden_states_1": hidden_states_1,
"mems_1": mems_1,
"hidden_states_2": hidden_states_2,
"mems_2": mems_2,
}
return outputs
def check_transfo_xl_model_output(self, result):
self.parent.assertListEqual(
list(result["hidden_states_1"].size()),
[self.batch_size, self.seq_length, self.d_model])
self.parent.assertListEqual(
list(result["hidden_states_2"].size()),
[self.batch_size, self.seq_length, self.d_model])
self.parent.assertListEqual(
list(list(mem.size()) for mem in result["mems_1"]),
[[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
self.parent.assertListEqual(
list(list(mem.size()) for mem in result["mems_2"]),
[[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
model = TransfoXLLMHeadModel(config)
model.eval()
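# With target the LM head returns (loss, mems); without it, (lm_logits, mems).
# Memories from the first pair of calls are threaded into the second pair via mems=.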
loss_1, mems_1a = model(input_ids_1, target=lm_labels)
lm_logits_1, mems_1b = model(input_ids_1)
loss_2, mems_2a = model(input_ids_2, target=lm_labels, mems=mems_1a)
lm_logits_2, mems_2b = model(input_ids_2, mems=mems_1b)
outputs = {
"loss_1": loss_1,
"mems_1a": mems_1a,
"lm_logits_1": lm_logits_1,
"mems_1b": mems_1b,
"loss_2": loss_2,
"mems_2a": mems_2a,
"lm_logits_2": lm_logits_2,
"mems_2b": mems_2b,
}
return outputs
def check_transfo_xl_lm_head_output(self, result):
self.parent.assertListEqual(
list(result["loss_1"].size()),
[self.batch_size, self.seq_length])
self.parent.assertListEqual(
list(result["lm_logits_1"].size()),
[self.batch_size, self.seq_length, self.vocab_size])
self.parent.assertListEqual(
list(list(mem.size()) for mem in result["mems_1a"]),
[[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
self.parent.assertListEqual(
list(list(mem.size()) for mem in result["mems_1b"]),
[[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
self.parent.assertListEqual(
list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1a"]),
list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1b"]))
self.parent.assertListEqual(
list(result["loss_2"].size()),
[self.batch_size, self.seq_length])
self.parent.assertListEqual(
list(result["lm_logits_2"].size()),
[self.batch_size, self.seq_length, self.vocab_size])
self.parent.assertListEqual(
list(list(mem.size()) for mem in result["mems_2a"]),
[[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
self.parent.assertListEqual(
list(list(mem.size()) for mem in result["mems_2b"]),
[[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
self.parent.assertListEqual(
list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2a"]),
list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2b"]))
def test_default(self):
self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self))
def test_config_to_json_string(self):
config = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37)
obj = json.loads(config.to_json_string())
self.assertEqual(obj["n_token"], 96)
self.assertEqual(obj["d_embed"], 37)
def run_tester(self, tester):
config_and_inputs = tester.prepare_config_and_inputs()
tester.set_seed()
output_result = tester.create_transfo_xl_model(*config_and_inputs)
tester.check_transfo_xl_model_output(output_result)
tester.set_seed()
output_result = tester.create_transfo_xl_lm_head(*config_and_inputs)
tester.check_transfo_xl_lm_head_output(output_result)
@classmethod
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
"""Creates a random int32 tensor of the shape within the vocab size."""
if rng is None:
rng = random.Random()
total_dims = 1
for dim in shape:
total_dims *= dim
values = []
for _ in range(total_dims):
values.append(rng.randint(0, vocab_size - 1))
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
if __name__ == "__main__":
unittest.main()

View File

@@ -32,10 +32,10 @@ class OptimizationTest(unittest.TestCase):
def test_adam(self):
w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
target = torch.tensor([0.4, 0.2, -0.5])
criterion = torch.nn.MSELoss(reduction='elementwise_mean')
criterion = torch.nn.MSELoss()
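# reduction='elementwise_mean' is deprecated in PyTorch; the default reduction ('mean') is equivalent.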
# No warmup, constant schedule, no gradient clipping
optimizer = BertAdam(params=[w], lr=2e-1,
weight_decay_rate=0.0,
weight_decay=0.0,
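# BertAdam now takes weight_decay (renamed from weight_decay_rate).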
max_grad_norm=-1)
for _ in range(100):
loss = criterion(w, target)

View File

@@ -0,0 +1,56 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import unittest
import json
from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
class OpenAIGPTTokenizationTest(unittest.TestCase):
def test_full_tokenizer(self):
""" Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
"w</w>", "r</w>", "t</w>",
"lo", "low", "er</w>",
"low</w>", "lowest</w>", "newer</w>", "wider</w>"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
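# Minimal BPE merge rules, applied in priority order; the first entry is the
# subword-nmt version header, which the tokenizer skips.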
merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
json.dump(vocab_tokens, fp)
vocab_file = fp.name
with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
fp.write("\n".join(merges))
merges_file = fp.name
tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>"])
os.remove(vocab_file)
os.remove(merges_file)
text = "lower"
bpe_tokens = ["low", "er</w>"]
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, bpe_tokens)
input_tokens = tokens + ["<unk>"]
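# Expected ids: "low" and "er</w>" sit at positions 14 and 15 in the vocab above;
# the special token "<unk>" gets the next id after the 20-entry base vocab, i.e. 20.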
input_bpe_tokens = [14, 15, 20]
self.assertListEqual(
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
if __name__ == '__main__':
unittest.main()

View File

@@ -12,15 +12,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import unittest
from io import open
from pytorch_pretrained_bert.tokenization import (BertTokenizer, BasicTokenizer, WordpieceTokenizer,
_is_whitespace, _is_control, _is_punctuation)
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
BertTokenizer,
WordpieceTokenizer,
_is_control, _is_punctuation,
_is_whitespace)
class TokenizationTest(unittest.TestCase):
@@ -30,7 +32,7 @@ class TokenizationTest(unittest.TestCase):
"[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
"##ing", ","
]
with open("/tmp/bert_tokenizer_test.txt", "w") as vocab_writer:
with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
vocab_file = vocab_writer.name
@@ -44,12 +46,30 @@ class TokenizationTest(unittest.TestCase):
self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
def test_full_tokenizer_raises_error_for_long_sequences(self):
vocab_tokens = [
"[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
"##ing", ","
]
with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
vocab_file = vocab_writer.name
tokenizer = BertTokenizer(vocab_file, max_len=10)
os.remove(vocab_file)
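# Ten tokens fit within max_len=10; the eleven-token sequence below must raise a ValueError.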
tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time")
indices = tokenizer.convert_tokens_to_ids(tokens)
self.assertListEqual(indices, [0 for _ in range(10)])
tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time .")
self.assertRaises(ValueError, tokenizer.convert_tokens_to_ids, tokens)
def test_chinese(self):
tokenizer = BasicTokenizer()
self.assertListEqual(
tokenizer.tokenize(u"ah\u535A\u63A8zz"),
[u"ah", u"\u535A", u"\u63A8", u"zz"])
[u"ah", u"\u535A", u"\u63A8", u"zz"])
def test_basic_tokenizer_lower(self):
tokenizer = BasicTokenizer(do_lower_case=True)

View File

@@ -0,0 +1,90 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import unittest
from io import open
from pytorch_pretrained_bert.tokenization_transfo_xl import (TransfoXLTokenizer,
_is_control, _is_punctuation,
_is_whitespace)
class TransfoXLTokenizationTest(unittest.TestCase):
def test_full_tokenizer(self):
vocab_tokens = [
"<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", "running", ","
]
with open("/tmp/transfo_xl_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
vocab_file = vocab_writer.name
tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
tokenizer.build_vocab()
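# build_vocab turns the vocabulary file into the token-to-id mapping used below.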
os.remove(vocab_file)
tokens = tokenizer.tokenize(u"<unk> UNwant\u00E9d,running")
self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
def test_full_tokenizer_lower(self):
tokenizer = TransfoXLTokenizer(lower_case=True)
self.assertListEqual(
tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
["hello", "!", "how", "are", "you", "?"])
self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
def test_full_tokenizer_no_lower(self):
tokenizer = TransfoXLTokenizer(lower_case=False)
self.assertListEqual(
tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
["HeLLo", "!", "how", "Are", "yoU", "?"])
def test_is_whitespace(self):
self.assertTrue(_is_whitespace(u" "))
self.assertTrue(_is_whitespace(u"\t"))
self.assertTrue(_is_whitespace(u"\r"))
self.assertTrue(_is_whitespace(u"\n"))
self.assertTrue(_is_whitespace(u"\u00A0"))
self.assertFalse(_is_whitespace(u"A"))
self.assertFalse(_is_whitespace(u"-"))
def test_is_control(self):
self.assertTrue(_is_control(u"\u0005"))
self.assertFalse(_is_control(u"A"))
self.assertFalse(_is_control(u" "))
self.assertFalse(_is_control(u"\t"))
self.assertFalse(_is_control(u"\r"))
def test_is_punctuation(self):
self.assertTrue(_is_punctuation(u"-"))
self.assertTrue(_is_punctuation(u"$"))
self.assertTrue(_is_punctuation(u"`"))
self.assertTrue(_is_punctuation(u"."))
self.assertFalse(_is_punctuation(u"A"))
self.assertFalse(_is_punctuation(u" "))
if __name__ == '__main__':
unittest.main()