Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-20 17:13:56 +08:00)
Compare commits: v4.46.3...auto_gpt4_ (4 commits)

Commits: 5b16085f37, 975ec0ca05, 2cdc0ad4cf, a0edf4d190
add_missing_imports.py (new file, 20 lines)
@@ -0,0 +1,20 @@
# 2) Add the new classes to the module __init__ (CHECK)
# 3) Add the new classes to the module autodoc (CHECK)
# 4) Update the root __init__ to import the new classes (CHECK)
# 5) Add the missing Auto classes (CHECK)
# 6) Add the module file to the doctest list
# - Happens automatically: Update the model support checklist
# - Happens automatically: Update the dummies

from pathlib import Path
import re

init_file = Path("src/transformers/models/gpt_neo/__init__.py")
contents = init_file.read_text()
if "is_tf_available" in contents:
    raise ValueError("This file already contains TF imports!")
# Locate the existing torch-gated import block so a matching TF block can be added after it
torch_block_re = r"try:[\n\s]+if not is_torch_available\(\)\:[\n\s]+raise OptionalDependencyNotAvailable.*?else.*?\].*?\]"
torch_block = re.search(torch_block_re, contents, re.DOTALL)
torch_block_text = torch_block.group(0)
breakpoint()  # debugging stop: inspect the matched block before editing the file
print()
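For illustration only (not part of this commit): the torch_block_re pattern above is meant to capture the whole torch-gated import block in the gpt_neo __init__, so that a matching TF block can be placed after it. The snippet below runs the same regex against a made-up miniature __init__; the toy_init string is invented for the example.

import re

torch_block_re = (
    r"try:[\n\s]+if not is_torch_available\(\)\:[\n\s]+raise OptionalDependencyNotAvailable"
    r".*?else.*?\].*?\]"
)

toy_init = '''
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_gpt_neo"] = [
        "GPTNeoModel",
        "GPTNeoForCausalLM",
    ]
'''

torch_block = re.search(torch_block_re, toy_init, re.DOTALL)
print(torch_block.group(0))  # prints the full try/except/else block, ending at the closing bracket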
@@ -83,3 +83,18 @@ The `generate()` method can be used to generate text using GPT Neo model.

[[autodoc]] FlaxGPTNeoForCausalLM
    - __call__

## TFGPTNeoModel

[[autodoc]] TFGPTNeoModel
    - call

## TFGPTNeoForCausalLM

[[autodoc]] TFGPTNeoForCausalLM
    - call

## TFGPTNeoForSequenceClassification

[[autodoc]] TFGPTNeoForSequenceClassification
    - call
gpt4_convert.py (new file, 91 lines)
@@ -0,0 +1,91 @@
# What are the steps in conversion?


# TODO 1: Get GPT to generate the input shapes when it converts a module (not at top-level class!)
# TODO 2: Port weights from PT to TF versions and do equivalence testing
# TODO 3:

from pathlib import Path
import re
import openai
from time import sleep
from tqdm import tqdm


def get_module_name(module_text: str):
    module_name = re.search(r"(?:class |def )(\w+)", module_text).group(1)
    return module_name


def translate_fn(module_text: str):
    system_text = """
You are a translation bot designed to translate code in the Hugging Face Transformers library from PyTorch to TensorFlow.

You will be passed a single PyTorch function or class from the library. Your goal is to output the equivalent
TensorFlow code.

There are some guidelines you should follow when translating the code:

- When creating layers, please pass their attribute name as the name kwarg.
- If the class inherits from PreTrainedModel it should instead inherit from TFPreTrainedModel.
- Retain any docstrings attached to methods like forward and translate them, even when the method is being renamed to call.
- Layer and model classes should accept **kwargs and pass these to super().__init__(). They should also be renamed by adding "TF" to the start of their name.
- You don't need to import anything.
- If the class calls other classes in the same module, please add "TF" to the start of their name if required.
- TensorFlow layers do not require input shape arguments in the same way as PyTorch layers. As a result, the first
argument to the constructor of layers like Dense or Conv2D (but not Embedding) can usually be removed.
- TensorFlow Embedding layers do not have a padding_idx argument. Please remove this argument from the constructor.
- Prefer the function shape_list(), which returns a list, over methods like tensor.shape or tf.shape(tensor).
- Keras layers do not have a register_buffer() method. Instead, just set the attribute with that name on the layer directly.
- Output classes like BaseModelOutput or SequenceClassifierOutput should have "TF" added to the start of their name.
- NumPy operations and calls to .numpy() must be avoided! Use TensorFlow operations instead.
"""
    module_name = get_module_name(module_text)
    if "load_tf_weights" in module_name:
        print("Skipping", module_name)
        return ""
    prompt = [{"role": "system", "content": system_text}, {"role": "user", "content": module_text}]
    for i in range(5):
        try:
            response = openai.ChatCompletion.create(model="gpt-4", messages=prompt, temperature=0, stream=True)
            break
        except openai.error.RateLimitError:
            print(f"Rate limited, retrying ({i + 1} of 5)")
            sleep(15)
    else:
        raise RuntimeError("Rate limited too many times")
    chunks = []
    for chunk in tqdm(response, desc=f"Translating {module_name}", dynamic_ncols=True, unit=" tokens"):
        chunk_message = chunk['choices'][0]['delta']
        chunks.append(chunk_message)
    translated_function = ''.join([m.get('content', '') for m in chunks])
    return translated_function


def split_file(source_file: Path):
    text = source_file.read_text()
    top_level_fns = list(re.finditer(r"\n\n((?:@|class |def ).*?)(?=\n\n@|\n\nclass |\n\ndef |$)", text, flags=re.DOTALL))
    for i in range(len(top_level_fns) - 1):
        assert top_level_fns[i].end() == top_level_fns[i + 1].start()
    preamble = text[:top_level_fns[0].start()]
    all_texts = [preamble] + [m.group(0) for m in top_level_fns]
    for i in range(len(all_texts) - 1):
        text = all_texts[i]
        if not text.endswith("\n"):
            breakpoint()  # debugging stop: a block did not end cleanly on a newline
    return [text.strip() for text in all_texts]


def main():
    path = Path("src/transformers/models/gpt_neo/modeling_gpt_neo.py")
    out_path = Path("src/transformers/models/gpt_neo/modeling_tf_gpt_neo.py")
    split_fns = split_file(path)
    module_names = [get_module_name(fn) for fn in split_fns[1:]]
    module_names = [name for name in module_names if "load_tf_weights" not in name]
    translated_fns = [split_fns[0]]
    translated_fns += [translate_fn(fn) for fn in split_fns[1:]]
    output = '\n'.join(translated_fns)
    out_path.write_text(output)


if __name__ == '__main__':
    main()
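A small self-contained sketch (not in the commit) of what split_file's regex does: it splits a module into a preamble plus one chunk per top-level decorator/class/def, and each chunk is then fed to translate_fn. The toy_source string is invented for the example.

import re

pattern = r"\n\n((?:@|class |def ).*?)(?=\n\n@|\n\nclass |\n\ndef |$)"

toy_source = (
    "import torch\n"
    "from torch import nn\n"
    "\n\n"
    "class GPTNeoMLP(nn.Module):\n"
    "    def forward(self, x):\n"
    "        return x\n"
    "\n\n"
    "def helper(x):\n"
    "    return x\n"
)

blocks = list(re.finditer(pattern, toy_source, flags=re.DOTALL))
preamble = toy_source[:blocks[0].start()]
print(len(blocks))                                    # 2 top-level blocks: the class and the function
print([b.group(1).splitlines()[0] for b in blocks])   # ['class GPTNeoMLP(nn.Module):', 'def helper(x):']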
gpt4_convert_tests.py (new file, 107 lines)
@@ -0,0 +1,107 @@
# What are the steps in conversion?

# 1) Convert all functions and classes in the model file
# 2) Add the new classes to the module __init__
# 3) Add the new classes to the module autodoc
# 4) Update the root __init__ to import the new classes
# 5) Add the missing Auto classes
# 6) Add the module file to the doctest list
# - Happens automatically: Update the model support checklist
# - Happens automatically: Update the dummies

# TODO 1: Get GPT to generate the input shapes when it converts a module (not at top-level class!)
# TODO 2: Port weights from PT to TF versions and do equivalence testing
# TODO 3:

from pathlib import Path
import re
import openai
from time import sleep
from tqdm import tqdm
from argparse import ArgumentParser


def get_module_name(module_text: str):
    module_name = re.search(r"(?:class |def )(\w+)", module_text).group(1)
    return module_name


def translate_fn(module_text: str):
    system_text = """
You are a translation bot designed to translate code in the Hugging Face Transformers library from PyTorch to TensorFlow.

You will be passed a PyTorch class from a tests file. Your goal is to translate this class to test the equivalent TensorFlow model.

There are some guidelines you should follow when translating the code:

- Most model classes should be renamed by adding "TF" at the start of their name. However, tokenizers, processors and config classes are universal and so not renamed.
- You don't need to add any extra imports, you can assume that any other functions or classes you call will be imported for you.
- If the class calls other classes in the same module, you can assume that these have already been converted. Please add "TF" to the start of their name if required.
- You can use tensor.shape.rank as a TensorFlow replacement for tensor.ndim.
- Please use the Hugging Face function shape_list(), which returns a list, instead of tensor.shape or tf.shape(tensor) unless you need to treat the output as a tensor.
- PyTorch methods like .detach(), .eval() and .to(device) are not needed in TensorFlow.
- In tests, it's completely acceptable to construct arrays in NumPy instead of TensorFlow.
- Hugging Face methods like ids_tensor, floats_tensor and random_attention_mask are universal, so you can use them without changes.
- If the test looks like it should work without changes, please simply repeat it back without changes.
"""
    module_name = get_module_name(module_text)
    if "load_tf_weights" in module_name:
        print("Skipping", module_name)
        return ""
    prompt = [{"role": "system", "content": system_text}, {"role": "user", "content": module_text}]
    for i in range(5):
        try:
            response = openai.ChatCompletion.create(model="gpt-4", messages=prompt, temperature=0, stream=True)
            break
        except openai.error.RateLimitError:
            print(f"Rate limited, retrying ({i + 1} of 5)")
            sleep(15)
    else:
        raise RuntimeError("Rate limited too many times")
    chunks = []
    for chunk in tqdm(response, desc=f"Translating {module_name}", dynamic_ncols=True, unit=" tokens"):
        chunk_message = chunk['choices'][0]['delta']
        chunks.append(chunk_message)
    translated_function = ''.join([m.get('content', '') for m in chunks])
    return translated_function


def split_file(source_file: Path):
    text = source_file.read_text()
    top_level_fns = list(re.finditer(r"\n\n((?:@|class |def ).*?)(?=\n\n@|\n\nclass |\n\ndef |$)", text, flags=re.DOTALL))
    for i in range(len(top_level_fns) - 1):
        assert top_level_fns[i].end() == top_level_fns[i + 1].start()
    preamble = text[:top_level_fns[0].start()]
    all_texts = [preamble] + [m.group(0) for m in top_level_fns]
    for i in range(len(all_texts) - 1):
        class_text = all_texts[i]
        if len(class_text) > 8000:
            # Probably a big test class - let's extract methods individually
            # Remember they're each going to be indented by 4 spaces!
            methods = list(re.finditer(r"\n    (def .*?)(?=\n    def |$)", class_text, flags=re.DOTALL))
            method_preamble = class_text[:methods[0].start()]
            all_class_texts = [method_preamble] + [m.group(0) for m in methods]
            all_texts[i] = all_class_texts
        else:
            all_texts[i] = class_text.strip()

    return all_texts


def main():
    path = Path("tests/models/gpt_neo/test_modeling_gpt_neo.py")
    out_path = Path("tests/models/gpt_neo/test_modeling_tf_gpt_neo.py")
    split_classes = split_file(path)
    translated_classes = [split_classes[0]]
    for split_class in split_classes[1:]:
        if isinstance(split_class, list):
            partial_translation = [split_class[0]]
            for method in split_class[1:]:
                partial_translation.append(translate_fn(method))
            translated_classes.append('\n'.join(partial_translation))
        else:
            translated_classes.append(translate_fn(split_class))
    output = '\n'.join(translated_classes)
    out_path.write_text(output)


if __name__ == '__main__':
    main()
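One more illustration (not in the commit): for test classes longer than 8,000 characters, split_file falls back to splitting the class into its preamble plus one chunk per method, using the indented-def regex above. Ignoring the size threshold, the behaviour looks like this on an invented toy class.

import re

toy_class = (
    "class TFGPTNeoModelTest:\n"
    "    all_model_classes = ()\n"
    "\n"
    "    def test_config(self):\n"
    "        pass\n"
    "\n"
    "    def test_model(self):\n"
    "        pass\n"
)

methods = list(re.finditer(r"\n    (def .*?)(?=\n    def |$)", toy_class, flags=re.DOTALL))
method_preamble = toy_class[:methods[0].start()]
chunks = [method_preamble] + [m.group(0) for m in methods]
print(len(chunks))  # 3: the class header/attributes, then one chunk per test method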
@@ -3091,6 +3091,16 @@ else:
            "TFFunnelPreTrainedModel",
        ]
    )
    _import_structure["models.gpt_neo"].extend(
        [
            "TF_GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST",
            "TFGPTNeoForCausalLM",
            "TFGPTNeoForSequenceClassification",
            "TFGPTNeoModel",
            "TFGPTNeoMainLayer",
            "TFGPTNeoPreTrainedModel",
        ]
    )
    _import_structure["models.gpt2"].extend(
        [
            "TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -6311,6 +6321,14 @@ if TYPE_CHECKING:
            TFFunnelModel,
            TFFunnelPreTrainedModel,
        )
        from .models.gpt_neo import (
            TF_GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFGPTNeoForCausalLM,
            TFGPTNeoForSequenceClassification,
            TFGPTNeoModel,
            TFGPTNeoPreTrainedModel,
            TFGPTNeoMainLayer,
        )
        from .models.gpt2 import (
            TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFGPT2DoubleHeadsModel,
@@ -52,6 +52,7 @@ TF_MODEL_MAPPING_NAMES = OrderedDict(
        ("flaubert", "TFFlaubertModel"),
        ("funnel", ("TFFunnelModel", "TFFunnelBaseModel")),
        ("gpt-sw3", "TFGPT2Model"),
        ("gpt_neo", "TFGPTNeoModel"),
        ("gpt2", "TFGPT2Model"),
        ("gptj", "TFGPTJModel"),
        ("groupvit", "TFGroupViTModel"),
@@ -172,6 +173,7 @@ TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
        ("ctrl", "TFCTRLLMHeadModel"),
        ("gpt-sw3", "TFGPT2LMHeadModel"),
        ("gpt2", "TFGPT2LMHeadModel"),
        ("gpt_neo", "TFGPTNeoForCausalLM"),
        ("gptj", "TFGPTJForCausalLM"),
        ("openai-gpt", "TFOpenAIGPTLMHeadModel"),
        ("opt", "TFOPTForCausalLM"),
@@ -303,6 +305,7 @@ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
        ("flaubert", "TFFlaubertForSequenceClassification"),
        ("funnel", "TFFunnelForSequenceClassification"),
        ("gpt-sw3", "TFGPT2ForSequenceClassification"),
        ("gpt_neo", "TFGPTNeoForSequenceClassification"),
        ("gpt2", "TFGPT2ForSequenceClassification"),
        ("gptj", "TFGPTJForSequenceClassification"),
        ("layoutlm", "TFLayoutLMForSequenceClassification"),
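The three mapping entries above are what let the auto classes dispatch to the TF port. Purely as an illustration of the intended end state (not something that works on this branch until the port and PT-to-TF weight cross-loading are finished), usage would look like:

from transformers import AutoTokenizer, TFAutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
model = TFAutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B", from_pt=True)  # resolves to TFGPTNeoForCausalLM
inputs = tokenizer("Hello, my name is", return_tensors="tf")
logits = model(**inputs).logits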
@@ -13,7 +13,7 @@
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available, is_tf_available


_import_structure = {
@@ -35,6 +35,21 @@ else:
        "load_tf_weights_in_gpt_neo",
    ]

try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_gpt_neo"] = [
        "TF_GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFGPTNeoForCausalLM",
        "TFGPTNeoForSequenceClassification",
        "TFGPTNeoModel",
        "TFGPTNeoMainLayer",
        "TFGPTNeoPreTrainedModel",
    ]

try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
@@ -65,6 +80,20 @@ if TYPE_CHECKING:
            GPTNeoPreTrainedModel,
            load_tf_weights_in_gpt_neo,
        )
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_gpt_neo import (
            TF_GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFGPTNeoForCausalLM,
            TFGPTNeoForSequenceClassification,
            TFGPTNeoModel,
            TFGPTNeoPreTrainedModel,
            TFGPTNeoMainLayer,
        )

    try:
        if not is_flax_available():
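The pattern above mirrors the existing torch and flax gates: the TF classes are only registered in the lazy import structure when TensorFlow is installed, and otherwise the library serves dummy objects that raise on use (the "update the dummies" step noted in the checklist). A minimal sketch of the same gate from user code follows; the final import only works once this branch provides modeling_tf_gpt_neo.

from transformers.utils import OptionalDependencyNotAvailable, is_tf_available

try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    print("TensorFlow is not installed; the TFGPTNeo* classes would be dummy objects")
else:
    from transformers import TFGPTNeoModel  # available only with TF installed and this branch merged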
src/transformers/models/gpt_neo/modeling_tf_gpt_neo.py (new file, 799 lines)
@@ -0,0 +1,799 @@
# coding=utf-8
# Copyright 2021 The Eleuther AI and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 GPT Neo model."""


import os
from typing import Optional, Tuple, Union

import tensorflow as tf

from ...activations import ACT2FN
from ...modeling_tf_outputs import (
    TFBaseModelOutputWithPast,
    TFBaseModelOutputWithPastAndCrossAttentions,
    TFCausalLMOutputWithCrossAttentions,
    TFCausalLMOutputWithPast,
    TFSequenceClassifierOutputWithPast,
)
from ...modeling_tf_utils import TFPreTrainedModel, shape_list
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_gpt_neo import GPTNeoConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "GPTNeoConfig"

TF_GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "EleutherAI/gpt-neo-1.3B",
    # See all GPTNeo models at https://huggingface.co/models?filter=gpt_neo
]

_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neo-1.3B"


class TFGPTNeoSelfAttention(tf.keras.layers.Layer):
    def __init__(self, config, attention_type, **kwargs):
        super().__init__(**kwargs)

        max_positions = config.max_position_embeddings
        bias = tf.linalg.band_part(tf.ones((max_positions, max_positions), dtype=tf.bool), -1, 0)
        bias = tf.reshape(bias, (1, 1, max_positions, max_positions))

        if attention_type == "local":
            bias = tf.math.logical_xor(bias, tf.linalg.band_part(bias, -config.window_size, 0))

        self.bias = bias
        self.masked_bias = tf.constant(-1e9)

        self.attn_dropout = tf.keras.layers.Dropout(float(config.attention_dropout))
        self.resid_dropout = tf.keras.layers.Dropout(float(config.resid_dropout))

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        self.k_proj = tf.keras.layers.Dense(self.embed_dim, use_bias=False, name="k_proj")
        self.v_proj = tf.keras.layers.Dense(self.embed_dim, use_bias=False, name="v_proj")
        self.q_proj = tf.keras.layers.Dense(self.embed_dim, use_bias=False, name="q_proj")
        self.out_proj = tf.keras.layers.Dense(self.embed_dim, use_bias=True, name="out_proj")

    def _split_heads(self, tensor, num_heads, attn_head_size):
        new_shape = tensor.shape[:-1] + (num_heads, attn_head_size)
        tensor = tf.reshape(tensor, new_shape)
        return tf.transpose(tensor, perm=[0, 2, 1, 3])

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        tensor = tf.transpose(tensor, perm=[0, 2, 1, 3])
        new_shape = tensor.shape[:-2] + (num_heads * attn_head_size,)
        return tf.reshape(tensor, new_shape)

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        query = tf.cast(query, tf.float32)
        key = tf.cast(key, tf.float32)

        attn_weights = tf.matmul(query, key, transpose_b=True)

        query_length, key_length = query.shape[-2], key.shape[-2]
        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
        mask_value = tf.float32.min
        mask_value = tf.constant(mask_value, dtype=attn_weights.dtype)
        attn_weights = tf.where(causal_mask, attn_weights, mask_value)

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        attn_weights = tf.nn.softmax(attn_weights, axis=-1)
        attn_weights = tf.cast(attn_weights, value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = tf.matmul(attn_weights, value)

        return attn_output, attn_weights

    def call(
        self,
        hidden_states,
        attention_mask=None,
        layer_past=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        query = self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)

        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)

        if layer_past is not None:
            past_key = layer_past[0]
            past_value = layer_past[1]
            key = tf.concat((past_key, key), axis=-2)
            value = tf.concat((past_value, value), axis=-2)

        if use_cache is True:
            present = (key, value)
        else:
            present = None

        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)


class TFGPTNeoAttention(tf.keras.layers.Layer):
    def __init__(self, config, layer_id=0, **kwargs):
        super().__init__(**kwargs)
        self.layer_id = layer_id
        self.attention_layers = config.attention_layers
        self.attention_type = self.attention_layers[layer_id]

        if self.attention_type in ["global", "local"]:
            self.attention = TFGPTNeoSelfAttention(config, self.attention_type)
        else:
            raise NotImplementedError(
                "Only attn layer types 'global' and 'local' exist, but got `config.attention_layers`: "
                f"{config.attention_layers}. Select attn layer types from ['global', 'local'] only."
            )

    def call(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
        **kwargs
    ):
        return self.attention(
            hidden_states,
            attention_mask=attention_mask,
            layer_past=layer_past,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )


class TFGPTNeoMLP(tf.keras.layers.Layer):
    def __init__(self, intermediate_size, config, **kwargs):  # in MLP: intermediate_size = 4 * hidden_size
        super().__init__(**kwargs)
        embed_dim = config.hidden_size
        self.c_fc = tf.keras.layers.Dense(intermediate_size, name="c_fc")
        self.c_proj = tf.keras.layers.Dense(embed_dim, name="c_proj")
        self.act = ACT2FN[config.activation_function]
        self.dropout = tf.keras.layers.Dropout(float(config.resid_dropout))

    def call(self, hidden_states):
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class TFGPTNeoBlock(tf.keras.layers.Layer):
    def __init__(self, config, layer_id, **kwargs):
        super().__init__(**kwargs)
        hidden_size = config.hidden_size
        inner_dim = config.intermediate_size if config.intermediate_size is not None else 4 * hidden_size
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
        self.attn = TFGPTNeoAttention(config, layer_id, name="attn")
        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
        self.mlp = TFGPTNeoMLP(inner_dim, config, name="mlp")

    def call(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
        training=False,
    ):
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            training=training,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]
        # residual connection
        hidden_states = attn_output + residual

        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        feed_forward_hidden_states = self.mlp(hidden_states, training=training)
        # residual connection
        hidden_states = residual + feed_forward_hidden_states

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions, cross_attentions)


class TFGPTNeoPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = GPTNeoConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["TFGPTNeoBlock"]

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (tf.keras.layers.Dense,)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.kernel.assign(tf.random.normal(shape=module.kernel.shape, mean=0.0, stddev=self.config.initializer_range))
            if module.bias is not None:
                module.bias.assign(tf.zeros(shape=module.bias.shape))
        elif isinstance(module, tf.keras.layers.Embedding):
            module.embeddings.assign(tf.random.normal(shape=module.embeddings.shape, mean=0.0, stddev=self.config.initializer_range))
        elif isinstance(module, tf.keras.layers.LayerNormalization):
            module.bias.assign(tf.zeros(shape=module.bias.shape))
            module.gamma.assign(tf.ones(shape=module.gamma.shape))

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, TFGPTNeoModel):
            module.gradient_checkpointing = value


GPT_NEO_START_DOCSTRING = r"""

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a TensorFlow [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass.
    Use it as a regular TensorFlow Model and refer to the TensorFlow documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`GPTNeoConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""

GPT_NEO_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`tf.Tensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.num_layers`):
            Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
            `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
            their past given to this model should not be passed as `input_ids` as they have already been computed.
        attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`tf.Tensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.

            If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
            `past_key_values`).
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare GPT Neo Model transformer outputting raw hidden-states without any specific head on top.",
    GPT_NEO_START_DOCSTRING,
)
class TFGPTNeoModel(TFGPTNeoPreTrainedModel):
    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)

        self.embed_dim = config.hidden_size
        self.wte = tf.keras.layers.Embedding(config.vocab_size, self.embed_dim, name="wte")
        self.wpe = tf.keras.layers.Embedding(config.max_position_embeddings, self.embed_dim, name="wpe")
        self.drop = tf.keras.layers.Dropout(float(config.embed_dropout))
        self.h = [TFGPTNeoBlock(config, layer_id=i, name=f"h_{i}") for i in range(config.num_layers)]
        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")

        self.gradient_checkpointing = False

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: Optional[tf.Tensor] = None,
        past_key_values: Optional[Tuple[tf.Tensor]] = None,
        attention_mask: Optional[tf.Tensor] = None,
        token_type_ids: Optional[tf.Tensor] = None,
        position_ids: Optional[tf.Tensor] = None,
        head_mask: Optional[tf.Tensor] = None,
        inputs_embeds: Optional[tf.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = None,
    ) -> Union[Tuple[tf.Tensor], TFBaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
            input_ids = tf.reshape(input_ids, (-1, input_shape[-1]))
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = shape_list(inputs_embeds)[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if token_type_ids is not None:
            token_type_ids = tf.reshape(token_type_ids, (-1, input_shape[-1]))
        if position_ids is not None:
            position_ids = tf.reshape(position_ids, (-1, input_shape[-1]))

        if past_key_values is None:
            past_length = 0
            past_key_values = [None] * len(self.h)
        else:
            past_length = past_key_values[0][0].shape[-2]

        if position_ids is None:
            position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)
            position_ids = tf.expand_dims(position_ids, 0)
            position_ids = tf.reshape(position_ids, (-1, input_shape[-1]))

        # Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = tf.reshape(attention_mask, (batch_size, -1))
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and the dtype's smallest value for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = tf.cast(attention_mask, dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * tf.float32.min

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x num_heads x N x N
        # head_mask has shape n_layer x batch x num_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states, training=training)

        output_shape = input_shape + (hidden_states.shape[-1],)

        if self.gradient_checkpointing and training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=layer_past,
                attention_mask=attention_mask,
                head_mask=head_mask[i],
                use_cache=use_cache,
                output_attentions=output_attentions,
                training=training,
            )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = self.ln_f(hidden_states)

        hidden_states = tf.reshape(hidden_states, output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return TFBaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@add_start_docstrings(
    """
    The GPT Neo Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    GPT_NEO_START_DOCSTRING,
)
class TFGPTNeoForCausalLM(TFGPTNeoPreTrainedModel):
    _keys_to_ignore_on_load_missing = [
        r"h\.\d+\.attn\.masked_bias",
        r"lm_head.weight",
        r"h\.\d+\.attn\.attention\.bias",
    ]
    _keys_to_ignore_on_save = [r"lm_head.weight"]

    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        self.transformer = TFGPTNeoModel(config, name="transformer")
        self.lm_head = tf.keras.layers.Dense(config.vocab_size, use_bias=False, name="lm_head")

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        token_type_ids = kwargs.get("token_type_ids", None)
        # only last token for inputs_ids if past is defined in kwargs
        if past_key_values:
            input_ids = input_ids[:, -1:]
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1:]

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = tf.math.cumsum(tf.cast(attention_mask, tf.int64), axis=-1) - 1
            position_ids = tf.where(attention_mask == 0, 1, position_ids)
            if past_key_values:
                position_ids = position_ids[:, -1:]
        else:
            position_ids = None
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }

    @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFCausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: Optional[tf.Tensor] = None,
        past_key_values: Optional[Tuple[tf.Tensor]] = None,
        attention_mask: Optional[tf.Tensor] = None,
        token_type_ids: Optional[tf.Tensor] = None,
        position_ids: Optional[tf.Tensor] = None,
        head_mask: Optional[tf.Tensor] = None,
        inputs_embeds: Optional[tf.Tensor] = None,
        labels: Optional[tf.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[Tuple[tf.Tensor], TFCausalLMOutputWithCrossAttentions]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Compute loss in fp32 to match with mesh-tf version
            # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
            lm_logits = tf.cast(lm_logits, tf.float32)

            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            # Flatten the tokens
            loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
            loss = loss_fct(shift_labels, shift_logits)

            lm_logits = tf.cast(lm_logits, hidden_states.dtype)
            loss = tf.cast(loss, hidden_states.dtype)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFCausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[tf.Tensor]], beam_idx: tf.Tensor
    ) -> Tuple[Tuple[tf.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(tf.gather(past_state, beam_idx) for past_state in layer_past)
            for layer_past in past_key_values
        )


@add_start_docstrings(
    """
    The GPTNeo Model transformer with a sequence classification head on top (linear layer).

    [`TFGPTNeoForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    GPT_NEO_START_DOCSTRING,
)
class TFGPTNeoForSequenceClassification(TFGPTNeoPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"]

    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        self.num_labels = config.num_labels
        self.transformer = TFGPTNeoModel(config, name="transformer")
        self.score = tf.keras.layers.Dense(self.num_labels, use_bias=False, name="score")

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFSequenceClassifierOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: Optional[tf.Tensor] = None,
        past_key_values: Optional[Tuple[tf.Tensor]] = None,
        attention_mask: Optional[tf.Tensor] = None,
        token_type_ids: Optional[tf.Tensor] = None,
        position_ids: Optional[tf.Tensor] = None,
        head_mask: Optional[tf.Tensor] = None,
        inputs_embeds: Optional[tf.Tensor] = None,
        labels: Optional[tf.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[tf.Tensor], TFSequenceClassifierOutputWithPast]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = shape_list(input_ids)[:2]
        else:
            batch_size, sequence_length = shape_list(inputs_embeds)[:2]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = (tf.math.not_equal(input_ids, self.config.pad_token_id).numpy().sum(-1) - 1)
            else:
                sequence_lengths = -1
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[tf.range(batch_size), sequence_lengths]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == tf.int64 or labels.dtype == tf.int32):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = tf.keras.losses.MeanSquaredError()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits[:, tf.newaxis], labels[:, tf.newaxis])
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
                loss = loss_fct(labels, pooled_logits)
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = tf.keras.losses.BinaryCrossentropy(from_logits=True)
                loss = loss_fct(labels, pooled_logits)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
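The file above still needs the equivalence check called out in TODO 2. A rough sketch of that test follows (not part of the commit; it assumes PT-to-TF weight cross-loading works for GPT Neo and uses a small checkpoint purely for illustration):

import numpy as np
import tensorflow as tf
import torch
from transformers import GPTNeoForCausalLM, TFGPTNeoForCausalLM

checkpoint = "EleutherAI/gpt-neo-125m"
pt_model = GPTNeoForCausalLM.from_pretrained(checkpoint)
tf_model = TFGPTNeoForCausalLM.from_pretrained(checkpoint, from_pt=True)

input_ids = np.array([[464, 3290, 318, 257, 922]])  # arbitrary token ids
with torch.no_grad():
    pt_logits = pt_model(torch.tensor(input_ids)).logits.numpy()
tf_logits = tf_model(tf.constant(input_ids)).logits.numpy()

print("max abs diff:", np.abs(pt_logits - tf_logits).max())  # should be ~1e-5 if the port is faithful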