mirror of https://github.com/huggingface/transformers.git
synced 2025-10-21 01:23:56 +08:00

Compare commits: reference_ ... v4.11.3
23 Commits
Commits (SHA1):
65659a29cf
3202896ec9
bb2caca727
1c11636862
e9fc92d7bb
6f06c58e7d
3f4eaf0692
a8186b0128
c2901b093b
d7db364abc
3627a4b7a6
e6c5752865
3642d13b1b
66c81b2fc1
218b58aff8
7655f11076
6b87918441
54f9d62c61
22d3156881
9bb3d33a46
a05400e020
10083244a3
11144a3048
@ -27,7 +27,10 @@ author = "huggingface"
# The short X.Y version
version = ""
# The full version, including alpha/beta/rc tags
release = "4.11.0"
release = "4.11.3"
@ -41,6 +41,13 @@ DPRConfig
    :members:


DPRPreTrainedModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.DPRPreTrainedModel
    :members:


DPRContextEncoderTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -24,19 +24,33 @@ This model was contributed by `Stella Biderman <https://huggingface.co/stellaath

Tips:

- Running [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) in float32 precision on GPU requires at least 24 GB of
RAM. On GPUs with less than 24 GB RAM, one should therefore load the model in half-precision:
- To load `GPT-J <https://huggingface.co/EleutherAI/gpt-j-6B>`__ in float32 one would need at least 2x model size CPU
RAM: 1x for initial weights and another 1x to load the checkpoint. So for GPT-J it would take at least 48GB of CPU
RAM to just load the model. To reduce the CPU RAM usage there are a few options. The ``torch_dtype`` argument can be
used to initialize the model in half-precision. And the ``low_cpu_mem_usage`` argument can be used to keep the RAM
usage to 1x. There is also a `fp16 branch <https://huggingface.co/EleutherAI/gpt-j-6B/tree/float16>`__ which stores
the fp16 weights, which could be used to further minimize the RAM usage. Combining all this it should take roughly
12.1GB of CPU RAM to load the model.

.. code-block::

    >>> from transformers import GPTJForCausalLM
    >>> import torch

    >>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
    >>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True)
- The model should fit on 16GB GPU for inference. For training/fine-tuning it would take much more GPU RAM. Adam
optimizer for example makes four copies of the model: model, gradients, average and squared average of the gradients.
So it would need at least 4x model size GPU memory, even with mixed precision as gradient updates are in fp32. This
is not including the activations and data batches, which would again require some more GPU RAM. So one should explore
solutions such as DeepSpeed, to train/fine-tune the model. Another option is to use the original codebase to
train/fine-tune the model on TPU and then convert the model to Transformers format for inference. Instructions for
that could be found `here <https://github.com/kingoflolz/mesh-transformer-jax/blob/master/howto_finetune.md>`__

- Although the embedding matrix has a size of 50400, only 50257 entries are used by the GPT-2 tokenizer. These extra
tokens are added for the sake of efficiency on TPUs. To avoid the mis-match between embedding matrix size and vocab
size, the tokenizer for [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) contains 143 extra tokens
size, the tokenizer for `GPT-J <https://huggingface.co/EleutherAI/gpt-j-6B>`__ contains 143 extra tokens
``<|extratoken_1|>... <|extratoken_143|>``, so the ``vocab_size`` of tokenizer also becomes 50400.
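A minimal way to see this size agreement in practice (a sketch; it assumes hub access, and the printed values are the ones stated above):

.. code-block::

    >>> from transformers import AutoConfig, AutoTokenizer

    >>> config = AutoConfig.from_pretrained("EleutherAI/gpt-j-6B")
    >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
    >>> config.vocab_size  # size of the embedding matrix
    50400
    >>> len(tokenizer)  # 50257 GPT-2 tokens + 143 <|extratoken_*|> tokens
    50400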
Generation
@ -42,6 +42,7 @@ Ready-made configurations include the following models:

- BERT
- DistilBERT
- GPT-2
- LayoutLM
- RoBERTa
- T5
- XLM-RoBERTa
setup.py (2 changed lines)

@ -344,7 +344,7 @@ install_requires = [

setup(
name="transformers",
version="4.11.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.11.3", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Suraj Patil, Stas Bekman, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
author_email="thomas@huggingface.co",
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
@ -22,7 +22,7 @@
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).

__version__ = "4.11.0"
__version__ = "4.11.3"

# Work around to update TensorFlow's absl.logging threshold which alters the
# default Python logging output behavior when present.
@ -773,6 +773,7 @@ if is_torch_available():
"DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST",
"DPRContextEncoder",
"DPRPretrainedContextEncoder",
"DPRPreTrainedModel",
"DPRPretrainedQuestionEncoder",
"DPRPretrainedReader",
"DPRQuestionEncoder",

@ -2512,6 +2513,7 @@ if TYPE_CHECKING:
DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST,
DPRContextEncoder,
DPRPretrainedContextEncoder,
DPRPreTrainedModel,
DPRPretrainedQuestionEncoder,
DPRPretrainedReader,
DPRQuestionEncoder,
@ -332,7 +332,7 @@ class PretrainedConfig(PushToHubMixin):
self.transformers_version = kwargs.pop("transformers_version", None)

# Deal with gradient checkpointing
if kwargs.get("gradient_checkpointing", True):
if kwargs.get("gradient_checkpointing", False):
warnings.warn(
"Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
"Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the "
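For reference, the replacement this warning points to looks roughly like the following (a minimal sketch; the checkpoint name is only illustrative, and any PyTorch model that supports gradient checkpointing works the same way):

.. code-block::

    from transformers import AutoModelForSequenceClassification

    # Instead of passing gradient_checkpointing=True at config initialization,
    # enable it on the instantiated model:
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    model.gradient_checkpointing_enable()  # trades extra compute for lower activation memory during training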
@ -1291,7 +1291,7 @@ class BartForConditionalGeneration(BartPretrainedModel):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

if labels is not None:
if decoder_input_ids is None:
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(
labels, self.config.pad_token_id, self.config.decoder_start_token_id
)
@ -2501,7 +2501,7 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

if labels is not None:
if decoder_input_ids is None:
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(
labels, self.config.pad_token_id, self.config.decoder_start_token_id
)
@ -46,6 +46,7 @@ if is_torch_available():
"DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST",
"DPRContextEncoder",
"DPRPretrainedContextEncoder",
"DPRPreTrainedModel",
"DPRPretrainedQuestionEncoder",
"DPRPretrainedReader",
"DPRQuestionEncoder",

@ -89,6 +90,7 @@ if TYPE_CHECKING:
DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST,
DPRContextEncoder,
DPRPretrainedContextEncoder,
DPRPreTrainedModel,
DPRPretrainedQuestionEncoder,
DPRPretrainedReader,
DPRQuestionEncoder,
@ -147,7 +147,29 @@ class DPRReaderOutput(ModelOutput):
attentions: Optional[Tuple[torch.FloatTensor]] = None

class DPREncoder(PreTrainedModel):
class DPRPreTrainedModel(PreTrainedModel):
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)

def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, BertEncoder):
module.gradient_checkpointing = value

class DPREncoder(DPRPreTrainedModel):

base_model_prefix = "bert_model"

@ -200,13 +222,8 @@ class DPREncoder(PreTrainedModel):
return self.encode_proj.out_features
return self.bert_model.config.hidden_size

def init_weights(self):
self.bert_model.init_weights()
if self.projection_dim > 0:
self.encode_proj.apply(self.bert_model._init_weights)

class DPRSpanPredictor(PreTrainedModel):
class DPRSpanPredictor(DPRPreTrainedModel):

base_model_prefix = "encoder"

@ -262,16 +279,13 @@ class DPRSpanPredictor(PreTrainedModel):
attentions=outputs.attentions,
)

def init_weights(self):
self.encoder.init_weights()

##################
# PreTrainedModel
##################

class DPRPretrainedContextEncoder(PreTrainedModel):
class DPRPretrainedContextEncoder(DPRPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.

@ -282,11 +296,8 @@ class DPRPretrainedContextEncoder(PreTrainedModel):
base_model_prefix = "ctx_encoder"
_keys_to_ignore_on_load_missing = [r"position_ids"]

def init_weights(self):
self.ctx_encoder.init_weights()

class DPRPretrainedQuestionEncoder(PreTrainedModel):
class DPRPretrainedQuestionEncoder(DPRPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.

@ -297,15 +308,8 @@ class DPRPretrainedQuestionEncoder(PreTrainedModel):
base_model_prefix = "question_encoder"
_keys_to_ignore_on_load_missing = [r"position_ids"]

def init_weights(self):
self.question_encoder.init_weights()

def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, BertEncoder):
module.gradient_checkpointing = value

class DPRPretrainedReader(PreTrainedModel):
class DPRPretrainedReader(DPRPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.

@ -316,15 +320,6 @@ class DPRPretrainedReader(PreTrainedModel):
base_model_prefix = "span_predictor"
_keys_to_ignore_on_load_missing = [r"position_ids"]

def init_weights(self):
self.span_predictor.encoder.init_weights()
self.span_predictor.qa_classifier.apply(self.span_predictor.encoder.bert_model._init_weights)
self.span_predictor.qa_outputs.apply(self.span_predictor.encoder.bert_model._init_weights)

def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, BertEncoder):
module.gradient_checkpointing = value

###############
# Actual Models
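A rough illustration of what this refactor enables (a sketch, not part of the diff; the checkpoint and projection size are only illustrative): since weight initialization now lives in DPRPreTrainedModel._init_weights, loading a DPR checkpoint with a modified head initializes the new weights through the standard from_pretrained path.

.. code-block::

    from transformers import DPRQuestionEncoder

    # The extra projection layer is not in the checkpoint, so it is freshly
    # initialized via DPRPreTrainedModel._init_weights while the rest is loaded.
    model = DPRQuestionEncoder.from_pretrained(
        "facebook/dpr-question_encoder-single-nq-base", projection_dim=512
    )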
@ -964,6 +964,14 @@ class HubertForCTC(HubertPreTrainedModel):

self.hubert = HubertModel(config)
self.dropout = nn.Dropout(config.final_dropout)

if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

self.init_weights()
@ -183,11 +183,6 @@ class LayoutLMOnnxConfig(OnnxConfig):
raise ValueError("Cannot generate dummy inputs without PyTorch installed.")
import torch

input_dict["bbox"] = torch.tensor(
[
[0] * 4,
*[box] * seq_length,
[self.max_2d_positions] * 4,
]
).tile(batch_size, 1, 1)
batch_size, seq_length = input_dict["input_ids"].shape
input_dict["bbox"] = torch.tensor([*[box] * seq_length]).tile(batch_size, 1, 1)
return input_dict
src/transformers/models/megatron_gpt2/__init__.py (new file, 17 lines)

@ -0,0 +1,17 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.

# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -1416,6 +1416,14 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):

self.wav2vec2 = Wav2Vec2Model(config)
self.dropout = nn.Dropout(config.final_dropout)

if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`."
"or define `vocab_size` of your model's configuration."
)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

self.init_weights()
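The new error message spells out the fix; a minimal sketch of what it asks for (the checkpoint name and vocabulary size are only illustrative here):

.. code-block::

    from transformers import Wav2Vec2ForCTC

    # If the checkpoint's config does not define `vocab_size`, pass it explicitly so
    # the CTC head (`lm_head`) can be sized to the tokenizer's vocabulary.
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base", vocab_size=32)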
@ -791,7 +791,7 @@ class Pipeline(_ScikitCompat):
elif isinstance(inputs, tuple):
return tuple([self._ensure_tensor_on_device(item, device) for item in inputs])
elif isinstance(inputs, torch.Tensor):
return inputs.to(self.device)
return inputs.to(device)
else:
return inputs
@ -248,7 +248,13 @@ class QuestionAnsweringPipeline(Pipeline):
return super().__call__(examples[0], **kwargs)
return super().__call__(examples, **kwargs)

def preprocess(self, example, padding="do_not_pad", doc_stride=128, max_question_len=64, max_seq_len=384):
def preprocess(self, example, padding="do_not_pad", doc_stride=None, max_question_len=64, max_seq_len=None):

if max_seq_len is None:
max_seq_len = min(self.tokenizer.model_max_length, 384)
if doc_stride is None:
doc_stride = min(max_seq_len // 4, 128)

if not self.tokenizer.is_fast:
features = squad_convert_examples_to_features(
examples=[example],

@ -277,7 +283,6 @@ class QuestionAnsweringPipeline(Pipeline):
return_offsets_mapping=True,
return_special_tokens_mask=True,
)

# When the input is too long, it's converted in a batch of inputs with overflowing tokens
# and a stride of overlap between the inputs. If a batch of inputs is given, a special output
# "overflow_to_sample_mapping" indicate which member of the encoded batch belong to which original batch sample.
@ -308,12 +313,15 @@ class QuestionAnsweringPipeline(Pipeline):
token_type_ids_span_idx = (
encoded_inputs["token_type_ids"][span_idx] if "token_type_ids" in encoded_inputs else None
)
submask = p_mask[span_idx]
if isinstance(submask, np.ndarray):
submask = submask.tolist()
features.append(
SquadFeatures(
input_ids=input_ids_span_idx,
attention_mask=attention_mask_span_idx,
token_type_ids=token_type_ids_span_idx,
p_mask=p_mask[span_idx].tolist(),
p_mask=submask,
encoding=encoded_inputs[span_idx],
# We don't use the rest of the values - and actually
# for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample

@ -330,26 +338,41 @@ class QuestionAnsweringPipeline(Pipeline):
qas_id=None,
)
)
return {"features": features, "example": example}

split_features = []
for feature in features:
fw_args = {}
others = {}
model_input_names = self.tokenizer.model_input_names

for k, v in feature.__dict__.items():
if k in model_input_names:
if self.framework == "tf":
tensor = tf.constant(v)
if tensor.dtype == tf.int64:
tensor = tf.cast(tensor, tf.int32)
fw_args[k] = tf.expand_dims(tensor, 0)
elif self.framework == "pt":
tensor = torch.tensor(v)
if tensor.dtype == torch.int32:
tensor = tensor.long()
fw_args[k] = tensor.unsqueeze(0)
else:
others[k] = v
split_features.append({"fw_args": fw_args, "others": others})
return {"features": split_features, "example": example}

def _forward(self, model_inputs):
features = model_inputs["features"]
example = model_inputs["example"]
model_input_names = self.tokenizer.model_input_names
fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}

if self.framework == "tf":
fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
start, end = self.model(fw_args)[:2]
start, end = start.numpy(), end.numpy()
elif self.framework == "pt":
# Retrieve the score for the context tokens only (removing question tokens)
fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
# On Windows, the default int type in numpy is np.int32 so we get some non-long tensors.
fw_args = {k: v.long() if v.dtype == torch.int32 else v for (k, v) in fw_args.items()}
starts = []
ends = []
for feature in features:
fw_args = feature["fw_args"]
start, end = self.model(**fw_args)[:2]
start, end = start.cpu().numpy(), end.cpu().numpy()
return {"start": start, "end": end, "features": features, "example": example}
starts.append(start)
ends.append(end)
return {"starts": starts, "ends": ends, "features": features, "example": example}

def postprocess(
self,
@ -360,90 +383,89 @@ class QuestionAnsweringPipeline(Pipeline):
):
min_null_score = 1000000 # large and positive
answers = []
start_ = model_outputs["start"][0]
end_ = model_outputs["end"][0]
feature = model_outputs["features"][0]
example = model_outputs["example"]
# Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
undesired_tokens = np.abs(np.array(feature.p_mask) - 1)
for i, (feature_, start_, end_) in enumerate(
zip(model_outputs["features"], model_outputs["starts"], model_outputs["ends"])
):
feature = feature_["others"]
# Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
undesired_tokens = np.abs(np.array(feature["p_mask"]) - 1)

if feature.attention_mask is not None:
undesired_tokens = undesired_tokens & feature.attention_mask
if feature_["fw_args"].get("attention_mask", None) is not None:
undesired_tokens = undesired_tokens & feature_["fw_args"]["attention_mask"].numpy()

# Generate mask
undesired_tokens_mask = undesired_tokens == 0.0
# Generate mask
undesired_tokens_mask = undesired_tokens == 0.0

# Make sure non-context indexes in the tensor cannot contribute to the softmax
start_ = np.where(undesired_tokens_mask, -10000.0, start_)
end_ = np.where(undesired_tokens_mask, -10000.0, end_)
# Make sure non-context indexes in the tensor cannot contribute to the softmax
start_ = np.where(undesired_tokens_mask, -10000.0, start_)
end_ = np.where(undesired_tokens_mask, -10000.0, end_)

# Normalize logits and spans to retrieve the answer
start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))
# Normalize logits and spans to retrieve the answer
start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))

if handle_impossible_answer:
min_null_score = min(min_null_score, (start_[0] * end_[0]).item())
if handle_impossible_answer:
min_null_score = min(min_null_score, (start_[0] * end_[0]).item())

# Mask CLS
start_[0] = end_[0] = 0.0
# Mask CLS
start_[0, 0] = end_[0, 0] = 0.0

starts, ends, scores = self.decode(start_, end_, top_k, max_answer_len, undesired_tokens)
if not self.tokenizer.is_fast:
char_to_word = np.array(example.char_to_word_offset)
starts, ends, scores = self.decode(start_, end_, top_k, max_answer_len, undesired_tokens)
if not self.tokenizer.is_fast:
char_to_word = np.array(example.char_to_word_offset)

# Convert the answer (tokens) back to the original text
# Score: score from the model
# Start: Index of the first character of the answer in the context string
# End: Index of the character following the last character of the answer in the context string
# Answer: Plain text of the answer
for s, e, score in zip(starts, ends, scores):
answers.append(
{
"score": score.item(),
"start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
"end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
"answer": " ".join(
example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
),
}
)
else:
# Convert the answer (tokens) back to the original text
# Score: score from the model
# Start: Index of the first character of the answer in the context string
# End: Index of the character following the last character of the answer in the context string
# Answer: Plain text of the answer
question_first = bool(self.tokenizer.padding_side == "right")
enc = feature.encoding
# Convert the answer (tokens) back to the original text
# Score: score from the model
# Start: Index of the first character of the answer in the context string
# End: Index of the character following the last character of the answer in the context string
# Answer: Plain text of the answer
for s, e, score in zip(starts, ends, scores):
token_to_orig_map = feature["token_to_orig_map"]
answers.append(
{
"score": score.item(),
"start": np.where(char_to_word == token_to_orig_map[s])[0][0].item(),
"end": np.where(char_to_word == token_to_orig_map[e])[0][-1].item(),
"answer": " ".join(example.doc_tokens[token_to_orig_map[s] : token_to_orig_map[e] + 1]),
}
)
else:
# Convert the answer (tokens) back to the original text
# Score: score from the model
# Start: Index of the first character of the answer in the context string
# End: Index of the character following the last character of the answer in the context string
# Answer: Plain text of the answer
question_first = bool(self.tokenizer.padding_side == "right")
enc = feature["encoding"]

# Sometimes the max probability token is in the middle of a word so:
# - we start by finding the right word containing the token with `token_to_word`
# - then we convert this word in a character span with `word_to_chars`
sequence_index = 1 if question_first else 0
for s, e, score in zip(starts, ends, scores):
try:
start_word = enc.token_to_word(s)
end_word = enc.token_to_word(e)
start_index = enc.word_to_chars(start_word, sequence_index=sequence_index)[0]
end_index = enc.word_to_chars(end_word, sequence_index=sequence_index)[1]
except Exception:
# Some tokenizers don't really handle words. Keep to offsets then.
start_index = enc.offsets[s][0]
end_index = enc.offsets[e][1]
# Sometimes the max probability token is in the middle of a word so:
# - we start by finding the right word containing the token with `token_to_word`
# - then we convert this word in a character span with `word_to_chars`
sequence_index = 1 if question_first else 0
for s, e, score in zip(starts, ends, scores):
try:
start_word = enc.token_to_word(s)
end_word = enc.token_to_word(e)
start_index = enc.word_to_chars(start_word, sequence_index=sequence_index)[0]
end_index = enc.word_to_chars(end_word, sequence_index=sequence_index)[1]
except Exception:
# Some tokenizers don't really handle words. Keep to offsets then.
start_index = enc.offsets[s][0]
end_index = enc.offsets[e][1]

answers.append(
{
"score": score.item(),
"start": start_index,
"end": end_index,
"answer": example.context_text[start_index:end_index],
}
)
answers.append(
{
"score": score.item(),
"start": start_index,
"end": end_index,
"answer": example.context_text[start_index:end_index],
}
)

if handle_impossible_answer:
answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""})

answers = sorted(answers, key=lambda x: x["score"], reverse=True)[:top_k]
answers = sorted(answers, key=lambda x: x["score"], reverse=True)[:top_k]
if len(answers) == 1:
return answers[0]
return answers
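Taken together, the question-answering pipeline now splits a long context into several overlapping features, runs the model per feature, and picks the best answer across all of them. A minimal usage sketch (the default model is downloaded on first use; the texts are illustrative):

.. code-block::

    from transformers import pipeline

    qa = pipeline("question-answering")

    # A context longer than max_seq_len is split using doc_stride; answers from
    # every chunk are scored and the best one is returned.
    long_context = "HuggingFace was founded in Paris. " * 200
    result = qa(question="Where was HuggingFace founded?", context=long_context)
    print(result)  # {'score': ..., 'start': ..., 'end': ..., 'answer': ...}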
@ -158,6 +158,9 @@ class TextGenerationPipeline(Pipeline):

def _forward(self, model_inputs, **generate_kwargs):
input_ids = model_inputs["input_ids"]
# Allow empty prompts
if input_ids.shape[1] == 0:
input_ids = None
prompt_text = model_inputs.pop("prompt_text")
generated_sequence = self.model.generate(input_ids=input_ids, **generate_kwargs) # BS x SL
return {"generated_sequence": generated_sequence, "input_ids": input_ids, "prompt_text": prompt_text}
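With this change an empty prompt is forwarded as input_ids=None, so generation starts from the model's BOS token when one exists. A minimal sketch (gpt2 is only an illustrative choice of a model whose tokenizer defines a BOS token):

.. code-block::

    from transformers import pipeline

    generator = pipeline("text-generation", model="gpt2")

    # Empty prompt: generation starts from the BOS token alone.
    print(generator(""))
    print(generator("This is a test"))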
@ -204,9 +204,10 @@ class TokenClassificationPipeline(Pipeline):
offset_mapping = model_inputs.pop("offset_mapping", None)
sentence = model_inputs.pop("sentence")
if self.framework == "tf":
outputs = self.model(model_inputs.data)[0][0].numpy()
outputs = self.model(model_inputs.data)[0][0]
else:
outputs = self.model(**model_inputs)[0][0].numpy()
outputs = self.model(**model_inputs)[0][0]

return {
"outputs": outputs,
"special_tokens_mask": special_tokens_mask,

@ -216,7 +217,7 @@ class TokenClassificationPipeline(Pipeline):
}

def postprocess(self, model_outputs, aggregation_strategy=AggregationStrategy.NONE):
outputs = model_outputs["outputs"]
outputs = model_outputs["outputs"].numpy()
sentence = model_outputs["sentence"]
input_ids = model_outputs["input_ids"][0]
offset_mapping = model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None
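Moving the .numpy() call from _forward to postprocess keeps framework tensors (including CUDA tensors) inside the forward pass and only converts them at the end, which is what the GPU test added further down exercises. A usage sketch (it assumes a CUDA device is available; the default token-classification model is used):

.. code-block::

    from transformers import pipeline

    ner = pipeline("token-classification", device=0)  # device=0 selects the first GPU
    print(ner("This is dummy sentence"))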
@ -191,10 +191,7 @@ class ZeroShotClassificationPipeline(Pipeline):
else:
raise ValueError(f"Unable to understand extra arguments {args}")

result = super().__call__(sequences, **kwargs)
if len(result) == 1:
return result[0]
return result
return super().__call__(sequences, **kwargs)

def preprocess(self, inputs, candidate_labels=None, hypothesis_template="This example is {}."):
sequence_pairs, sequences = self._args_parser(inputs, candidate_labels, hypothesis_template)

@ -264,4 +261,6 @@ class ZeroShotClassificationPipeline(Pipeline):
"scores": scores[iseq, top_inds].tolist(),
}
)
if len(result) == 1:
return result[0]
return result
@ -20,6 +20,7 @@ import bisect
import itertools
import re
import unicodedata
from collections import OrderedDict
from typing import Any, Dict, List, Optional, Tuple, Union, overload

from .file_utils import PaddingStrategy, TensorType, add_end_docstrings

@ -102,7 +103,6 @@ class Trie:
>>> trie.split("[CLS] This is a extra_id_100")
["[CLS]", " This is a ", "extra_id_100"]
"""

# indexes are counted left of the chars index.
# "hello", index 0, is left of h, index 1 is between h and e.
# index 5 is right of the "o".

@ -115,7 +115,7 @@ class Trie:
# If the trie contains, "blowing", and "lower" and we encounter the
# string "blower", we need to split into ["b", "lower"].
# This is where we need to keep track of multiple possible starts.
states = {}
states = OrderedDict()

# This will contain every indices where we need
# to cut.

@ -144,36 +144,36 @@ class Trie:

# In this case, we already have partial matches (But unfinished)
for start, trie_pointer in states.items():
if current_char in trie_pointer:
if "" in trie_pointer:
# This is a final match, we need to reset and
# store the results in `offsets`.

# Lookahead to match longest first
# Important in case of extra_id_1 vs extra_id_100
lookahead_index = current
end = current
next_char = text[lookahead_index] if lookahead_index < len(text) else None
while next_char in trie_pointer:
trie_pointer = trie_pointer[next_char]
lookahead_index += 1
if "" in trie_pointer:
end = lookahead_index
skip = lookahead_index

if lookahead_index == len(text):
# End of string
break
next_char = text[lookahead_index]
# End lookahead

# Storing and resetting
offsets.append(start)
offsets.append(end)
reset = True
elif current_char in trie_pointer:
# The current character being looked at has a match within the trie
# update the pointer (it will be stored back into states later).
trie_pointer = trie_pointer[current_char]
if "" in trie_pointer:
# This is a final match, we need to reset and
# store the results in `offsets`.

# Lookahead to match longest first
# Important in case of extra_id_1 vs extra_id_100
lookahead_index = current + 1
end = current + 1
next_char = text[lookahead_index] if lookahead_index < len(text) else None
while next_char in trie_pointer:
trie_pointer = trie_pointer[next_char]
lookahead_index += 1
if "" in trie_pointer:
end = lookahead_index
skip = lookahead_index

if lookahead_index == len(text):
# End of string
break
next_char = text[lookahead_index]
# End lookahead

# Storing and resetting
offsets.append(start)
offsets.append(end)
reset = True

# Storing back the new pointer into the states.
# Partial matches got longer by one.
@ -198,6 +198,18 @@ class Trie:
if current_char in self.data:
states[current] = self.data[current_char]

# We have a cut at the end with states.
for start, trie_pointer in states.items():
if "" in trie_pointer:
# This is a final match, we need to reset and
# store the results in `offsets`.
end = len(text)
offsets.append(start)
offsets.append(end)
# Longest cut is always the one with lower start so the first
# item so we need to break.
break

# We have all the offsets now, we just need to do the actual splitting.
# We need to eventually add the first part of the string and the eventual
# last part.
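A quick way to see the longest-match behaviour described in these comments (a sketch; Trie is an internal helper, imported here directly from transformers.tokenization_utils):

.. code-block::

    from transformers.tokenization_utils import Trie

    trie = Trie()
    trie.add("extra_id_1")
    trie.add("extra_id_100")

    # The lookahead prefers the longest added token, so "extra_id_100" wins
    # over its prefix "extra_id_1".
    print(trie.split("This is a extra_id_100"))  # ['This is a ', 'extra_id_100']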
@ -2223,8 +2223,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
elif padding is not False:
if padding is True:
if verbose:
if max_length is not None:
warnings.warn("`max_length` is ignored when `padding`=`True`.")
if max_length is not None and (truncation is False or truncation == "do_not_truncate"):
warnings.warn(
"`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
"To pad to max length, use `padding='max_length'`."
)
if old_pad_to_max_length is not False:
warnings.warn("Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.")
padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch
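The refined warning captures the rule: with padding=True, sequences are padded to the longest one in the batch, and max_length only matters if a truncation strategy is set. A minimal sketch (the checkpoint is only illustrative):

.. code-block::

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Pads to the longest sequence in the batch; max_length would be ignored here
    # unless truncation is also enabled.
    batch = tok(["a short sentence", "another one"], padding=True)

    # To pad every sequence to a fixed length, be explicit:
    batch = tok(["a short sentence", "another one"], padding="max_length", max_length=32)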
@ -1238,7 +1238,11 @@ class Trainer:
self.callback_handler.lr_scheduler = self.lr_scheduler
self.callback_handler.train_dataloader = train_dataloader
self.state.trial_name = self.hp_name(trial) if self.hp_name is not None else None
self.state.trial_params = hp_params(trial.assignments) if trial is not None else None
if trial is not None:
assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial
self.state.trial_params = hp_params(assignments)
else:
self.state.trial_params = None
# This should be the same if the state has been saved but in case the training arguments changed, it's safer
# to set this after the load.
self.state.max_steps = max_steps

@ -1311,7 +1315,11 @@ class Trainer:
else:
tr_loss_step = self.training_step(model, inputs)

if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)):
if (
args.logging_nan_inf_filter
and not is_torch_tpu_available()
and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
):
# if loss is nan or inf simply add the average of previous logged losses
tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
else:
@ -152,6 +152,8 @@ def nested_xla_mesh_reduce(tensors, name):

if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_xla_mesh_reduce(t, f"{name}_{i}") for i, t in enumerate(tensors))
if tensors.ndim == 0:
tensors = tensors[None]
return xm.mesh_reduce(name, tensors, torch.cat)
else:
raise ImportError("Torch xla must be installed to use `nested_xla_mesh_reduce`")

@ -772,6 +774,13 @@ class IterableDatasetShard(IterableDataset):
for i in process_slice:
yield current_batch[i]

def __len__(self):
# Will raise an error if the underlying dataset is not sized.
if self.drop_last:
return (len(self.dataset) // (self.batch_size * self.num_processes)) * self.batch_size
else:
return math.ceil(len(self.dataset) / (self.batch_size * self.num_processes)) * self.batch_size

# In order to keep `trainer.py` compact and easy to understand, place any secondary PT Trainer
# helper methods here
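A worked instance of the new __len__ formula, using the numbers from the test added further down (100 examples, batch_size=4, num_processes=2); this is plain arithmetic, not library code:

.. code-block::

    import math

    dataset_len, batch_size, num_processes = 100, 4, 2

    # drop_last=True: only complete global batches are kept -> 96 examples overall, 48 per shard
    per_shard_drop_last = (dataset_len // (batch_size * num_processes)) * batch_size  # 48

    # drop_last=False: the last partial global batch is completed by looping back -> 52 per shard
    per_shard_keep_last = math.ceil(dataset_len / (batch_size * num_processes)) * batch_size  # 52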
@ -1462,6 +1462,15 @@ class DPRPretrainedContextEncoder:
requires_backends(self, ["torch"])

class DPRPreTrainedModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])

@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])

class DPRPretrainedQuestionEncoder:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@ -14,6 +14,7 @@
# limitations under the License.

import tempfile
import unittest

from transformers import DPRConfig, is_torch_available

@ -213,6 +214,19 @@ class DPRModelTest(ModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reader(*config_and_inputs)

def test_init_changed_config(self):
config = self.model_tester.prepare_config_and_inputs()[0]

model = DPRQuestionEncoder(config=config)
model.to(torch_device)
model.eval()

with tempfile.TemporaryDirectory() as tmp_dirname:
model.save_pretrained(tmp_dirname)
model = DPRQuestionEncoder.from_pretrained(tmp_dirname, projection_dim=512)

self.assertIsNotNone(model)

@slow
def test_model_from_pretrained(self):
for model_name in DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
@ -162,6 +162,11 @@ class LEDModelTester:
attention_window=self.attention_window,
)

def get_pipeline_config(self):
config = self.get_config()
config.max_position_embeddings = 100
return config

def prepare_config_and_inputs_for_common(self):
config, inputs_dict = self.prepare_config_and_inputs()
global_attention_mask = torch.zeros_like(inputs_dict["input_ids"])
@ -189,6 +189,7 @@ class ReformerModelTester:
def get_pipeline_config(self):
config = self.get_config()
config.vocab_size = 100
config.max_position_embeddings = 100
config.axial_pos_shape = (4, 25)
config.is_decoder = False
return config
@ -87,6 +87,12 @@ class QAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
outputs, [{"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)} for i in range(20)]
)

# Very long context require multiple features
outputs = question_answerer(
question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." * 20
)
self.assertEqual(outputs, {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)})

@require_torch
def test_small_model_pt(self):
question_answerer = pipeline(

@ -121,6 +127,73 @@ class QAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):

self.assertEqual(nested_simplify(outputs), {"score": 0.979, "start": 27, "end": 32, "answer": "Paris"})

@slow
@require_torch
def test_large_model_issue(self):
qa_pipeline = pipeline(
"question-answering",
model="mrm8488/bert-multi-cased-finetuned-xquadv1",
)
outputs = qa_pipeline(
{
"context": "Yes Bank founder Rana Kapoor has approached the Bombay High Court, challenging a special court's order from August this year that had remanded him in police custody for a week in a multi-crore loan fraud case. Kapoor, who is currently lodged in Taloja Jail, is an accused in the loan fraud case and some related matters being probed by the CBI and Enforcement Directorate. A single bench presided over by Justice S K Shinde on Tuesday posted the plea for further hearing on October 14. In his plea filed through advocate Vijay Agarwal, Kapoor claimed that the special court's order permitting the CBI's request for police custody on August 14 was illegal and in breach of the due process of law. Therefore, his police custody and subsequent judicial custody in the case were all illegal. Kapoor has urged the High Court to quash and set aside the special court's order dated August 14. As per his plea, in August this year, the CBI had moved two applications before the special court, one seeking permission to arrest Kapoor, who was already in judicial custody at the time in another case, and the other, seeking his police custody. While the special court refused to grant permission to the CBI to arrest Kapoor, it granted the central agency's plea for his custody. Kapoor, however, said in his plea that before filing an application for his arrest, the CBI had not followed the process of issuing him a notice under Section 41 of the CrPC for appearance before it. He further said that the CBI had not taken prior sanction as mandated under section 17 A of the Prevention of Corruption Act for prosecuting him. The special court, however, had said in its order at the time that as Kapoor was already in judicial custody in another case and was not a free man the procedure mandated under Section 41 of the CrPC need not have been adhered to as far as issuing a prior notice of appearance was concerned. ADVERTISING It had also said that case records showed that the investigating officer had taken an approval from a managing director of Yes Bank before beginning the proceedings against Kapoor and such a permission was a valid sanction. However, Kapoor in his plea said that the above order was bad in law and sought that it be quashed and set aside. The law mandated that if initial action was not in consonance with legal procedures, then all subsequent actions must be held as illegal, he said, urging the High Court to declare the CBI remand and custody and all subsequent proceedings including the further custody as illegal and void ab-initio. In a separate plea before the High Court, Kapoor's daughter Rakhee Kapoor-Tandon has sought exemption from in-person appearance before a special PMLA court. Rakhee has stated that she is a resident of the United Kingdom and is unable to travel to India owing to restrictions imposed due to the COVID-19 pandemic. According to the CBI, in the present case, Kapoor had obtained a gratification or pecuniary advantage of ₹ 307 crore, and thereby caused Yes Bank a loss of ₹ 1,800 crore by extending credit facilities to Avantha Group, when it was not eligible for the same",
"question": "Is this person invovled in fraud?",
}
)
self.assertEqual(
nested_simplify(outputs),
{"answer": "an accused in the loan fraud case", "end": 294, "score": 0.001, "start": 261},
)

@slow
@require_torch
def test_large_model_course(self):
question_answerer = pipeline("question-answering")
long_context = """
🤗 Transformers: State of the Art NLP

🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction,
question answering, summarization, translation, text generation and more in over 100 languages.
Its aim is to make cutting-edge NLP easier to use for everyone.

🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and
then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and
can be modified to enable quick research experiments.

Why should I use transformers?

1. Easy-to-use state-of-the-art models:
- High performance on NLU and NLG tasks.
- Low barrier to entry for educators and practitioners.
- Few user-facing abstractions with just three classes to learn.
- A unified API for using all our pretrained models.
- Lower compute costs, smaller carbon footprint:

2. Researchers can share trained models instead of always retraining.
- Practitioners can reduce compute time and production costs.
- Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages.

3. Choose the right framework for every part of a model's lifetime:
- Train state-of-the-art models in 3 lines of code.
- Move a single model between TF2.0/PyTorch frameworks at will.
- Seamlessly pick the right framework for training, evaluation and production.

4. Easily customize a model or an example to your needs:
- We provide examples for each architecture to reproduce the results published by its original authors.
- Model internals are exposed as consistently as possible.
- Model files can be used independently of the library for quick experiments.

🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"
outputs = question_answerer(question=question, context=long_context)

self.assertEqual(
nested_simplify(outputs),
{"answer": "Jax, PyTorch and TensorFlow", "end": 1919, "score": 0.971, "start": 1892},
)

@slow
@require_tf
def test_large_model_tf(self):
@ -106,3 +106,14 @@ class TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM
outputs = text_generator("This is a test", return_full_text=True)
self.assertEqual(outputs, [{"generated_text": ANY(str)}])
self.assertTrue(outputs[0]["generated_text"].startswith("This is a test"))

# Empty prompt is slighly special
# it requires BOS token to exist.
# Special case for Pegasus which will always append EOS so will
# work even without BOS.
if text_generator.tokenizer.bos_token_id is not None or "Pegasus" in tokenizer.__class__.__name__:
outputs = text_generator("")
self.assertEqual(outputs, [{"generated_text": ANY(str)}])
else:
with self.assertRaises((ValueError, AssertionError)):
outputs = text_generator("")
@ -25,7 +25,14 @@ from transformers import (
pipeline,
)
from transformers.pipelines import AggregationStrategy, TokenClassificationArgumentHandler
from transformers.testing_utils import is_pipeline_test, nested_simplify, require_tf, require_torch, slow
from transformers.testing_utils import (
is_pipeline_test,
nested_simplify,
require_tf,
require_torch,
require_torch_gpu,
slow,
)

from .test_pipelines_common import ANY, PipelineTestCaseMeta

@ -246,6 +253,19 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
],
)

@require_torch_gpu
@slow
def test_gpu(self):
sentence = "This is dummy sentence"
ner = pipeline(
"token-classification",
device=0,
aggregation_strategy=AggregationStrategy.SIMPLE,
)

output = ner(sentence)
self.assertEqual(nested_simplify(output), [])

@require_torch
@slow
def test_dbmdz_english(self):
@ -61,6 +61,24 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase, metaclass=PipelineT
)
self.assertEqual(outputs, {"sequence": ANY(str), "labels": [ANY(str)], "scores": [ANY(float)]})

# https://github.com/huggingface/transformers/issues/13846
outputs = classifier(["I am happy"], ["positive", "negative"])
self.assertEqual(
outputs,
[
{"sequence": ANY(str), "labels": [ANY(str), ANY(str)], "scores": [ANY(float), ANY(float)]}
for i in range(1)
],
)
outputs = classifier(["I am happy", "I am sad"], ["positive", "negative"])
self.assertEqual(
outputs,
[
{"sequence": ANY(str), "labels": [ANY(str), ANY(str)], "scores": [ANY(float), ANY(float)]}
for i in range(2)
],
)

with self.assertRaises(ValueError):
classifier("", candidate_labels="politics")
@ -3562,3 +3562,15 @@ class TrieTest(unittest.TestCase):
trie.add("extra_id_1")
trie.add("extra_id_100")
self.assertEqual(trie.split("[CLS] This is a extra_id_100"), ["[CLS]", " This is a ", "extra_id_100"])

def test_trie_single(self):
trie = Trie()
trie.add("A")
self.assertEqual(trie.split("ABC"), ["A", "BC"])
self.assertEqual(trie.split("BCA"), ["BC", "A"])

def test_trie_final(self):
trie = Trie()
trie.add("TOKEN]")
trie.add("[SPECIAL_TOKEN]")
self.assertEqual(trie.split("This is something [SPECIAL_TOKEN]"), ["This is something ", "[SPECIAL_TOKEN]"])
@ -355,6 +355,34 @@ class TrainerUtilsTest(unittest.TestCase):
self.check_iterable_dataset_shard(dataset, 4, drop_last=True, num_processes=3, epoch=42)
self.check_iterable_dataset_shard(dataset, 4, drop_last=False, num_processes=3, epoch=42)

def test_iterable_dataset_shard_with_length(self):
sampler_shards = [
IterableDatasetShard(list(range(100)), batch_size=4, drop_last=True, num_processes=2, process_index=i)
for i in range(2)
]

# Build expected shards: each process will have batches of size 4 until there is not enough elements to
# form two full batches (so we stop at 96 = (100 // (4 * 2)) * 4)
expected_shards = [[], []]
current_shard = 0
for i in range(0, 96, 4):
expected_shards[current_shard].extend(list(range(i, i + 4)))
current_shard = 1 - current_shard

self.assertListEqual([list(shard) for shard in sampler_shards], expected_shards)
self.assertListEqual([len(shard) for shard in sampler_shards], [len(shard) for shard in expected_shards])

sampler_shards = [
IterableDatasetShard(list(range(100)), batch_size=4, drop_last=False, num_processes=2, process_index=i)
for i in range(2)
]
# When drop_last=False, we get two last full batches by looping back to the beginning.
expected_shards[0].extend(list(range(96, 100)))
expected_shards[1].extend(list(range(0, 4)))

self.assertListEqual([list(shard) for shard in sampler_shards], expected_shards)
self.assertListEqual([len(shard) for shard in sampler_shards], [len(shard) for shard in expected_shards])

def check_shard_sampler(self, dataset, batch_size, drop_last, num_processes=2):
shards = [
ShardSampler(
@ -281,6 +281,7 @@ SPECIAL_MODULE_TO_TEST_MAP = {
"test_trainer_distributed.py",
"test_trainer_tpu.py",
],
"train_pt_utils.py": "test_trainer_utils.py",
"utils/versions.py": "test_versions_utils.py",
}