Compare commits

...

23 Commits

Author SHA1 Message Date
65659a29cf Release v4.11.3 2021-10-06 12:48:25 -04:00
3202896ec9 Fix nan-loss condition (#13911) 2021-10-06 12:46:57 -04:00
bb2caca727 Fix trainer logging_nan_inf_filter in torch_xla mode (#13896)
* Fix logging_nan_inf_filter in torch_xla mode

* Update src/transformers/trainer.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Fix format

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-10-06 12:46:50 -04:00
1c11636862 Fix hp search for non sigopt backends (#13897) 2021-10-06 12:46:29 -04:00
e9fc92d7bb Fixing Backward compatibility for zero-shot (#13855)
Fixes #13846
2021-10-06 12:46:22 -04:00
6f06c58e7d Fixing GPU for token-classification in a better way. (#13856)
Co-authored-by: Pierre Snell <pierre.snell@botpress.com>
2021-10-06 12:46:15 -04:00
3f4eaf0692 Fixing question-answering with long contexts (#13873)
* Tmp.

* Fixing BC for question answering with long context.

* Capping model_max_length to avoid tf overflow.

* Bad workaround for bugged RoBERTa.

* Fixing name.
2021-10-06 12:46:09 -04:00
a8186b0128 Fixing empty prompts for text-generation when BOS exists. (#13859)
* Fixing empty prompts for text-generation when BOS exists.

* Fixing odd case with Pegasus.

* Fixing BERT's AssertionError.
2021-10-06 12:45:46 -04:00
c2901b093b Fixing 1-length special tokens cut. (#13862) 2021-10-06 12:45:09 -04:00
d7db364abc [docs/gpt-j] fix typo (#13851) 2021-10-06 12:44:51 -04:00
3627a4b7a6 include megatron_gpt2 in installed modules (#13834) 2021-10-06 12:44:39 -04:00
e6c5752865 Bart: check if decoder_inputs_embeds is set (#13800)
In BartForConditionalGeneration.forward, if labels are provided,
   decoder_input_ids are set to the labels shifted to the right.
   This is problematic: if decoder_inputs_embeds is also set,
   the call to self.model, which eventually gets to BartDecoder.forward,
   will raise an error.
   The fix is quite simple, similar to what is there already in
   BartModel.forward. Mainly, we should not
   compute decoder_input_ids if decoder_inputs_embeds is provided.

Co-authored-by: Silviu Vlad Oprea <silviuvo@amazon.co.uk>
2021-10-06 12:44:30 -04:00
3642d13b1b Fix warning situation: UserWarning: "max_length is ignored when padding=True" (#13829)
* Removed wrong warning

* Raise a warning when `max_length` is given with wrong `truncation`

* Update the error message

* Update the warning message

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-10-06 12:44:08 -04:00
66c81b2fc1 [DPR] Correct init (#13796)
* update

* add to docs and init

* make fix-copies
2021-10-06 12:43:56 -04:00
218b58aff8 [docs/gpt-j] add instructions for how to minimize CPU RAM usage (#13795)
* add a note about tokenizer

* add tips to load the model in less RAM

* fix link

* fix more links
2021-10-06 12:43:23 -04:00
7655f11076 Release v4.11.2 2021-09-30 11:54:39 -04:00
6b87918441 Fix gather for TPU (#13813) 2021-09-30 11:53:13 -04:00
54f9d62c61 Release v4.11.1 2021-09-29 12:04:25 -04:00
22d3156881 Fix length of IterableDatasetShard and add test (#13792)
* Fix length of IterableDatasetShard and add test

* Add comments
2021-09-29 12:03:56 -04:00
9bb3d33a46 Implement len in IterableDatasetShard (#13780) 2021-09-29 12:03:51 -04:00
a05400e020 Fix warning for gradient_checkpointing (#13767) 2021-09-29 12:03:46 -04:00
10083244a3 Fix LayoutLM ONNX test error (#13710)
Fix LayoutLM ONNX test error
2021-09-29 12:03:43 -04:00
11144a3048 up (#13777) 2021-09-29 12:03:40 -04:00
35 changed files with 480 additions and 179 deletions

View File

@ -27,7 +27,10 @@ author = "huggingface"
# The short X.Y version
version = ""
# The full version, including alpha/beta/rc tags
release = "4.11.0"
release = "4.11.3"

View File

@ -41,6 +41,13 @@ DPRConfig
:members:
DPRPreTrainedModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DPRPreTrainedModel
:members:
DPRContextEncoderTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -24,19 +24,33 @@ This model was contributed by `Stella Biderman <https://huggingface.co/stellaath
Tips:
- Running [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) in float32 precision on GPU requires at least 24 GB of
RAM. On GPUs with less than 24 GB RAM, one should therefore load the model in half-precision:
- To load `GPT-J <https://huggingface.co/EleutherAI/gpt-j-6B>`__ in float32 one would need at least 2x model size CPU
RAM: 1x for initial weights and another 1x to load the checkpoint. So for GPT-J it would take at least 48GB of CPU
RAM to just load the model. To reduce the CPU RAM usage there are a few options. The ``torch_dtype`` argument can be
used to initialize the model in half-precision. And the ``low_cpu_mem_usage`` argument can be used to keep the RAM
usage to 1x. There is also a `fp16 branch <https://huggingface.co/EleutherAI/gpt-j-6B/tree/float16>`__ which stores
the fp16 weights, which could be used to further minimize the RAM usage. Combining all this it should take roughly
12.1GB of CPU RAM to load the model.
.. code-block::
>>> from transformers import GPTJForCausalLM
>>> import torch
>>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
>>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True)
- The model should fit on 16GB GPU for inference. For training/fine-tuning it would take much more GPU RAM. Adam
optimizer for example makes four copies of the model: model, gradients, average and squared average of the gradients.
So it would need at least 4x model size GPU memory, even with mixed precision as gradient updates are in fp32. This
is not including the activations and data batches, which would again require some more GPU RAM. So one should explore
solutions such as DeepSpeed, to train/fine-tune the model. Another option is to use the original codebase to
train/fine-tune the model on TPU and then convert the model to Transformers format for inference. Instructions for
that could be found `here <https://github.com/kingoflolz/mesh-transformer-jax/blob/master/howto_finetune.md>`__
- Although the embedding matrix has a size of 50400, only 50257 entries are used by the GPT-2 tokenizer. These extra
tokens are added for the sake of efficiency on TPUs. To avoid the mis-match between embedding matrix size and vocab
size, the tokenizer for [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) contains 143 extra tokens
size, the tokenizer for `GPT-J <https://huggingface.co/EleutherAI/gpt-j-6B>`__ contains 143 extra tokens
``<|extratoken_1|>... <|extratoken_143|>``, so the ``vocab_size`` of tokenizer also becomes 50400.
Generation
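A minimal, self-contained sketch of the loading tips above, built around the `from_pretrained` call shown in this diff; the tokenizer call and the prompt are illustrative assumptions, not part of the change:

import torch
from transformers import AutoTokenizer, GPTJForCausalLM

# Load the fp16 weights from the "float16" branch and keep peak CPU RAM near 1x model size.
model = GPTJForCausalLM.from_pretrained(
    "EleutherAI/gpt-j-6B",
    revision="float16",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")  # assumed standard tokenizer checkpoint

inputs = tokenizer("The Manhattan bridge is", return_tensors="pt")
outputs = model.generate(**inputs, max_length=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))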

View File

@ -42,6 +42,7 @@ Ready-made configurations include the following models:
- BERT
- DistilBERT
- GPT-2
- LayoutLM
- RoBERTa
- T5
- XLM-RoBERTa

View File

@ -344,7 +344,7 @@ install_requires = [
setup(
name="transformers",
version="4.11.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.11.3", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Suraj Patil, Stas Bekman, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
author_email="thomas@huggingface.co",
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",

View File

@ -22,7 +22,7 @@
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).
__version__ = "4.11.0"
__version__ = "4.11.3"
# Work around to update TensorFlow's absl.logging threshold which alters the
# default Python logging output behavior when present.
@ -773,6 +773,7 @@ if is_torch_available():
"DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST",
"DPRContextEncoder",
"DPRPretrainedContextEncoder",
"DPRPreTrainedModel",
"DPRPretrainedQuestionEncoder",
"DPRPretrainedReader",
"DPRQuestionEncoder",
@ -2512,6 +2513,7 @@ if TYPE_CHECKING:
DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST,
DPRContextEncoder,
DPRPretrainedContextEncoder,
DPRPreTrainedModel,
DPRPretrainedQuestionEncoder,
DPRPretrainedReader,
DPRQuestionEncoder,

View File

@ -332,7 +332,7 @@ class PretrainedConfig(PushToHubMixin):
self.transformers_version = kwargs.pop("transformers_version", None)
# Deal with gradient checkpointing
if kwargs.get("gradient_checkpointing", True):
if kwargs.get("gradient_checkpointing", False):
warnings.warn(
"Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
"Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the "

View File

@ -1291,7 +1291,7 @@ class BartForConditionalGeneration(BartPretrainedModel):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
if decoder_input_ids is None:
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(
labels, self.config.pad_token_id, self.config.decoder_start_token_id
)
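A hedged sketch of the call pattern this guard unblocks: passing `labels` together with `decoder_inputs_embeds` no longer tries to build `decoder_input_ids` from the labels. Checkpoint name and inputs are assumptions for illustration:

import torch
from transformers import BartForConditionalGeneration, BartTokenizer

model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

inputs = tokenizer("UN Chief says there is no military solution", return_tensors="pt")
labels = tokenizer("UN Chief says there is no solution", return_tensors="pt").input_ids

# The caller supplies decoder embeddings directly instead of decoder_input_ids.
decoder_inputs_embeds = model.get_input_embeddings()(labels)

# Before the fix this raised inside BartDecoder because decoder_input_ids was also set.
outputs = model(**inputs, decoder_inputs_embeds=decoder_inputs_embeds, labels=labels)
print(outputs.loss)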

View File

@ -2501,7 +2501,7 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
if decoder_input_ids is None:
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(
labels, self.config.pad_token_id, self.config.decoder_start_token_id
)

View File

@ -46,6 +46,7 @@ if is_torch_available():
"DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST",
"DPRContextEncoder",
"DPRPretrainedContextEncoder",
"DPRPreTrainedModel",
"DPRPretrainedQuestionEncoder",
"DPRPretrainedReader",
"DPRQuestionEncoder",
@ -89,6 +90,7 @@ if TYPE_CHECKING:
DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST,
DPRContextEncoder,
DPRPretrainedContextEncoder,
DPRPreTrainedModel,
DPRPretrainedQuestionEncoder,
DPRPretrainedReader,
DPRQuestionEncoder,

View File

@ -147,7 +147,29 @@ class DPRReaderOutput(ModelOutput):
attentions: Optional[Tuple[torch.FloatTensor]] = None
class DPREncoder(PreTrainedModel):
class DPRPreTrainedModel(PreTrainedModel):
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, BertEncoder):
module.gradient_checkpointing = value
class DPREncoder(DPRPreTrainedModel):
base_model_prefix = "bert_model"
@ -200,13 +222,8 @@ class DPREncoder(PreTrainedModel):
return self.encode_proj.out_features
return self.bert_model.config.hidden_size
def init_weights(self):
self.bert_model.init_weights()
if self.projection_dim > 0:
self.encode_proj.apply(self.bert_model._init_weights)
class DPRSpanPredictor(PreTrainedModel):
class DPRSpanPredictor(DPRPreTrainedModel):
base_model_prefix = "encoder"
@ -262,16 +279,13 @@ class DPRSpanPredictor(PreTrainedModel):
attentions=outputs.attentions,
)
def init_weights(self):
self.encoder.init_weights()
##################
# PreTrainedModel
##################
class DPRPretrainedContextEncoder(PreTrainedModel):
class DPRPretrainedContextEncoder(DPRPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
@ -282,11 +296,8 @@ class DPRPretrainedContextEncoder(PreTrainedModel):
base_model_prefix = "ctx_encoder"
_keys_to_ignore_on_load_missing = [r"position_ids"]
def init_weights(self):
self.ctx_encoder.init_weights()
class DPRPretrainedQuestionEncoder(PreTrainedModel):
class DPRPretrainedQuestionEncoder(DPRPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
@ -297,15 +308,8 @@ class DPRPretrainedQuestionEncoder(PreTrainedModel):
base_model_prefix = "question_encoder"
_keys_to_ignore_on_load_missing = [r"position_ids"]
def init_weights(self):
self.question_encoder.init_weights()
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, BertEncoder):
module.gradient_checkpointing = value
class DPRPretrainedReader(PreTrainedModel):
class DPRPretrainedReader(DPRPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
@ -316,15 +320,6 @@ class DPRPretrainedReader(PreTrainedModel):
base_model_prefix = "span_predictor"
_keys_to_ignore_on_load_missing = [r"position_ids"]
def init_weights(self):
self.span_predictor.encoder.init_weights()
self.span_predictor.qa_classifier.apply(self.span_predictor.encoder.bert_model._init_weights)
self.span_predictor.qa_outputs.apply(self.span_predictor.encoder.bert_model._init_weights)
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, BertEncoder):
module.gradient_checkpointing = value
###############
# Actual Models

View File

@ -964,6 +964,14 @@ class HubertForCTC(HubertPreTrainedModel):
self.hubert = HubertModel(config)
self.dropout = nn.Dropout(config.final_dropout)
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
"or define `vocab_size` of your model's configuration."
)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
self.init_weights()
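A minimal sketch of the instantiation path the new check guards, following the remedy suggested in the error message itself; the checkpoint and vocabulary size are assumptions, and the same pattern applies to `Wav2Vec2ForCTC` below:

from transformers import HubertForCTC

# A pre-training-only checkpoint has no CTC head, so the head's output size must be supplied
# explicitly -- typically len(tokenizer) of the CTC tokenizer you plan to use.
model = HubertForCTC.from_pretrained("facebook/hubert-base-ls960", vocab_size=32)
print(model.lm_head.out_features)  # 32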

View File

@ -183,11 +183,6 @@ class LayoutLMOnnxConfig(OnnxConfig):
raise ValueError("Cannot generate dummy inputs without PyTorch installed.")
import torch
input_dict["bbox"] = torch.tensor(
[
[0] * 4,
*[box] * seq_length,
[self.max_2d_positions] * 4,
]
).tile(batch_size, 1, 1)
batch_size, seq_length = input_dict["input_ids"].shape
input_dict["bbox"] = torch.tensor([*[box] * seq_length]).tile(batch_size, 1, 1)
return input_dict

View File

@ -0,0 +1,17 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -1416,6 +1416,14 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
self.wav2vec2 = Wav2Vec2Model(config)
self.dropout = nn.Dropout(config.final_dropout)
if config.vocab_size is None:
raise ValueError(
f"You are trying to instantiate {self.__class__} with a configuration that "
"does not define the vocabulary size of the language model head. Please "
"instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`."
"or define `vocab_size` of your model's configuration."
)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
self.init_weights()

View File

@ -791,7 +791,7 @@ class Pipeline(_ScikitCompat):
elif isinstance(inputs, tuple):
return tuple([self._ensure_tensor_on_device(item, device) for item in inputs])
elif isinstance(inputs, torch.Tensor):
return inputs.to(self.device)
return inputs.to(device)
else:
return inputs

View File

@ -248,7 +248,13 @@ class QuestionAnsweringPipeline(Pipeline):
return super().__call__(examples[0], **kwargs)
return super().__call__(examples, **kwargs)
def preprocess(self, example, padding="do_not_pad", doc_stride=128, max_question_len=64, max_seq_len=384):
def preprocess(self, example, padding="do_not_pad", doc_stride=None, max_question_len=64, max_seq_len=None):
if max_seq_len is None:
max_seq_len = min(self.tokenizer.model_max_length, 384)
if doc_stride is None:
doc_stride = min(max_seq_len // 4, 128)
if not self.tokenizer.is_fast:
features = squad_convert_examples_to_features(
examples=[example],
@ -277,7 +283,6 @@ class QuestionAnsweringPipeline(Pipeline):
return_offsets_mapping=True,
return_special_tokens_mask=True,
)
# When the input is too long, it's converted in a batch of inputs with overflowing tokens
# and a stride of overlap between the inputs. If a batch of inputs is given, a special output
# "overflow_to_sample_mapping" indicate which member of the encoded batch belong to which original batch sample.
@ -308,12 +313,15 @@ class QuestionAnsweringPipeline(Pipeline):
token_type_ids_span_idx = (
encoded_inputs["token_type_ids"][span_idx] if "token_type_ids" in encoded_inputs else None
)
submask = p_mask[span_idx]
if isinstance(submask, np.ndarray):
submask = submask.tolist()
features.append(
SquadFeatures(
input_ids=input_ids_span_idx,
attention_mask=attention_mask_span_idx,
token_type_ids=token_type_ids_span_idx,
p_mask=p_mask[span_idx].tolist(),
p_mask=submask,
encoding=encoded_inputs[span_idx],
# We don't use the rest of the values - and actually
# for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample
@ -330,26 +338,41 @@ class QuestionAnsweringPipeline(Pipeline):
qas_id=None,
)
)
return {"features": features, "example": example}
split_features = []
for feature in features:
fw_args = {}
others = {}
model_input_names = self.tokenizer.model_input_names
for k, v in feature.__dict__.items():
if k in model_input_names:
if self.framework == "tf":
tensor = tf.constant(v)
if tensor.dtype == tf.int64:
tensor = tf.cast(tensor, tf.int32)
fw_args[k] = tf.expand_dims(tensor, 0)
elif self.framework == "pt":
tensor = torch.tensor(v)
if tensor.dtype == torch.int32:
tensor = tensor.long()
fw_args[k] = tensor.unsqueeze(0)
else:
others[k] = v
split_features.append({"fw_args": fw_args, "others": others})
return {"features": split_features, "example": example}
def _forward(self, model_inputs):
features = model_inputs["features"]
example = model_inputs["example"]
model_input_names = self.tokenizer.model_input_names
fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}
if self.framework == "tf":
fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
start, end = self.model(fw_args)[:2]
start, end = start.numpy(), end.numpy()
elif self.framework == "pt":
# Retrieve the score for the context tokens only (removing question tokens)
fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
# On Windows, the default int type in numpy is np.int32 so we get some non-long tensors.
fw_args = {k: v.long() if v.dtype == torch.int32 else v for (k, v) in fw_args.items()}
starts = []
ends = []
for feature in features:
fw_args = feature["fw_args"]
start, end = self.model(**fw_args)[:2]
start, end = start.cpu().numpy(), end.cpu().numpy()
return {"start": start, "end": end, "features": features, "example": example}
starts.append(start)
ends.append(end)
return {"starts": starts, "ends": ends, "features": features, "example": example}
def postprocess(
self,
@ -360,90 +383,89 @@ class QuestionAnsweringPipeline(Pipeline):
):
min_null_score = 1000000 # large and positive
answers = []
start_ = model_outputs["start"][0]
end_ = model_outputs["end"][0]
feature = model_outputs["features"][0]
example = model_outputs["example"]
# Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
undesired_tokens = np.abs(np.array(feature.p_mask) - 1)
for i, (feature_, start_, end_) in enumerate(
zip(model_outputs["features"], model_outputs["starts"], model_outputs["ends"])
):
feature = feature_["others"]
# Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
undesired_tokens = np.abs(np.array(feature["p_mask"]) - 1)
if feature.attention_mask is not None:
undesired_tokens = undesired_tokens & feature.attention_mask
if feature_["fw_args"].get("attention_mask", None) is not None:
undesired_tokens = undesired_tokens & feature_["fw_args"]["attention_mask"].numpy()
# Generate mask
undesired_tokens_mask = undesired_tokens == 0.0
# Generate mask
undesired_tokens_mask = undesired_tokens == 0.0
# Make sure non-context indexes in the tensor cannot contribute to the softmax
start_ = np.where(undesired_tokens_mask, -10000.0, start_)
end_ = np.where(undesired_tokens_mask, -10000.0, end_)
# Make sure non-context indexes in the tensor cannot contribute to the softmax
start_ = np.where(undesired_tokens_mask, -10000.0, start_)
end_ = np.where(undesired_tokens_mask, -10000.0, end_)
# Normalize logits and spans to retrieve the answer
start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))
# Normalize logits and spans to retrieve the answer
start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))
if handle_impossible_answer:
min_null_score = min(min_null_score, (start_[0] * end_[0]).item())
if handle_impossible_answer:
min_null_score = min(min_null_score, (start_[0] * end_[0]).item())
# Mask CLS
start_[0] = end_[0] = 0.0
# Mask CLS
start_[0, 0] = end_[0, 0] = 0.0
starts, ends, scores = self.decode(start_, end_, top_k, max_answer_len, undesired_tokens)
if not self.tokenizer.is_fast:
char_to_word = np.array(example.char_to_word_offset)
starts, ends, scores = self.decode(start_, end_, top_k, max_answer_len, undesired_tokens)
if not self.tokenizer.is_fast:
char_to_word = np.array(example.char_to_word_offset)
# Convert the answer (tokens) back to the original text
# Score: score from the model
# Start: Index of the first character of the answer in the context string
# End: Index of the character following the last character of the answer in the context string
# Answer: Plain text of the answer
for s, e, score in zip(starts, ends, scores):
answers.append(
{
"score": score.item(),
"start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
"end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
"answer": " ".join(
example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
),
}
)
else:
# Convert the answer (tokens) back to the original text
# Score: score from the model
# Start: Index of the first character of the answer in the context string
# End: Index of the character following the last character of the answer in the context string
# Answer: Plain text of the answer
question_first = bool(self.tokenizer.padding_side == "right")
enc = feature.encoding
# Convert the answer (tokens) back to the original text
# Score: score from the model
# Start: Index of the first character of the answer in the context string
# End: Index of the character following the last character of the answer in the context string
# Answer: Plain text of the answer
for s, e, score in zip(starts, ends, scores):
token_to_orig_map = feature["token_to_orig_map"]
answers.append(
{
"score": score.item(),
"start": np.where(char_to_word == token_to_orig_map[s])[0][0].item(),
"end": np.where(char_to_word == token_to_orig_map[e])[0][-1].item(),
"answer": " ".join(example.doc_tokens[token_to_orig_map[s] : token_to_orig_map[e] + 1]),
}
)
else:
# Convert the answer (tokens) back to the original text
# Score: score from the model
# Start: Index of the first character of the answer in the context string
# End: Index of the character following the last character of the answer in the context string
# Answer: Plain text of the answer
question_first = bool(self.tokenizer.padding_side == "right")
enc = feature["encoding"]
# Sometimes the max probability token is in the middle of a word so:
# - we start by finding the right word containing the token with `token_to_word`
# - then we convert this word in a character span with `word_to_chars`
sequence_index = 1 if question_first else 0
for s, e, score in zip(starts, ends, scores):
try:
start_word = enc.token_to_word(s)
end_word = enc.token_to_word(e)
start_index = enc.word_to_chars(start_word, sequence_index=sequence_index)[0]
end_index = enc.word_to_chars(end_word, sequence_index=sequence_index)[1]
except Exception:
# Some tokenizers don't really handle words. Keep to offsets then.
start_index = enc.offsets[s][0]
end_index = enc.offsets[e][1]
# Sometimes the max probability token is in the middle of a word so:
# - we start by finding the right word containing the token with `token_to_word`
# - then we convert this word in a character span with `word_to_chars`
sequence_index = 1 if question_first else 0
for s, e, score in zip(starts, ends, scores):
try:
start_word = enc.token_to_word(s)
end_word = enc.token_to_word(e)
start_index = enc.word_to_chars(start_word, sequence_index=sequence_index)[0]
end_index = enc.word_to_chars(end_word, sequence_index=sequence_index)[1]
except Exception:
# Some tokenizers don't really handle words. Keep to offsets then.
start_index = enc.offsets[s][0]
end_index = enc.offsets[e][1]
answers.append(
{
"score": score.item(),
"start": start_index,
"end": end_index,
"answer": example.context_text[start_index:end_index],
}
)
answers.append(
{
"score": score.item(),
"start": start_index,
"end": end_index,
"answer": example.context_text[start_index:end_index],
}
)
if handle_impossible_answer:
answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""})
answers = sorted(answers, key=lambda x: x["score"], reverse=True)[:top_k]
answers = sorted(answers, key=lambda x: x["score"], reverse=True)[:top_k]
if len(answers) == 1:
return answers[0]
return answers
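To exercise the multi-feature path added above, a small usage sketch mirroring the new pipeline test; the model is assumed to be the default QA checkpoint, and the repeated context is only a way to force overflowing features:

from transformers import pipeline

qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# A long context is split into several overlapping features (doc_stride); each feature is
# forwarded separately and postprocess picks the best span across all of them.
long_context = "HuggingFace was founded in Paris. " * 200
result = qa(question="Where was HuggingFace founded?", context=long_context)
print(result)  # {'score': ..., 'start': ..., 'end': ..., 'answer': 'Paris'}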

View File

@ -158,6 +158,9 @@ class TextGenerationPipeline(Pipeline):
def _forward(self, model_inputs, **generate_kwargs):
input_ids = model_inputs["input_ids"]
# Allow empty prompts
if input_ids.shape[1] == 0:
input_ids = None
prompt_text = model_inputs.pop("prompt_text")
generated_sequence = self.model.generate(input_ids=input_ids, **generate_kwargs) # BS x SL
return {"generated_sequence": generated_sequence, "input_ids": input_ids, "prompt_text": prompt_text}

View File

@ -204,9 +204,10 @@ class TokenClassificationPipeline(Pipeline):
offset_mapping = model_inputs.pop("offset_mapping", None)
sentence = model_inputs.pop("sentence")
if self.framework == "tf":
outputs = self.model(model_inputs.data)[0][0].numpy()
outputs = self.model(model_inputs.data)[0][0]
else:
outputs = self.model(**model_inputs)[0][0].numpy()
outputs = self.model(**model_inputs)[0][0]
return {
"outputs": outputs,
"special_tokens_mask": special_tokens_mask,
@ -216,7 +217,7 @@ class TokenClassificationPipeline(Pipeline):
}
def postprocess(self, model_outputs, aggregation_strategy=AggregationStrategy.NONE):
outputs = model_outputs["outputs"]
outputs = model_outputs["outputs"].numpy()
sentence = model_outputs["sentence"]
input_ids = model_outputs["input_ids"][0]
offset_mapping = model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None

View File

@ -191,10 +191,7 @@ class ZeroShotClassificationPipeline(Pipeline):
else:
raise ValueError(f"Unable to understand extra arguments {args}")
result = super().__call__(sequences, **kwargs)
if len(result) == 1:
return result[0]
return result
return super().__call__(sequences, **kwargs)
def preprocess(self, inputs, candidate_labels=None, hypothesis_template="This example is {}."):
sequence_pairs, sequences = self._args_parser(inputs, candidate_labels, hypothesis_template)
@ -264,4 +261,6 @@ class ZeroShotClassificationPipeline(Pipeline):
"scores": scores[iseq, top_inds].tolist(),
}
)
if len(result) == 1:
return result[0]
return result
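A quick sketch of the restored behaviour from #13846: a list input comes back as a list, even with a single sequence. The checkpoint is assumed to be the pipeline's default NLI model:

from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# A bare string still returns a single dict ...
print(classifier("I am happy", candidate_labels=["positive", "negative"]))
# ... while a one-element list returns a one-element list, as before the regression.
print(classifier(["I am happy"], candidate_labels=["positive", "negative"]))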

View File

@ -20,6 +20,7 @@ import bisect
import itertools
import re
import unicodedata
from collections import OrderedDict
from typing import Any, Dict, List, Optional, Tuple, Union, overload
from .file_utils import PaddingStrategy, TensorType, add_end_docstrings
@ -102,7 +103,6 @@ class Trie:
>>> trie.split("[CLS] This is a extra_id_100")
["[CLS]", " This is a ", "extra_id_100"]
"""
# indexes are counted left of the chars index.
# "hello", index 0, is left of h, index 1 is between h and e.
# index 5 is right of the "o".
@ -115,7 +115,7 @@ class Trie:
# If the trie contains, "blowing", and "lower" and we encounter the
# string "blower", we need to split into ["b", "lower"].
# This is where we need to keep track of multiple possible starts.
states = {}
states = OrderedDict()
# This will contain every indices where we need
# to cut.
@ -144,36 +144,36 @@ class Trie:
# In this case, we already have partial matches (But unfinished)
for start, trie_pointer in states.items():
if current_char in trie_pointer:
if "" in trie_pointer:
# This is a final match, we need to reset and
# store the results in `offsets`.
# Lookahead to match longest first
# Important in case of extra_id_1 vs extra_id_100
lookahead_index = current
end = current
next_char = text[lookahead_index] if lookahead_index < len(text) else None
while next_char in trie_pointer:
trie_pointer = trie_pointer[next_char]
lookahead_index += 1
if "" in trie_pointer:
end = lookahead_index
skip = lookahead_index
if lookahead_index == len(text):
# End of string
break
next_char = text[lookahead_index]
# End lookahead
# Storing and resetting
offsets.append(start)
offsets.append(end)
reset = True
elif current_char in trie_pointer:
# The current character being looked at has a match within the trie
# update the pointer (it will be stored back into states later).
trie_pointer = trie_pointer[current_char]
if "" in trie_pointer:
# This is a final match, we need to reset and
# store the results in `offsets`.
# Lookahead to match longest first
# Important in case of extra_id_1 vs extra_id_100
lookahead_index = current + 1
end = current + 1
next_char = text[lookahead_index] if lookahead_index < len(text) else None
while next_char in trie_pointer:
trie_pointer = trie_pointer[next_char]
lookahead_index += 1
if "" in trie_pointer:
end = lookahead_index
skip = lookahead_index
if lookahead_index == len(text):
# End of string
break
next_char = text[lookahead_index]
# End lookahead
# Storing and resetting
offsets.append(start)
offsets.append(end)
reset = True
# Storing back the new pointer into the states.
# Partial matches got longer by one.
@ -198,6 +198,18 @@ class Trie:
if current_char in self.data:
states[current] = self.data[current_char]
# We have a cut at the end with states.
for start, trie_pointer in states.items():
if "" in trie_pointer:
# This is a final match, we need to reset and
# store the results in `offsets`.
end = len(text)
offsets.append(start)
offsets.append(end)
# Longest cut is always the one with lower start so the first
# item so we need to break.
break
# We have all the offsets now, we just need to do the actual splitting.
# We need to eventually add the first part of the string and the eventual
# last part.
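A small sketch of the splitting behaviour, based on the docstring above and the new tests at the bottom of this diff; the import path is assumed from the file being modified (`tokenization_utils.py`):

from transformers.tokenization_utils import Trie

trie = Trie()
trie.add("[CLS]")
trie.add("extra_id_1")
trie.add("extra_id_100")
# Longest match wins: "extra_id_100" is preferred over "extra_id_1".
print(trie.split("[CLS] This is a extra_id_100"))  # ["[CLS]", " This is a ", "extra_id_100"]

# The end-of-string handling added above lets a token that finishes exactly at the end match too.
final = Trie()
final.add("[SPECIAL_TOKEN]")
print(final.split("This is something [SPECIAL_TOKEN]"))  # ["This is something ", "[SPECIAL_TOKEN]"]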

View File

@ -2223,8 +2223,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
elif padding is not False:
if padding is True:
if verbose:
if max_length is not None:
warnings.warn("`max_length` is ignored when `padding`=`True`.")
if max_length is not None and (truncation is False or truncation == "do_not_truncate"):
warnings.warn(
"`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
"To pad to max length, use `padding='max_length'`."
)
if old_pad_to_max_length is not False:
warnings.warn("Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.")
padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch
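To illustrate when the reworded warning fires (the tokenizer checkpoint is an assumption):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
texts = ["a short one", "a noticeably longer example sentence"]

# padding=True pads to the longest sequence in the batch; without a truncation strategy,
# max_length has no effect, which is what the new warning message explains.
batch = tokenizer(texts, padding=True, max_length=8)

# To actually pad (and cut) everything to a fixed length, be explicit:
batch = tokenizer(texts, padding="max_length", truncation=True, max_length=8)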

View File

@ -1238,7 +1238,11 @@ class Trainer:
self.callback_handler.lr_scheduler = self.lr_scheduler
self.callback_handler.train_dataloader = train_dataloader
self.state.trial_name = self.hp_name(trial) if self.hp_name is not None else None
self.state.trial_params = hp_params(trial.assignments) if trial is not None else None
if trial is not None:
assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial
self.state.trial_params = hp_params(assignments)
else:
self.state.trial_params = None
# This should be the same if the state has been saved but in case the training arguments changed, it's safer
# to set this after the load.
self.state.max_steps = max_steps
@ -1311,7 +1315,11 @@ class Trainer:
else:
tr_loss_step = self.training_step(model, inputs)
if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)):
if (
args.logging_nan_inf_filter
and not is_torch_tpu_available()
and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
):
# if loss is nan or inf simply add the average of previous logged losses
tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
else:

View File

@ -152,6 +152,8 @@ def nested_xla_mesh_reduce(tensors, name):
if isinstance(tensors, (list, tuple)):
return type(tensors)(nested_xla_mesh_reduce(t, f"{name}_{i}") for i, t in enumerate(tensors))
if tensors.ndim == 0:
tensors = tensors[None]
return xm.mesh_reduce(name, tensors, torch.cat)
else:
raise ImportError("Torch xla must be installed to use `nested_xla_mesh_reduce`")
@ -772,6 +774,13 @@ class IterableDatasetShard(IterableDataset):
for i in process_slice:
yield current_batch[i]
def __len__(self):
# Will raise an error if the underlying dataset is not sized.
if self.drop_last:
return (len(self.dataset) // (self.batch_size * self.num_processes)) * self.batch_size
else:
return math.ceil(len(self.dataset) / (self.batch_size * self.num_processes)) * self.batch_size
# In order to keep `trainer.py` compact and easy to understand, place any secondary PT Trainer
# helper methods here
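A quick numeric check of the `__len__` formulas above, matching the new `test_iterable_dataset_shard_with_length` test further down (100 examples, batch size 4, 2 processes):

import math

dataset_len, batch_size, num_processes = 100, 4, 2

with_drop_last = (dataset_len // (batch_size * num_processes)) * batch_size             # 12 * 4 = 48 per shard
without_drop_last = math.ceil(dataset_len / (batch_size * num_processes)) * batch_size  # 13 * 4 = 52 per shard
print(with_drop_last, without_drop_last)  # 48 52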

View File

@ -1462,6 +1462,15 @@ class DPRPretrainedContextEncoder:
requires_backends(self, ["torch"])
class DPRPreTrainedModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class DPRPretrainedQuestionEncoder:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])

View File

@ -14,6 +14,7 @@
# limitations under the License.
import tempfile
import unittest
from transformers import DPRConfig, is_torch_available
@ -213,6 +214,19 @@ class DPRModelTest(ModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_reader(*config_and_inputs)
def test_init_changed_config(self):
config = self.model_tester.prepare_config_and_inputs()[0]
model = DPRQuestionEncoder(config=config)
model.to(torch_device)
model.eval()
with tempfile.TemporaryDirectory() as tmp_dirname:
model.save_pretrained(tmp_dirname)
model = DPRQuestionEncoder.from_pretrained(tmp_dirname, projection_dim=512)
self.assertIsNotNone(model)
@slow
def test_model_from_pretrained(self):
for model_name in DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:

View File

@ -162,6 +162,11 @@ class LEDModelTester:
attention_window=self.attention_window,
)
def get_pipeline_config(self):
config = self.get_config()
config.max_position_embeddings = 100
return config
def prepare_config_and_inputs_for_common(self):
config, inputs_dict = self.prepare_config_and_inputs()
global_attention_mask = torch.zeros_like(inputs_dict["input_ids"])

View File

@ -189,6 +189,7 @@ class ReformerModelTester:
def get_pipeline_config(self):
config = self.get_config()
config.vocab_size = 100
config.max_position_embeddings = 100
config.axial_pos_shape = (4, 25)
config.is_decoder = False
return config

View File

@ -87,6 +87,12 @@ class QAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
outputs, [{"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)} for i in range(20)]
)
# Very long context require multiple features
outputs = question_answerer(
question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." * 20
)
self.assertEqual(outputs, {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)})
@require_torch
def test_small_model_pt(self):
question_answerer = pipeline(
@ -121,6 +127,73 @@ class QAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
self.assertEqual(nested_simplify(outputs), {"score": 0.979, "start": 27, "end": 32, "answer": "Paris"})
@slow
@require_torch
def test_large_model_issue(self):
qa_pipeline = pipeline(
"question-answering",
model="mrm8488/bert-multi-cased-finetuned-xquadv1",
)
outputs = qa_pipeline(
{
"context": "Yes Bank founder Rana Kapoor has approached the Bombay High Court, challenging a special court's order from August this year that had remanded him in police custody for a week in a multi-crore loan fraud case. Kapoor, who is currently lodged in Taloja Jail, is an accused in the loan fraud case and some related matters being probed by the CBI and Enforcement Directorate. A single bench presided over by Justice S K Shinde on Tuesday posted the plea for further hearing on October 14. In his plea filed through advocate Vijay Agarwal, Kapoor claimed that the special court's order permitting the CBI's request for police custody on August 14 was illegal and in breach of the due process of law. Therefore, his police custody and subsequent judicial custody in the case were all illegal. Kapoor has urged the High Court to quash and set aside the special court's order dated August 14. As per his plea, in August this year, the CBI had moved two applications before the special court, one seeking permission to arrest Kapoor, who was already in judicial custody at the time in another case, and the other, seeking his police custody. While the special court refused to grant permission to the CBI to arrest Kapoor, it granted the central agency's plea for his custody. Kapoor, however, said in his plea that before filing an application for his arrest, the CBI had not followed the process of issuing him a notice under Section 41 of the CrPC for appearance before it. He further said that the CBI had not taken prior sanction as mandated under section 17 A of the Prevention of Corruption Act for prosecuting him. The special court, however, had said in its order at the time that as Kapoor was already in judicial custody in another case and was not a free man the procedure mandated under Section 41 of the CrPC need not have been adhered to as far as issuing a prior notice of appearance was concerned. ADVERTISING It had also said that case records showed that the investigating officer had taken an approval from a managing director of Yes Bank before beginning the proceedings against Kapoor and such a permission was a valid sanction. However, Kapoor in his plea said that the above order was bad in law and sought that it be quashed and set aside. The law mandated that if initial action was not in consonance with legal procedures, then all subsequent actions must be held as illegal, he said, urging the High Court to declare the CBI remand and custody and all subsequent proceedings including the further custody as illegal and void ab-initio. In a separate plea before the High Court, Kapoor's daughter Rakhee Kapoor-Tandon has sought exemption from in-person appearance before a special PMLA court. Rakhee has stated that she is a resident of the United Kingdom and is unable to travel to India owing to restrictions imposed due to the COVID-19 pandemic. According to the CBI, in the present case, Kapoor had obtained a gratification or pecuniary advantage of ₹ 307 crore, and thereby caused Yes Bank a loss of ₹ 1,800 crore by extending credit facilities to Avantha Group, when it was not eligible for the same",
"question": "Is this person invovled in fraud?",
}
)
self.assertEqual(
nested_simplify(outputs),
{"answer": "an accused in the loan fraud case", "end": 294, "score": 0.001, "start": 261},
)
@slow
@require_torch
def test_large_model_course(self):
question_answerer = pipeline("question-answering")
long_context = """
🤗 Transformers: State of the Art NLP
🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction,
question answering, summarization, translation, text generation and more in over 100 languages.
Its aim is to make cutting-edge NLP easier to use for everyone.
🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and
then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and
can be modified to enable quick research experiments.
Why should I use transformers?
1. Easy-to-use state-of-the-art models:
- High performance on NLU and NLG tasks.
- Low barrier to entry for educators and practitioners.
- Few user-facing abstractions with just three classes to learn.
- A unified API for using all our pretrained models.
- Lower compute costs, smaller carbon footprint:
2. Researchers can share trained models instead of always retraining.
- Practitioners can reduce compute time and production costs.
- Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages.
3. Choose the right framework for every part of a model's lifetime:
- Train state-of-the-art models in 3 lines of code.
- Move a single model between TF2.0/PyTorch frameworks at will.
- Seamlessly pick the right framework for training, evaluation and production.
4. Easily customize a model or an example to your needs:
- We provide examples for each architecture to reproduce the results published by its original authors.
- Model internals are exposed as consistently as possible.
- Model files can be used independently of the library for quick experiments.
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"
outputs = question_answerer(question=question, context=long_context)
self.assertEqual(
nested_simplify(outputs),
{"answer": "Jax, PyTorch and TensorFlow", "end": 1919, "score": 0.971, "start": 1892},
)
@slow
@require_tf
def test_large_model_tf(self):

View File

@ -106,3 +106,14 @@ class TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM
outputs = text_generator("This is a test", return_full_text=True)
self.assertEqual(outputs, [{"generated_text": ANY(str)}])
self.assertTrue(outputs[0]["generated_text"].startswith("This is a test"))
# Empty prompt is slighly special
# it requires BOS token to exist.
# Special case for Pegasus which will always append EOS so will
# work even without BOS.
if text_generator.tokenizer.bos_token_id is not None or "Pegasus" in tokenizer.__class__.__name__:
outputs = text_generator("")
self.assertEqual(outputs, [{"generated_text": ANY(str)}])
else:
with self.assertRaises((ValueError, AssertionError)):
outputs = text_generator("")

View File

@ -25,7 +25,14 @@ from transformers import (
pipeline,
)
from transformers.pipelines import AggregationStrategy, TokenClassificationArgumentHandler
from transformers.testing_utils import is_pipeline_test, nested_simplify, require_tf, require_torch, slow
from transformers.testing_utils import (
is_pipeline_test,
nested_simplify,
require_tf,
require_torch,
require_torch_gpu,
slow,
)
from .test_pipelines_common import ANY, PipelineTestCaseMeta
@ -246,6 +253,19 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
],
)
@require_torch_gpu
@slow
def test_gpu(self):
sentence = "This is dummy sentence"
ner = pipeline(
"token-classification",
device=0,
aggregation_strategy=AggregationStrategy.SIMPLE,
)
output = ner(sentence)
self.assertEqual(nested_simplify(output), [])
@require_torch
@slow
def test_dbmdz_english(self):

View File

@ -61,6 +61,24 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase, metaclass=PipelineT
)
self.assertEqual(outputs, {"sequence": ANY(str), "labels": [ANY(str)], "scores": [ANY(float)]})
# https://github.com/huggingface/transformers/issues/13846
outputs = classifier(["I am happy"], ["positive", "negative"])
self.assertEqual(
outputs,
[
{"sequence": ANY(str), "labels": [ANY(str), ANY(str)], "scores": [ANY(float), ANY(float)]}
for i in range(1)
],
)
outputs = classifier(["I am happy", "I am sad"], ["positive", "negative"])
self.assertEqual(
outputs,
[
{"sequence": ANY(str), "labels": [ANY(str), ANY(str)], "scores": [ANY(float), ANY(float)]}
for i in range(2)
],
)
with self.assertRaises(ValueError):
classifier("", candidate_labels="politics")

View File

@ -3562,3 +3562,15 @@ class TrieTest(unittest.TestCase):
trie.add("extra_id_1")
trie.add("extra_id_100")
self.assertEqual(trie.split("[CLS] This is a extra_id_100"), ["[CLS]", " This is a ", "extra_id_100"])
def test_trie_single(self):
trie = Trie()
trie.add("A")
self.assertEqual(trie.split("ABC"), ["A", "BC"])
self.assertEqual(trie.split("BCA"), ["BC", "A"])
def test_trie_final(self):
trie = Trie()
trie.add("TOKEN]")
trie.add("[SPECIAL_TOKEN]")
self.assertEqual(trie.split("This is something [SPECIAL_TOKEN]"), ["This is something ", "[SPECIAL_TOKEN]"])

View File

@ -355,6 +355,34 @@ class TrainerUtilsTest(unittest.TestCase):
self.check_iterable_dataset_shard(dataset, 4, drop_last=True, num_processes=3, epoch=42)
self.check_iterable_dataset_shard(dataset, 4, drop_last=False, num_processes=3, epoch=42)
def test_iterable_dataset_shard_with_length(self):
sampler_shards = [
IterableDatasetShard(list(range(100)), batch_size=4, drop_last=True, num_processes=2, process_index=i)
for i in range(2)
]
# Build expected shards: each process will have batches of size 4 until there is not enough elements to
# form two full batches (so we stop at 96 = (100 // (4 * 2)) * 4)
expected_shards = [[], []]
current_shard = 0
for i in range(0, 96, 4):
expected_shards[current_shard].extend(list(range(i, i + 4)))
current_shard = 1 - current_shard
self.assertListEqual([list(shard) for shard in sampler_shards], expected_shards)
self.assertListEqual([len(shard) for shard in sampler_shards], [len(shard) for shard in expected_shards])
sampler_shards = [
IterableDatasetShard(list(range(100)), batch_size=4, drop_last=False, num_processes=2, process_index=i)
for i in range(2)
]
# When drop_last=False, we get two last full batches by looping back to the beginning.
expected_shards[0].extend(list(range(96, 100)))
expected_shards[1].extend(list(range(0, 4)))
self.assertListEqual([list(shard) for shard in sampler_shards], expected_shards)
self.assertListEqual([len(shard) for shard in sampler_shards], [len(shard) for shard in expected_shards])
def check_shard_sampler(self, dataset, batch_size, drop_last, num_processes=2):
shards = [
ShardSampler(

View File

@ -281,6 +281,7 @@ SPECIAL_MODULE_TO_TEST_MAP = {
"test_trainer_distributed.py",
"test_trainer_tpu.py",
],
"train_pt_utils.py": "test_trainer_utils.py",
"utils/versions.py": "test_versions_utils.py",
}