Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-23 19:04:35 +08:00)

Compare commits: fix-llama4 ... trigger-re

39 Commits
6d38d27ef3
20c0f8bc77
9b2afaf02d
d188134b95
e2ed15c465
005459827e
69419a4935
1fdb9f3908
3dfebf2fc0
e6093deb18
b7ec09c2f4
aa42987c1e
38a9b70786
9bcdd5cde9
31d30b7224
0725cd6953
797860c68c
89b35be618
9a02e7602d
54a02160eb
af6120b3eb
5d26a38735
a9ce8c69c9
0a53df1a77
b949747b54
11738f8537
f7b21822e3
3756bf192c
458e0b376c
ea01334873
b922b22ec2
c27f628e98
0a289d1630
c55d806355
9cd7570f34
1fc67a25c6
12d4c5b66f
3620b32cc8
cb0f604192
.github/workflows/self-scheduled-caller.yml (vendored, 65 lines changed)
@@ -7,7 +7,7 @@ on:
    - cron: "17 2 * * *"
  push:
    branches:
      - run_scheduled_ci*
      - trigger-remove-script-datasets-in-tests
  workflow_dispatch:
    inputs:
      prev_workflow_run_id:
@@ -25,7 +25,7 @@ on:
# Used for `push` to easily modiffy the target workflow runs to compare against
env:
  prev_workflow_run_id: ""
  other_workflow_run_id: ""
  other_workflow_run_id: "15770139098"


jobs:
@@ -56,64 +56,3 @@ jobs:
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit

  torch-pipeline:
    name: Torch pipeline CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_pipelines_torch_gpu
      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
      runner: daily-ci
      docker: huggingface/transformers-pytorch-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit

  example-ci:
    name: Example CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_examples_gpu
      slack_report_channel: "#transformers-ci-daily-examples"
      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit

  trainer-fsdp-ci:
    name: Trainer/FSDP CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_trainer_and_fsdp_gpu
      slack_report_channel: "#transformers-ci-daily-training"
      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit

  deepspeed-ci:
    name: DeepSpeed CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_torch_cuda_extensions_gpu
      slack_report_channel: "#transformers-ci-daily-training"
      runner: daily-ci
      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
      ci_event: Daily CI
      working-directory-prefix: /workspace
      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit

  quantization-ci:
    name: Quantization CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_quantization_torch_gpu
      slack_report_channel: "#transformers-ci-daily-quantization"
      runner: daily-ci
      docker: huggingface/transformers-quantization-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit
Makefile (18 lines changed)
@@ -8,13 +8,19 @@ check_dirs := examples tests src utils
exclude_folders := ""

modified_only_fixup:
    $(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
    @if test -n "$(modified_py_files)"; then \
        echo "Checking/fixing $(modified_py_files)"; \
        ruff check $(modified_py_files) --fix --exclude $(exclude_folders); \
        ruff format $(modified_py_files) --exclude $(exclude_folders);\
    @current_branch=$$(git branch --show-current); \
    if [ "$$current_branch" = "main" ]; then \
        echo "On main branch, running 'style' target instead..."; \
        $(MAKE) style; \
    else \
        echo "No library .py files were modified"; \
        modified_py_files=$$(python utils/get_modified_files.py $(check_dirs)); \
        if [ -n "$$modified_py_files" ]; then \
            echo "Checking/fixing files: $${modified_py_files}"; \
            ruff check $${modified_py_files} --fix --exclude $(exclude_folders); \
            ruff format $${modified_py_files} --exclude $(exclude_folders); \
        else \
            echo "No library .py files were modified"; \
        fi; \
    fi

# Update src/transformers/dependency_versions_table.py
@@ -14,84 +14,127 @@ rendered properly in your Markdown viewer.

-->

# Bamba

<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
<div style="float: right;">
    <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
    </div>
</div>

## Overview
# Bamba

Bamba-9B is a decoder-only language model based on the [Mamba-2](https://github.com/state-spaces/mamba) architecture and is designed to handle a wide range of text generation tasks. It is trained from scratch using a two-stage training approach. In the first stage, the model is trained on 2 trillion tokens from the Dolma v1.7 dataset. In the second stage, it undergoes additional training on 200 billion tokens, leveraging a carefully curated blend of high-quality data to further refine its performance and enhance output quality.
[Bamba](https://huggingface.co/blog/bamba) is a 9B parameter decoder-only language model built on the [Mamba-2](./mamba2) architecture. It is pretrained in two stages - it starts by training on 2T tokens from the [Dolma v1.7](https://huggingface.co/datasets/allenai/dolma) dataset and then trained on an additional 200B tokens from [FineWeb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) and [Cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia).

Checkout all Bamba-9B model checkpoints [here](https://github.com/foundation-model-stack/bamba).
You can find all the original Bamba checkpoints under the [Bamba](https://huggingface.co/collections/ibm-ai-platform/bamba-674f1388b9bbc98b413c7bab) collection.

> [!TIP]
> This model was contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim).
>
> Click on the Bamba models in the right sidebar for more examples of how to apply Bamba to different text generation tasks.

The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.

<hfoptions id="usage">
<hfoption id="Pipeline">

```python
import torch
from transformers import pipeline

pipeline = pipeline(
    task="text-generation",
    model="ibm-ai-platform/Bamba-9B-v2",
    torch_dtype=torch.bfloat16,
    device=0
)
pipeline("Plants create energy through a process known as")
```

</hfoption>

<hfoption id="AutoModel">

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ibm-ai-platform/Bamba-9B-v2")
model = AutoModelForCausalLM.from_pretrained("ibm-ai-platform/Bamba-9B-v2", torch_dtype=torch.bfloat16, device_map="auto", attn_implementation="sdpa")
input_ids = tokenizer("Plants create energy through a process known as", return_tensors="pt").to("cuda")

output = model.generate(**input_ids)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

</hfoption>

<hfoption id="transformers CLI">
```bash
echo "Plants create energy through a process known as" | transformers-cli run --task text-generation --model ibm-ai-platform/Bamba-9B-v2 --device 0
```
</hfoption>
</hfoptions>

Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
tokenizer = AutoTokenizer.from_pretrained("ibm-ai-platform/Bamba-9B-v2")
model = AutoModelForCausalLM.from_pretrained(
    "ibm-ai-platform/Bamba-9B-v2",
    quantization_config=quantization_config,
    device_map="auto",
    attn_implementation="sdpa"
)

inputs = tokenizer("Plants create energy through a process known as", return_tensors="pt").to("cuda")
output = model.generate(**inputs)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

## Notes

- Bamba supports padding-free training which concatenates distinct training examples while still processing inputs as separate batches. It can significantly accelerate inference by [~2x](https://github.com/huggingface/transformers/pull/35861#issue-2807873129) (depending on model and data distribution) and reduce memory-usage if there are examples of varying lengths by avoiding unnecessary compute and memory overhead from padding tokens.

  Padding-free training requires the `flash-attn`, `mamba-ssm`, and `causal-conv1d` packages and the following arguments must be passed to the model in addition to `input_ids` and `labels`.

  - `position_ids: torch.LongTensor`: the position index of each token in each sequence.
  - `seq_idx: torch.IntTensor`: the index of each sequence in the batch.
  - Each of the [`FlashAttentionKwargs`]
    - `cu_seq_lens_q: torch.LongTensor`: the cumulative sequence lengths of all queries.
    - `cu_seq_lens_k: torch.LongTensor`: the cumulative sequence lengths of all keys.
    - `max_length_q: int`: the longest query length in the batch.
    - `max_length_k: int`: the longest key length in the batch.

  The `attention_mask` inputs should not be provided. The [`DataCollatorWithFlattening`] programmatically generates the set of additional arguments above using `return_seq_idx=True` and `return_flash_attn_kwargs=True`. See the [Improving Hugging Face Training Efficiency Through Packing with Flash Attention](https://huggingface.co/blog/packing-with-FA2) blog post for additional information.

  ```python
  from transformers import DataCollatorWithFlattening

  # Example of using padding-free training
  data_collator = DataCollatorWithFlattening(
      tokenizer=tokenizer,
      return_seq_idx=True,
      return_flash_attn_kwargs=True
  )
  ```
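  A minimal sketch of wiring this collator into [`Trainer`] follows; the model, tokenized dataset, and training arguments here are illustrative placeholders rather than part of the upstream snippet:

  ```python
  from transformers import Trainer, TrainingArguments

  # `model` is assumed to be the Bamba model loaded above and `train_dataset` a tokenized
  # dataset of {"input_ids", "labels"} examples; the collator flattens each batch on the fly.
  trainer = Trainer(
      model=model,
      args=TrainingArguments(output_dir="bamba-padding-free", remove_unused_columns=False),
      train_dataset=train_dataset,
      data_collator=data_collator,
  )
  trainer.train()
  ```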

## BambaConfig

| Model | Params | # Layers | Hidden Dim. | Attention Heads | GQA | KV Heads | Context Length | Tied Embeddings |
|-------|--------|----------|-------------|-----------------|-----|----------|----------------|-----------------|
| Bamba | 9B (9.78B) | 32 | 4096 | 32 | Yes | 8 | 4096 | True |

[[autodoc]] BambaConfig

<!---
## Usage Tips

Tips:

- The architecture is based on Mamba-2 models.

## BambaModel

[[autodoc]] BambaModel
    - forward
-->

## BambaForCausalLM

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("ibm-fms/Bamba-9B")
tokenizer = AutoTokenizer.from_pretrained("ibm-fms/Bamba-9B")

message = ["Mamba is a snake with following properties "]
inputs = tokenizer(message, return_tensors='pt', return_token_type_ids=False)
response = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.batch_decode(response, skip_special_tokens=True)[0])
```


## Padding-Free Training

Bamba supports padding-free training in which distinct training examples can be concatenated
together while nevertheless processing the inputs as though they belonged to separate batches. When
the examples are of varying lengths, padding-free training can provide significant speed ups and
memory savings compared to batching the examples together and using padding, as the unnecessary
compute and memory due to padding is avoided entirely. The performance gains depend on factors such
as the model and the data distribution, but throughput gains up to [~2x are commonly
seen](https://github.com/huggingface/transformers/pull/35861#issue-2807873129).

Using padding-free training with Bamba requires the `flash-attn`, `mamba-ssm`, and `causal-conv1d`
packages, and the following arguments must be passed to the model in addition to `input_ids` and
`labels`:
* `position_ids: torch.LongTensor`: the position index of each token in each sequence.
* `seq_idx: torch.IntTensor`: the index of each sequence in the batch.
* Each of the [`FlashAttentionKwargs`]
    * `cu_seq_lens_q: torch.LongTensor`: The cumulative sequence lengths of all queries.
    * `cu_seq_lens_k: torch.LongTensor`: The cumulative sequence lengths of all keys.
    * `max_length_q: int`: the longest query length in the batch.
    * `max_length_k: int`: the longest key length in the batch.

The `attention_mask` inputs should not be provided. The [`DataCollatorWithFlattening`] can be used
to programmatically generate the above set of additional arguments using `return_seq_idx=True` and
`return_flash_attn_kwargs=True`. See [this blog post](https://huggingface.co/blog/packing-with-FA2)
for additional information.


[[autodoc]] BambaForCausalLM
    - forward

This HF implementation is contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim).
@@ -78,7 +78,13 @@ If you're interested in submitting a resource to be included here, please feel f

[[autodoc]] DPTImageProcessor
    - preprocess

## DPTImageProcessorFast

[[autodoc]] DPTImageProcessorFast
    - preprocess
    - post_process_semantic_segmentation
    - post_process_depth_estimation
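As a rough illustration of how the fast processor and these post-processing hooks fit together (the checkpoint name and test image below are placeholders, not prescribed by this page):

```python
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, DPTForDepthEstimation

# Placeholder checkpoint and test image; any DPT depth-estimation checkpoint should behave similarly
image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large", use_fast=True)
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Resize the raw depth prediction back to the original image resolution
results = image_processor.post_process_depth_estimation(outputs, target_sizes=[(image.height, image.width)])
```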

## DPTModel
@@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:

```python
>>> # let's load an audio sample from an Arabic speech corpus
>>> from datasets import load_dataset
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
>>> audio_sample = next(iter(dataset))["audio"]

>>> # now, process it

@@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:

```python
>>> # let's load an audio sample from an Arabic speech corpus
>>> from datasets import load_dataset
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
>>> audio_sample = next(iter(dataset))["audio"]

>>> # now, process it
@@ -56,6 +56,7 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi
on both printed (e.g. the [SROIE dataset](https://paperswithcode.com/dataset/sroie) and handwritten (e.g. the [IAM
Handwriting dataset](https://fki.tic.heia-fr.ch/databases/iam-handwriting-database>) text recognition tasks. For more
information, see the [official models](https://huggingface.co/models?other=trocr>).
- [Fine‑tune TrOCR on your own OCR dataset](https://github.com/Ashutosh-4485/trocr-custom-fine-tune.git).
- TrOCR is always used within the [VisionEncoderDecoder](vision-encoder-decoder) framework.
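To make that last point concrete, a minimal inference sketch through the VisionEncoderDecoder API looks roughly like the following; the checkpoint name and image path are placeholders:

```python
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Placeholder checkpoint and input; use any TrOCR checkpoint and a cropped single-line text image
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

image = Image.open("text_line.png").convert("RGB")
pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```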
## Resources
@@ -493,6 +493,33 @@ training_args = TrainingArguments(
)
```

You can also configure which specific kernels to apply using the `liger_kernel_config` parameter. This dict is passed as keyword arguments to the `_apply_liger_kernel_to_instance` function, allowing fine-grained control over kernel usage. Available options vary by model but typically include: `rope`, `swiglu`, `cross_entropy`, `fused_linear_cross_entropy`, `rms_norm`, etc.

```py
from transformers import TrainingArguments

# Apply only specific kernels
training_args = TrainingArguments(
    output_dir="your-model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    use_liger_kernel=True,
    liger_kernel_config={
        "rope": True,
        "cross_entropy": True,
        "rms_norm": False,  # Don't apply Liger's RMSNorm kernel
        "swiglu": True,
    }
)
```

### NEFTune

[NEFTune](https://hf.co/papers/2310.05914) adds noise to the embedding vectors during training to improve model performance. Enable it in [`Trainer`] with the `neftune_noise_alpha` parameter in [`TrainingArguments`] to control how much noise is added.
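For example, a minimal sketch of enabling it (the output directory and alpha value below are illustrative choices, not recommendations):

```py
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="your-model",
    neftune_noise_alpha=5,  # magnitude of the noise added to the embedding vectors
)
```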
@@ -264,7 +264,6 @@ class ExamplesTests(TestCasePlus):
            --dataset_config clean
            --train_split_name validation
            --eval_split_name validation
            --trust_remote_code
            --output_dir {tmp_dir}
            --overwrite_output_dir
            --num_train_epochs=2

@@ -312,7 +312,6 @@ class ExamplesTestsNoTrainer(TestCasePlus):
            {self.examples_dir}/pytorch/image-classification/run_image_classification_no_trainer.py
            --model_name_or_path google/vit-base-patch16-224-in21k
            --dataset_name hf-internal-testing/cats_vs_dogs_sample
            --trust_remote_code
            --learning_rate 1e-4
            --per_device_train_batch_size 2
            --per_device_eval_batch_size 1

@@ -390,7 +390,6 @@ class ExamplesTests(TestCasePlus):
            --output_dir {tmp_dir}
            --model_name_or_path google/vit-base-patch16-224-in21k
            --dataset_name hf-internal-testing/cats_vs_dogs_sample
            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4
@@ -424,7 +423,6 @@ class ExamplesTests(TestCasePlus):
            --dataset_config_name clean
            --train_split_name validation
            --eval_split_name validation
            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4
@@ -455,7 +453,6 @@ class ExamplesTests(TestCasePlus):
            --dataset_config_name clean
            --train_split_name validation
            --eval_split_name validation
            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4
@@ -488,7 +485,6 @@ class ExamplesTests(TestCasePlus):
            --dataset_config_name clean
            --train_split_name validation
            --eval_split_name validation
            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4
@@ -516,7 +512,6 @@ class ExamplesTests(TestCasePlus):
            --output_dir {tmp_dir}
            --model_name_or_path hf-internal-testing/tiny-random-wav2vec2
            --dataset_name anton-l/superb_demo
            --trust_remote_code
            --dataset_config_name ks
            --train_split_name test
            --eval_split_name test
@@ -551,7 +546,6 @@ class ExamplesTests(TestCasePlus):
            --dataset_name hf-internal-testing/librispeech_asr_dummy
            --dataset_config_names clean
            --dataset_split_names validation
            --trust_remote_code
            --learning_rate 1e-4
            --per_device_train_batch_size 4
            --per_device_eval_batch_size 4
@@ -572,7 +566,6 @@ class ExamplesTests(TestCasePlus):
            run_mae.py
            --output_dir {tmp_dir}
            --dataset_name hf-internal-testing/cats_vs_dogs_sample
            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4

@@ -315,7 +315,6 @@ class ExamplesTests(TestCasePlus):
        testargs = f"""
            run_image_classification.py
            --dataset_name hf-internal-testing/cats_vs_dogs_sample
            --trust_remote_code
            --model_name_or_path microsoft/resnet-18
            --do_train
            --do_eval
@@ -710,8 +710,8 @@ class AssistantToTargetTranslator:
        assistant_model: Optional["PreTrainedModel"] = None,
        assistant_prune_lm_head: bool = False,
    ):
        self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer
        self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer
        self._target_tokenizer: PreTrainedTokenizerBase = target_tokenizer
        self._assistant_tokenizer: PreTrainedTokenizerBase = assistant_tokenizer
        self._assistant_model_device: str = (
            assistant_model_device if assistant_model is None else assistant_model.device
        )
@@ -72,7 +72,7 @@ class TextStreamer(BaseStreamer):
    ```
    """

    def __init__(self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, **decode_kwargs):
    def __init__(self, tokenizer: AutoTokenizer, skip_prompt: bool = False, **decode_kwargs):
        self.tokenizer = tokenizer
        self.skip_prompt = skip_prompt
        self.decode_kwargs = decode_kwargs
@@ -206,7 +206,7 @@ class TextIteratorStreamer(TextStreamer):
    """

    def __init__(
        self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, timeout: Optional[float] = None, **decode_kwargs
        self, tokenizer: AutoTokenizer, skip_prompt: bool = False, timeout: Optional[float] = None, **decode_kwargs
    ):
        super().__init__(tokenizer, skip_prompt, **decode_kwargs)
        self.text_queue = Queue()
@@ -284,7 +284,7 @@ class AsyncTextIteratorStreamer(TextStreamer):
    """

    def __init__(
        self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, timeout: Optional[float] = None, **decode_kwargs
        self, tokenizer: AutoTokenizer, skip_prompt: bool = False, timeout: Optional[float] = None, **decode_kwargs
    ):
        super().__init__(tokenizer, skip_prompt, **decode_kwargs)
        self.text_queue = asyncio.Queue()
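For context on how these streamers are consumed, here is a minimal `TextIteratorStreamer` sketch; the checkpoint is a placeholder and any causal LM works the same way:

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder checkpoint; swap in any causal LM
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Plants create energy through", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, timeout=60.0)

# Run generation in a background thread and consume decoded text as it becomes available
thread = Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": 20})
thread.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()
```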
@@ -4723,7 +4723,7 @@ class GenerationMixin(ContinuousMixin):
            )

            if return_dict_in_generate and output_scores:
                beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices))))
                beam_indices = tuple(beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))

            # increase cur_len
            cur_len = cur_len + 1
@@ -938,6 +938,7 @@ class WandbCallback(TrainerCallback):

            args_for_fake = copy.deepcopy(args)
            args_for_fake.deepspeed = None
            args_for_fake.deepspeed_plugin = None
            fake_trainer = Trainer(
                args=args_for_fake, model=model, processing_class=processing_class, eval_dataset=["fake"]
            )
@@ -1625,8 +1626,8 @@ class NeptuneCallback(TrainerCallback):
                target_path = consistent_checkpoint_path
            except OSError as e:
                logger.warning(
                    "NeptuneCallback was unable to made a copy of checkpoint due to I/O exception: '{}'. "
                    "Could fail trying to upload.".format(e)
                    f"NeptuneCallback was unable to made a copy of checkpoint due to I/O exception: '{e}'. "
                    "Could fail trying to upload."
                )

            self._metadata_namespace[self._target_checkpoints_namespace].upload_files(target_path)
@@ -1975,9 +1976,7 @@ class ClearMLCallback(TrainerCallback):
                        )
                    except Exception as e:
                        logger.warning(
                            "Could not remove checkpoint `{}` after going over the `save_total_limit`. Error is: {}".format(
                                self._checkpoints_saved[0].name, e
                            )
                            f"Could not remove checkpoint `{self._checkpoints_saved[0].name}` after going over the `save_total_limit`. Error is: {e}"
                        )
                        break
                self._checkpoints_saved = self._checkpoints_saved[1:]
@@ -38,7 +38,14 @@ if SPARSE_MODE not in [TOP_LEFT_ALIGNED_CAUSAL_MASK_MODE, DOWN_RIGHT_ALIGNED_CAU
        "or 3 (down-right aligned causal mask)."
    )

ATTN_MASK_NPU = None
ATTN_MASK_NPU_CACHE = {}


def get_attn_mask_npu(device):
    """Get or create attention mask for the specified device."""
    if device not in ATTN_MASK_NPU_CACHE:
        ATTN_MASK_NPU_CACHE[device] = torch.triu(torch.ones([2048, 2048], device=device), diagonal=1).bool()
    return ATTN_MASK_NPU_CACHE[device]


def is_npu_fa2_top_left_aligned_causal_mask():
@@ -174,9 +181,7 @@ def npu_flash_attn_func(
        head_num = q.shape[2]
        output = torch_npu.npu_fusion_attention(q, k, v, head_num, "BSND", keep_prob=keep_prob, scale=softmax_scale)[0]
    else:
        global ATTN_MASK_NPU
        if ATTN_MASK_NPU is None:
            ATTN_MASK_NPU = torch.triu(torch.ones([2048, 2048], device=q.device), diagonal=1).bool()
        attn_mask_npu = get_attn_mask_npu(q.device)
        head_num = q.shape[2]
        output = torch_npu.npu_fusion_attention(
            q,
@@ -186,7 +191,7 @@ def npu_flash_attn_func(
            "BSND",
            keep_prob=keep_prob,
            scale=softmax_scale,
            atten_mask=ATTN_MASK_NPU,
            atten_mask=attn_mask_npu,
            sparse_mode=SPARSE_MODE,
        )[0]

@@ -227,9 +232,7 @@ def npu_flash_attn_varlen_func(
            actual_seq_kvlen=tuple(cu_seqlens_k[1:].cpu().numpy().tolist()),
        )[0]
    else:
        global ATTN_MASK_NPU
        if ATTN_MASK_NPU is None:
            ATTN_MASK_NPU = torch.triu(torch.ones([2048, 2048], device=q.device), diagonal=1).bool()
        attn_mask_npu = get_attn_mask_npu(q.device)
        head_num = q.shape[1]
        output = torch_npu.npu_fusion_attention(
            q,
@@ -238,7 +241,7 @@ def npu_flash_attn_varlen_func(
            head_num,
            pse=None,
            padding_mask=None,
            atten_mask=ATTN_MASK_NPU,
            atten_mask=attn_mask_npu,
            scale=softmax_scale,
            keep_prob=keep_prob,
            input_layout="TND",
@@ -1409,10 +1409,10 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT

    def prepare_tf_dataset(
        self,
        dataset: "datasets.Dataset", # noqa:F821
        dataset: datasets.Dataset, # noqa:F821
        batch_size: int = 8,
        shuffle: bool = True,
        tokenizer: Optional["PreTrainedTokenizerBase"] = None,
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
        collate_fn: Optional[Callable] = None,
        collate_fn_args: Optional[dict[str, Any]] = None,
        drop_remainder: Optional[bool] = None,
@ -29,7 +29,6 @@ import warnings
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from functools import partial, wraps
|
||||
from threading import Thread
|
||||
@ -41,7 +40,6 @@ from huggingface_hub import split_torch_state_dict_into_shards
|
||||
from packaging import version
|
||||
from torch import Tensor, nn
|
||||
from torch.distributions import constraints
|
||||
from torch.nn import CrossEntropyLoss, Identity
|
||||
from torch.utils.checkpoint import checkpoint
|
||||
|
||||
from transformers.utils import is_torchao_available
|
||||
@ -50,7 +48,6 @@ from transformers.utils import is_torchao_available
|
||||
if is_torchao_available():
|
||||
from torchao.quantization import Int4WeightOnlyConfig
|
||||
|
||||
from .activations import get_activation
|
||||
from .configuration_utils import PretrainedConfig
|
||||
from .dynamic_module_utils import custom_object_save
|
||||
from .generation import CompileConfig, GenerationConfig
|
||||
@ -98,7 +95,6 @@ from .utils import (
|
||||
WEIGHTS_INDEX_NAME,
|
||||
WEIGHTS_NAME,
|
||||
ContextManagers,
|
||||
ModelOutput,
|
||||
PushToHubMixin,
|
||||
cached_file,
|
||||
check_torch_load_is_safe,
|
||||
@ -123,7 +119,6 @@ from .utils import (
|
||||
is_torch_xla_available,
|
||||
is_torch_xpu_available,
|
||||
logging,
|
||||
replace_return_docstrings,
|
||||
strtobool,
|
||||
)
|
||||
from .utils.generic import GeneralInterface
|
||||
@ -380,10 +375,10 @@ def get_parameter_dtype(parameter: Union[nn.Module, "ModuleUtilsMixin"]):
|
||||
|
||||
gen = parameter._named_members(get_members_fn=find_tensor_attributes)
|
||||
last_tuple = None
|
||||
for tuple in gen:
|
||||
last_tuple = tuple
|
||||
if tuple[1].is_floating_point():
|
||||
return tuple[1].dtype
|
||||
for gen_tuple in gen:
|
||||
last_tuple = gen_tuple
|
||||
if gen_tuple[1].is_floating_point():
|
||||
return gen_tuple[1].dtype
|
||||
|
||||
if last_tuple is not None:
|
||||
# fallback to the last dtype
|
||||
@ -3897,7 +3892,20 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
|
||||
@wraps(torch.nn.Module.cuda)
|
||||
def cuda(self, *args, **kwargs):
|
||||
if getattr(self, "quantization_method", None) == QuantizationMethod.HQQ:
|
||||
raise ValueError("`.cuda` is not supported for HQQ-quantized models.")
|
||||
from hqq.core.quantize import HQQLinear
|
||||
|
||||
# Since HQQLinear stores some tensors in the 'meta' attribute,
|
||||
# it's necessary to manually call the `cuda` method on HQQLinear layers.
|
||||
super().cuda(*args, **kwargs)
|
||||
for module in self.modules():
|
||||
if isinstance(module, HQQLinear):
|
||||
if len(args) > 0:
|
||||
device = args[0]
|
||||
else:
|
||||
device = kwargs.get("device", "cuda")
|
||||
module.cuda(device)
|
||||
return self
|
||||
|
||||
# Checks if the model has been loaded in 4-bit or 8-bit with BNB
|
||||
if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES:
|
||||
if getattr(self, "is_loaded_in_8bit", False):
|
||||
@ -3910,8 +3918,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
|
||||
"Calling `cuda()` is not supported for `4-bit` quantized models with the installed version of bitsandbytes. "
|
||||
f"The current device is `{self.device}`. If you intended to move the model, please install bitsandbytes >= 0.43.2."
|
||||
)
|
||||
else:
|
||||
return super().cuda(*args, **kwargs)
|
||||
return super().cuda(*args, **kwargs)
|
||||
|
||||
@wraps(torch.nn.Module.to)
|
||||
def to(self, *args, **kwargs):
|
||||
@ -3926,7 +3933,30 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
|
||||
break
|
||||
|
||||
if getattr(self, "quantization_method", None) == QuantizationMethod.HQQ:
|
||||
raise ValueError("`.to` is not supported for HQQ-quantized models.")
|
||||
from hqq.core.quantize import HQQLinear
|
||||
|
||||
# Since HQQLinear stores some tensors in the 'meta' attribute, we must
|
||||
# explicitly move the parameters to the target device for each HQQLinear layer after `to`.
|
||||
super().to(*args, **kwargs)
|
||||
for module in self.modules():
|
||||
if isinstance(module, HQQLinear):
|
||||
if "device" in kwargs:
|
||||
device = kwargs["device"]
|
||||
else:
|
||||
device = args[0]
|
||||
if "dtype" in kwargs:
|
||||
dtype = kwargs["dtype"]
|
||||
elif dtype_present_in_args:
|
||||
dtype = arg
|
||||
else:
|
||||
dtype = None
|
||||
# Due to the current messy implementation of HQQLinear, updating `compute_dtype`
|
||||
# followed by calling the `cuda` method achieves the intended behavior of `to`,
|
||||
# even when the target device is CPU.
|
||||
if dtype is not None:
|
||||
module.compute_dtype = dtype
|
||||
module.cuda(device)
|
||||
return self
|
||||
|
||||
if dtype_present_in_args and getattr(self, "quantization_method", None) == QuantizationMethod.QUARK:
|
||||
raise ValueError("Casting a Quark quantized model to a new `dtype` is not supported.")
|
||||
@ -4389,10 +4419,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
|
||||
raise ValueError("DeepSpeed Zero-3 is not compatible with passing a `device_map`.")
|
||||
if not is_accelerate_available():
|
||||
raise ValueError(
|
||||
(
|
||||
"Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` "
|
||||
"requires `accelerate`. You can install it with `pip install accelerate`"
|
||||
)
|
||||
"Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` "
|
||||
"requires `accelerate`. You can install it with `pip install accelerate`"
|
||||
)
|
||||
|
||||
# handling bnb config from kwargs, remove after `load_in_{4/8}bit` deprecation.
|
||||
@ -5591,453 +5619,6 @@ if PreTrainedModel.push_to_hub.__doc__ is not None:
|
||||
)
|
||||
|
||||
|
||||
class PoolerStartLogits(nn.Module):
|
||||
"""
|
||||
Compute SQuAD start logits from sequence hidden states.
|
||||
|
||||
Args:
|
||||
config ([`PretrainedConfig`]):
|
||||
The config used by the model, will be used to grab the `hidden_size` of the model.
|
||||
"""
|
||||
|
||||
def __init__(self, config: PretrainedConfig):
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, 1)
|
||||
logger.warning_once(
|
||||
"[DEPRECATION WARNING] `PoolerStartLogits` is deprecated and will be removed in v4.53. "
|
||||
"Please use model-specific class, e.g. `XLMPoolerStartLogits`."
|
||||
)
|
||||
|
||||
def forward(
|
||||
self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None
|
||||
) -> torch.FloatTensor:
|
||||
"""
|
||||
Args:
|
||||
hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
|
||||
The final hidden states of the model.
|
||||
p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
|
||||
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
|
||||
should be masked.
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: The start logits for SQuAD.
|
||||
"""
|
||||
x = self.dense(hidden_states).squeeze(-1)
|
||||
|
||||
if p_mask is not None:
|
||||
if get_parameter_dtype(self) == torch.float16:
|
||||
x = x * (1 - p_mask) - 65500 * p_mask
|
||||
else:
|
||||
x = x * (1 - p_mask) - 1e30 * p_mask
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class PoolerEndLogits(nn.Module):
|
||||
"""
|
||||
Compute SQuAD end logits from sequence hidden states.
|
||||
|
||||
Args:
|
||||
config ([`PretrainedConfig`]):
|
||||
The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
|
||||
to use.
|
||||
"""
|
||||
|
||||
def __init__(self, config: PretrainedConfig):
|
||||
super().__init__()
|
||||
self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
|
||||
self.activation = nn.Tanh()
|
||||
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.dense_1 = nn.Linear(config.hidden_size, 1)
|
||||
logger.warning_once(
|
||||
"[DEPRECATION WARNING] `PoolerEndLogits` is deprecated and will be removed in v4.53. "
|
||||
"Please use model-specific class, e.g. `XLMPoolerEndLogits`."
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.FloatTensor,
|
||||
start_states: Optional[torch.FloatTensor] = None,
|
||||
start_positions: Optional[torch.LongTensor] = None,
|
||||
p_mask: Optional[torch.FloatTensor] = None,
|
||||
) -> torch.FloatTensor:
|
||||
"""
|
||||
Args:
|
||||
hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
|
||||
The final hidden states of the model.
|
||||
start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
|
||||
The hidden states of the first tokens for the labeled span.
|
||||
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
The position of the first token for the labeled span.
|
||||
p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
|
||||
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
|
||||
should be masked.
|
||||
|
||||
<Tip>
|
||||
|
||||
One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
|
||||
`start_states`.
|
||||
|
||||
</Tip>
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: The end logits for SQuAD.
|
||||
"""
|
||||
assert start_states is not None or start_positions is not None, (
|
||||
"One of start_states, start_positions should be not None"
|
||||
)
|
||||
if start_positions is not None:
|
||||
slen, hsz = hidden_states.shape[-2:]
|
||||
start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
|
||||
start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz)
|
||||
start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz)
|
||||
|
||||
x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
|
||||
x = self.activation(x)
|
||||
x = self.LayerNorm(x)
|
||||
x = self.dense_1(x).squeeze(-1)
|
||||
|
||||
if p_mask is not None:
|
||||
if get_parameter_dtype(self) == torch.float16:
|
||||
x = x * (1 - p_mask) - 65500 * p_mask
|
||||
else:
|
||||
x = x * (1 - p_mask) - 1e30 * p_mask
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class PoolerAnswerClass(nn.Module):
|
||||
"""
|
||||
Compute SQuAD 2.0 answer class from classification and start tokens hidden states.
|
||||
|
||||
Args:
|
||||
config ([`PretrainedConfig`]):
|
||||
The config used by the model, will be used to grab the `hidden_size` of the model.
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
|
||||
self.activation = nn.Tanh()
|
||||
self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
|
||||
logger.warning_once(
|
||||
"[DEPRECATION WARNING] `PoolerAnswerClass` is deprecated and will be removed in v4.53. "
|
||||
"Please use model-specific class, e.g. `XLMPoolerAnswerClass`."
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.FloatTensor,
|
||||
start_states: Optional[torch.FloatTensor] = None,
|
||||
start_positions: Optional[torch.LongTensor] = None,
|
||||
cls_index: Optional[torch.LongTensor] = None,
|
||||
) -> torch.FloatTensor:
|
||||
"""
|
||||
Args:
|
||||
hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
|
||||
The final hidden states of the model.
|
||||
start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
|
||||
The hidden states of the first tokens for the labeled span.
|
||||
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
The position of the first token for the labeled span.
|
||||
cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
|
||||
|
||||
<Tip>
|
||||
|
||||
One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
|
||||
`start_states`.
|
||||
|
||||
</Tip>
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: The SQuAD 2.0 answer class.
|
||||
"""
|
||||
# No dependency on end_feature so that we can obtain one single `cls_logits` for each sample.
|
||||
hsz = hidden_states.shape[-1]
|
||||
assert start_states is not None or start_positions is not None, (
|
||||
"One of start_states, start_positions should be not None"
|
||||
)
|
||||
if start_positions is not None:
|
||||
start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
|
||||
start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz)
|
||||
|
||||
if cls_index is not None:
|
||||
cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
|
||||
cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz)
|
||||
else:
|
||||
cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz)
|
||||
|
||||
x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
|
||||
x = self.activation(x)
|
||||
x = self.dense_1(x).squeeze(-1)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
@dataclass
|
||||
class SquadHeadOutput(ModelOutput):
|
||||
"""
|
||||
Base class for outputs of question answering models using a [`~modeling_utils.SQuADHead`].
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
|
||||
Classification loss as the sum of start token, end token (and is_impossible if provided) classification
|
||||
losses.
|
||||
start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
|
||||
Log probabilities for the top config.start_n_top start token possibilities (beam-search).
|
||||
start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
|
||||
Indices for the top config.start_n_top start token possibilities (beam-search).
|
||||
end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
|
||||
Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
|
||||
(beam-search).
|
||||
end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
|
||||
Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
|
||||
cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
|
||||
Log probabilities for the `is_impossible` label of the answers.
|
||||
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
start_top_log_probs: Optional[torch.FloatTensor] = None
|
||||
start_top_index: Optional[torch.LongTensor] = None
|
||||
end_top_log_probs: Optional[torch.FloatTensor] = None
|
||||
end_top_index: Optional[torch.LongTensor] = None
|
||||
cls_logits: Optional[torch.FloatTensor] = None
|
||||
|
||||
def __post_init__(self):
|
||||
logger.warning_once(
|
||||
"[DEPRECATION WARNING] `SquadHeadOutput` is deprecated and will be removed in v4.53. "
|
||||
"Please use model-specific class, e.g. `XLMSquadHeadOutput`."
|
||||
)
|
||||
|
||||
|
||||
class SQuADHead(nn.Module):
|
||||
r"""
|
||||
A SQuAD head inspired by XLNet.
|
||||
|
||||
Args:
|
||||
config ([`PretrainedConfig`]):
|
||||
The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
|
||||
to use.
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.start_n_top = config.start_n_top
|
||||
self.end_n_top = config.end_n_top
|
||||
|
||||
self.start_logits = PoolerStartLogits(config)
|
||||
self.end_logits = PoolerEndLogits(config)
|
||||
self.answer_class = PoolerAnswerClass(config)
|
||||
|
||||
logger.warning_once(
|
||||
"[DEPRECATION WARNING] `SQuADHead` is deprecated and will be removed in v4.53. "
|
||||
"Please use model-specific class, e.g. `XLMSQuADHead`."
|
||||
)
|
||||
|
||||
@replace_return_docstrings(output_type=SquadHeadOutput, config_class=PretrainedConfig)
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.FloatTensor,
|
||||
start_positions: Optional[torch.LongTensor] = None,
|
||||
end_positions: Optional[torch.LongTensor] = None,
|
||||
cls_index: Optional[torch.LongTensor] = None,
|
||||
is_impossible: Optional[torch.LongTensor] = None,
|
||||
p_mask: Optional[torch.FloatTensor] = None,
|
||||
return_dict: bool = False,
|
||||
) -> Union[SquadHeadOutput, tuple[torch.FloatTensor]]:
|
||||
"""
|
||||
Args:
|
||||
hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
|
||||
Final hidden states of the model on the sequence tokens.
|
||||
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Positions of the first token for the labeled span.
|
||||
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Positions of the last token for the labeled span.
|
||||
cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
|
||||
is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Whether the question has a possible answer in the paragraph or not.
|
||||
p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
|
||||
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
|
||||
should be masked.
|
||||
return_dict (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||
|
||||
Returns:
|
||||
"""
|
||||
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
|
||||
|
||||
if start_positions is not None and end_positions is not None:
|
||||
# If we are on multi-GPU, let's remove the dimension added by batch splitting
|
||||
for x in (start_positions, end_positions, cls_index, is_impossible):
|
||||
if x is not None and x.dim() > 1:
|
||||
x.squeeze_(-1)
|
||||
|
||||
# during training, compute the end logits based on the ground truth of the start position
|
||||
end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
|
||||
|
||||
loss_fct = CrossEntropyLoss()
|
||||
start_loss = loss_fct(start_logits, start_positions)
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if cls_index is not None and is_impossible is not None:
|
||||
# Predict answerability from the representation of CLS and START
|
||||
cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
|
||||
loss_fct_cls = nn.BCEWithLogitsLoss()
|
||||
cls_loss = loss_fct_cls(cls_logits, is_impossible)
|
||||
|
||||
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
|
||||
total_loss += cls_loss * 0.5
|
||||
|
||||
return SquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)
|
||||
|
||||
else:
|
||||
# during inference, compute the end logits based on beam search
|
||||
bsz, slen, hsz = hidden_states.size()
|
||||
start_log_probs = nn.functional.softmax(start_logits, dim=-1) # shape (bsz, slen)
|
||||
|
||||
start_top_log_probs, start_top_index = torch.topk(
|
||||
start_log_probs, self.start_n_top, dim=-1
|
||||
) # shape (bsz, start_n_top)
|
||||
start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
|
||||
start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
|
||||
start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
|
||||
|
||||
hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
|
||||
start_states
|
||||
) # shape (bsz, slen, start_n_top, hsz)
|
||||
p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
|
||||
end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
|
||||
end_log_probs = nn.functional.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
|
||||
|
||||
end_top_log_probs, end_top_index = torch.topk(
|
||||
end_log_probs, self.end_n_top, dim=1
|
||||
) # shape (bsz, end_n_top, start_n_top)
|
||||
end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
|
||||
end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
|
||||
|
||||
start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
|
||||
cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
|
||||
|
||||
if not return_dict:
|
||||
return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
|
||||
else:
|
||||
return SquadHeadOutput(
|
||||
start_top_log_probs=start_top_log_probs,
|
||||
start_top_index=start_top_index,
|
||||
end_top_log_probs=end_top_log_probs,
|
||||
end_top_index=end_top_index,
|
||||
cls_logits=cls_logits,
|
||||
)
|
||||
|
||||
|
||||
class SequenceSummary(nn.Module):
|
||||
r"""
|
||||
Compute a single vector summary of a sequence hidden states.
|
||||
|
||||
Args:
|
||||
config ([`PretrainedConfig`]):
|
||||
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
|
||||
config class of your model for the default values it uses):
|
||||
|
||||
- **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
|
||||
|
||||
- `"last"` -- Take the last token hidden state (like XLNet)
|
||||
- `"first"` -- Take the first token hidden state (like Bert)
|
||||
- `"mean"` -- Take the mean of all tokens hidden states
|
||||
- `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
|
||||
- `"attn"` -- Not implemented now, use multi-head attention
|
||||
|
||||
- **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
|
||||
- **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
|
||||
(otherwise to `config.hidden_size`).
|
||||
- **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
|
||||
another string or `None` will add no activation.
|
||||
- **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
|
||||
- **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
|
||||
"""
|
||||
|
||||
def __init__(self, config: PretrainedConfig):
|
||||
super().__init__()
|
||||
|
||||
self.summary_type = getattr(config, "summary_type", "last")
|
||||
if self.summary_type == "attn":
|
||||
# We should use a standard multi-head attention module with absolute positional embedding for that.
|
||||
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
|
||||
# We can probably just use the multi-head attention module of PyTorch >=1.1.0
|
||||
raise NotImplementedError
|
||||
|
||||
self.summary = Identity()
|
||||
if hasattr(config, "summary_use_proj") and config.summary_use_proj:
|
||||
if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
|
||||
num_classes = config.num_labels
|
||||
else:
|
||||
num_classes = config.hidden_size
|
||||
self.summary = nn.Linear(config.hidden_size, num_classes)
|
||||
|
||||
activation_string = getattr(config, "summary_activation", None)
|
||||
self.activation: Callable = get_activation(activation_string) if activation_string else Identity()
|
||||
|
||||
self.first_dropout = Identity()
|
||||
if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
|
||||
self.first_dropout = nn.Dropout(config.summary_first_dropout)
|
||||
|
||||
self.last_dropout = Identity()
|
||||
if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
|
||||
self.last_dropout = nn.Dropout(config.summary_last_dropout)
|
||||
|
||||
logger.warning_once(
|
||||
"[DEPRECATION WARNING] `SequenceSummary` is deprecated and will be removed in v4.53. "
|
||||
"Please use model-specific class, e.g. `XLMSequenceSummary`."
|
||||
)
|
||||
|
||||
def forward(
|
||||
self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
|
||||
) -> torch.FloatTensor:
|
||||
"""
|
||||
Compute a single vector summary of a sequence hidden states.
|
||||
|
||||
Args:
|
||||
hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
|
||||
The hidden states of the last layer.
|
||||
cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
|
||||
Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`: The summary of the sequence hidden states.
|
||||
"""
|
||||
if self.summary_type == "last":
|
||||
output = hidden_states[:, -1]
|
||||
elif self.summary_type == "first":
|
||||
output = hidden_states[:, 0]
|
||||
elif self.summary_type == "mean":
|
||||
output = hidden_states.mean(dim=1)
|
||||
elif self.summary_type == "cls_index":
|
||||
if cls_index is None:
|
||||
cls_index = torch.full_like(
|
||||
hidden_states[..., :1, :],
|
||||
hidden_states.shape[-2] - 1,
|
||||
dtype=torch.long,
|
||||
)
|
||||
else:
|
||||
cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
|
||||
cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
|
||||
# shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
|
||||
output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
|
||||
elif self.summary_type == "attn":
|
||||
raise NotImplementedError
|
||||
|
||||
output = self.first_dropout(output)
|
||||
output = self.summary(output)
|
||||
output = self.activation(output)
|
||||
output = self.last_dropout(output)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
def unwrap_model(model: nn.Module, recursive: bool = False) -> nn.Module:
|
||||
"""
|
||||
Recursively unwraps a model from potential containers (as used in distributed training).
|
||||
|
@@ -203,7 +203,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
        pieces = self.sp_model.encode(text, out_type=str)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
            if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
                # Logic to handle special cases see https://github.com/google-research/bert/blob/master/README.md#tokenization
                # `9,9` -> ['▁9', ',', '9'] instead of [`_9,`, '9']
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
@@ -206,7 +206,7 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo

    if "speech-commands" in model_name:
        # TODO: Convert dataset to Parquet
        dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True)
        dataset = load_dataset("google/speech_commands", "v0.02", split="validation")
        waveform = dataset[0]["audio"]["array"]
    else:
        filepath = hf_hub_download(
@@ -74,14 +74,14 @@ else:
        ("data2vec-vision", ("BeitImageProcessor", "BeitImageProcessorFast")),
        ("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")),
        ("deit", ("DeiTImageProcessor", "DeiTImageProcessorFast")),
        ("depth_anything", ("DPTImageProcessor",)),
        ("depth_anything", ("DPTImageProcessor", "DPTImageProcessorFast")),
        ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")),
        ("deta", ("DetaImageProcessor",)),
        ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")),
        ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
        ("dinov2", ("BitImageProcessor", "BitImageProcessorFast")),
        ("donut-swin", ("DonutImageProcessor", "DonutImageProcessorFast")),
        ("dpt", ("DPTImageProcessor",)),
        ("dpt", ("DPTImageProcessor", "DPTImageProcessorFast")),
        ("efficientformer", ("EfficientFormerImageProcessor",)),
        ("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
        ("flava", ("FlavaImageProcessor", "FlavaImageProcessorFast")),
@ -245,6 +245,10 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
|
||||
("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
|
||||
("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
|
||||
("granite", ("GPT2Tokenizer", None)),
|
||||
("granitemoe", ("GPT2Tokenizer", None)),
|
||||
("granitemoehybrid", ("GPT2Tokenizer", None)),
|
||||
("granitemoeshared", ("GPT2Tokenizer", None)),
|
||||
("grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("helium", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
|
||||
|
@ -311,7 +311,12 @@ class AutoVideoProcessor:
        if video_processor_class is None and video_processor_auto_map is None:
            image_processor_class = config_dict.pop("image_processor_type", None)
            if image_processor_class is not None:
                video_processor_class = image_processor_class.replace("ImageProcessor", "VideoProcessor")
                video_processor_class_inferred = image_processor_class.replace("ImageProcessor", "VideoProcessor")

                # Some models have different image processors, e.g. InternVL uses GotOCRImageProcessor
                # We cannot use GotOCRVideoProcessor when falling back for BC and should try to infer from config later on
                if video_processor_class_inferred in VIDEO_PROCESSOR_MAPPING_NAMES.values():
                    video_processor_class = video_processor_class_inferred
            if "AutoImageProcessor" in config_dict.get("auto_map", {}):
                image_processor_auto_map = config_dict["auto_map"]["AutoImageProcessor"]
                video_processor_auto_map = image_processor_auto_map.replace("ImageProcessor", "VideoProcessor")
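A rough, self-contained illustration of the fallback above; the class names and the known-classes set are hypothetical stand-ins for `VIDEO_PROCESSOR_MAPPING_NAMES.values()`:

from typing import Optional

KNOWN_VIDEO_PROCESSORS = {"LlavaOnevisionVideoProcessor"}  # hypothetical stand-in

def infer_video_processor(image_processor_class: str) -> Optional[str]:
    # Derive a candidate name by substitution, but only accept it if it is a known class
    inferred = image_processor_class.replace("ImageProcessor", "VideoProcessor")
    return inferred if inferred in KNOWN_VIDEO_PROCESSORS else None

assert infer_video_processor("LlavaOnevisionImageProcessor") == "LlavaOnevisionVideoProcessor"
assert infer_video_processor("GotOCRImageProcessor") is None  # e.g. InternVL: fall back to config-based inference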
@ -830,7 +830,7 @@ class BambaMixer(nn.Module):

            # 2. Compute the state for each intra-chunk
            # (right term of low-rank factorization of off-diagonal blocks; B terms)
            decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum))
            decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
            B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None]
            states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2)

@ -632,7 +632,7 @@ class BambaMixer(nn.Module):

            # 2. Compute the state for each intra-chunk
            # (right term of low-rank factorization of off-diagonal blocks; B terms)
            decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum))
            decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
            B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None]
            states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2)
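The change in both hunks only drops a redundant pair of parentheses; the computation is unchanged. A small shape sketch of that `decay_states` line, with arbitrary sizes:

import torch

batch, heads, chunks, chunk_len = 1, 2, 3, 4
A_cumsum = torch.randn(batch, heads, chunks, chunk_len).cumsum(dim=-1)

# exp of the decay remaining between each position and the end of its chunk
decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
assert decay_states.shape == (batch, heads, chunks, chunk_len)
# at the last position of every chunk nothing is left to decay, so the factor is exactly 1
assert torch.allclose(decay_states[..., -1], torch.ones(batch, heads, chunks))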
@ -32,7 +32,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
|
||||
# See all BART models at https://huggingface.co/models?filter=bart
|
||||
|
||||
|
||||
@lru_cache()
|
||||
@lru_cache
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
|
||||
|
@ -266,7 +266,7 @@ def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path):
|
||||
# Check outputs on an image
|
||||
if is_semantic:
|
||||
image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
|
||||
image = Image.open(ds[0]["file"])
|
||||
else:
|
||||
image_processor = BeitImageProcessor(
|
||||
|
@ -174,11 +174,6 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
|
||||
processed_segmentation_maps = processed_segmentation_maps.to(torch.int64)
|
||||
return processed_segmentation_maps
|
||||
|
||||
def __call__(self, images, segmentation_maps=None, **kwargs):
|
||||
# Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both
|
||||
# be passed in as positional arguments.
|
||||
return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs)
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(
|
||||
self,
|
||||
|
@ -110,7 +110,7 @@ class BeitDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
# Based on timm implementation, which can be found here:
|
||||
@ -513,8 +513,8 @@ class BeitLayer(nn.Module):
|
||||
|
||||
init_values = config.layer_scale_init_value
|
||||
if init_values > 0:
|
||||
self.lambda_1 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
|
||||
self.lambda_2 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
|
||||
self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
|
||||
self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
|
||||
else:
|
||||
self.lambda_1, self.lambda_2 = None, None
|
||||
|
||||
|
@ -934,7 +934,7 @@ class SentencepieceTokenizer:
|
||||
pieces = self.sp_model.encode(text, out_type=str)
|
||||
new_pieces = []
|
||||
for piece in pieces:
|
||||
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
|
||||
if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
|
||||
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
|
||||
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
|
||||
if len(cur_pieces[0]) == 1:
|
||||
|
@ -115,7 +115,7 @@ class Dictionary:
|
||||
except FileNotFoundError as fnfe:
|
||||
raise fnfe
|
||||
except UnicodeError:
|
||||
raise Exception("Incorrect encoding detected in {}, please rebuild the dataset".format(f))
|
||||
raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
|
||||
return
|
||||
|
||||
lines = f.readlines()
|
||||
@ -133,11 +133,11 @@ class Dictionary:
|
||||
word = line
|
||||
if word in self and not overwrite:
|
||||
raise RuntimeError(
|
||||
"Duplicate word found when loading Dictionary: '{}'. "
|
||||
f"Duplicate word found when loading Dictionary: '{word}'. "
|
||||
"Duplicate words can overwrite earlier ones by adding the "
|
||||
"#fairseq:overwrite flag at the end of the corresponding row "
|
||||
"in the dictionary file. If using the Camembert model, please "
|
||||
"download an updated copy of the model file.".format(word)
|
||||
"download an updated copy of the model file."
|
||||
)
|
||||
self.add_symbol(word, n=count, overwrite=overwrite)
|
||||
except ValueError:
|
||||
|
@ -135,7 +135,7 @@ class BitGroupNormActivation(nn.GroupNorm):
|
||||
"""
|
||||
|
||||
def __init__(self, config, num_channels, eps=1e-5, affine=True, apply_activation=True):
|
||||
super(BitGroupNormActivation, self).__init__(config.num_groups, num_channels, eps=eps, affine=affine)
|
||||
super().__init__(config.num_groups, num_channels, eps=eps, affine=affine)
|
||||
if apply_activation:
|
||||
self.activation = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
@ -310,7 +310,7 @@ class BitDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
def make_div(value, divisor=8):
|
||||
|
@ -1183,7 +1183,7 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
|
||||
)
|
||||
return BlenderbotSmallModel.from_pretrained(pretrained_model_name_or_path)
|
||||
|
||||
return super(BlenderbotModel, cls).from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
@ -1344,9 +1344,7 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel, GenerationMi
|
||||
)
|
||||
return BlenderbotSmallForConditionalGeneration.from_pretrained(pretrained_model_name_or_path)
|
||||
|
||||
return super(BlenderbotForConditionalGeneration, cls).from_pretrained(
|
||||
pretrained_model_name_or_path, *model_args, **kwargs
|
||||
)
|
||||
return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
|
||||
def get_encoder(self):
|
||||
return self.model.get_encoder()
|
||||
|
@ -35,7 +35,7 @@ VOCAB_FILES_NAMES = {
|
||||
}
|
||||
|
||||
|
||||
@lru_cache()
|
||||
@lru_cache
|
||||
# Copied from transformers.models.roberta.tokenization_roberta.bytes_to_unicode
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
|
@ -641,9 +641,7 @@ class BlipTextModel(BlipTextPreTrainedModel):
|
||||
extended_attention_mask = attention_mask[:, None, None, :]
|
||||
else:
|
||||
raise ValueError(
|
||||
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
|
||||
input_shape, attention_mask.shape
|
||||
)
|
||||
f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
|
||||
)
|
||||
|
||||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||
@ -723,7 +721,7 @@ class BlipTextModel(BlipTextPreTrainedModel):
|
||||
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
|
||||
|
||||
if attention_mask is None:
|
||||
attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length))).to(device)
|
||||
attention_mask = torch.ones((batch_size, seq_length + past_key_values_length)).to(device)
|
||||
|
||||
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
|
||||
# ourselves in which case we just need to make it broadcastable to all heads.
|
||||
|
@ -800,9 +800,7 @@ class TFBlipTextModel(TFBlipTextPreTrainedModel):
|
||||
extended_attention_mask = attention_mask[:, None, None, :]
|
||||
else:
|
||||
raise ValueError(
|
||||
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
|
||||
input_shape, attention_mask.shape
|
||||
)
|
||||
f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
|
||||
)
|
||||
|
||||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||
@ -881,7 +879,7 @@ class TFBlipTextModel(TFBlipTextPreTrainedModel):
|
||||
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
|
||||
|
||||
if attention_mask is None:
|
||||
attention_mask = tf.ones(((batch_size, seq_length + past_key_values_length)))
|
||||
attention_mask = tf.ones((batch_size, seq_length + past_key_values_length))
|
||||
|
||||
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
|
||||
# ourselves in which case we just need to make it broadcastable to all heads.
|
||||
|
@ -1144,9 +1144,7 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
|
||||
extended_attention_mask = attention_mask[:, None, None, :]
|
||||
else:
|
||||
raise ValueError(
|
||||
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
|
||||
input_shape, attention_mask.shape
|
||||
)
|
||||
f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
|
||||
)
|
||||
|
||||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||
|
@ -98,7 +98,7 @@ def convert_bloom_checkpoint_to_pytorch(
|
||||
config = BloomConfig()
|
||||
|
||||
for j, file in enumerate(file_names):
|
||||
print("Processing file: {}".format(file))
|
||||
print(f"Processing file: {file}")
|
||||
tensors = None
|
||||
|
||||
for i in range(pretraining_tp):
|
||||
@ -132,7 +132,7 @@ def convert_bloom_checkpoint_to_pytorch(
|
||||
tensors,
|
||||
os.path.join(
|
||||
pytorch_dump_folder_path,
|
||||
"pytorch_model_{}-of-{}.bin".format(str(j + 1).zfill(5), str(len(file_names)).zfill(5)),
|
||||
f"pytorch_model_{str(j + 1).zfill(5)}-of-{str(len(file_names)).zfill(5)}.bin",
|
||||
),
|
||||
)
|
||||
|
||||
@ -140,8 +140,8 @@ def convert_bloom_checkpoint_to_pytorch(
|
||||
value = tensors[key]
|
||||
total_size += value.numel() * get_dtype_size(value.dtype)
|
||||
if key not in index_dict["weight_map"]:
|
||||
index_dict["weight_map"][key] = "pytorch_model_{}-of-{}.bin".format(
|
||||
str(j + 1).zfill(5), str(len(file_names)).zfill(5)
|
||||
index_dict["weight_map"][key] = (
|
||||
f"pytorch_model_{str(j + 1).zfill(5)}-of-{str(len(file_names)).zfill(5)}.bin"
|
||||
)
|
||||
|
||||
config = BloomConfig()
|
||||
|
@ -74,7 +74,7 @@ class BrosPositionalEmbedding1D(nn.Module):
|
||||
# Reference: https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py#L15
|
||||
|
||||
def __init__(self, config):
|
||||
super(BrosPositionalEmbedding1D, self).__init__()
|
||||
super().__init__()
|
||||
|
||||
self.dim_bbox_sinusoid_emb_1d = config.dim_bbox_sinusoid_emb_1d
|
||||
|
||||
@ -93,7 +93,7 @@ class BrosPositionalEmbedding1D(nn.Module):
|
||||
|
||||
class BrosPositionalEmbedding2D(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(BrosPositionalEmbedding2D, self).__init__()
|
||||
super().__init__()
|
||||
|
||||
self.dim_bbox = config.dim_bbox
|
||||
self.x_pos_emb = BrosPositionalEmbedding1D(config)
|
||||
@ -112,7 +112,7 @@ class BrosPositionalEmbedding2D(nn.Module):
|
||||
|
||||
class BrosBboxEmbeddings(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(BrosBboxEmbeddings, self).__init__()
|
||||
super().__init__()
|
||||
self.bbox_sinusoid_emb = BrosPositionalEmbedding2D(config)
|
||||
self.bbox_projection = nn.Linear(config.dim_bbox_sinusoid_emb_2d, config.dim_bbox_projection, bias=False)
|
||||
|
||||
|
@ -100,7 +100,7 @@ class ChameleonRotaryEmbedding(nn.Module):
|
||||
# Force float32 since bfloat16 loses precision on long contexts
|
||||
# See https://github.com/huggingface/transformers/pull/29285
|
||||
device_type = x.device.type
|
||||
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
|
||||
device_type = device_type if device_type != "mps" else "cpu"
|
||||
with torch.autocast(device_type=device_type, enabled=False):
|
||||
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
|
@ -610,7 +610,7 @@ class ClapAudioLayer(nn.Module):
|
||||
mask_windows = window_partition(img_mask, self.window_size)
|
||||
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
|
||||
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
||||
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
|
||||
attn_mask = attn_mask.masked_fill(attn_mask != 0, -100.0).masked_fill(attn_mask == 0, 0.0)
|
||||
else:
|
||||
attn_mask = None
|
||||
return attn_mask
|
||||
|
@ -34,7 +34,7 @@ VOCAB_FILES_NAMES = {
|
||||
}
|
||||
|
||||
|
||||
@lru_cache()
|
||||
@lru_cache
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
|
||||
@ -488,7 +488,7 @@ class CLIPTokenizer(PreTrainedTokenizer):
|
||||
|
||||
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
|
||||
if not os.path.isdir(save_directory):
|
||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
|
||||
return
|
||||
vocab_file = os.path.join(
|
||||
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
|
||||
@ -506,8 +506,8 @@ class CLIPTokenizer(PreTrainedTokenizer):
|
||||
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
|
||||
if index != token_index:
|
||||
logger.warning(
|
||||
"Saving vocabulary to {}: BPE merge indices are not consecutive."
|
||||
" Please check that the tokenizer is not corrupted!".format(merge_file)
|
||||
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
|
||||
" Please check that the tokenizer is not corrupted!"
|
||||
)
|
||||
index = token_index
|
||||
writer.write(" ".join(bpe_tokens) + "\n")
|
||||
|
@ -181,7 +181,7 @@ def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_
|
||||
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
|
||||
|
||||
if missing_keys != ["clip.text_model.embeddings.position_ids", "clip.vision_model.embeddings.position_ids"]:
|
||||
raise ValueError("Missing keys that are not expected: {}".format(missing_keys))
|
||||
raise ValueError(f"Missing keys that are not expected: {missing_keys}")
|
||||
if unexpected_keys != ["decoder.reduce.weight", "decoder.reduce.bias"]:
|
||||
raise ValueError(f"Unexpected keys: {unexpected_keys}")
|
||||
|
||||
|
@ -15,7 +15,14 @@

"""English Normalizer class for CLVP."""

import re
import sys


if sys.version_info >= (3, 11):
    # Atomic grouping support was only added to the core RE in Python 3.11
    import re
else:
    import regex as re


class EnglishNormalizer:

@ -199,12 +206,12 @@ class EnglishNormalizer:
        This method is used to normalize numbers within a text such as converting the numbers to words, removing
        commas, etc.
        """
        text = re.sub(re.compile(r"([0-9][0-9\,]+[0-9])"), self._remove_commas, text)
        text = re.sub(re.compile(r"£([0-9\,]*[0-9]+)"), r"\1 pounds", text)
        text = re.sub(re.compile(r"\$([0-9\.\,]*[0-9]+)"), self._expand_dollars, text)
        text = re.sub(re.compile(r"([0-9]+\.[0-9]+)"), self._expand_decimal_point, text)
        text = re.sub(re.compile(r"[0-9]+(st|nd|rd|th)"), self._expand_ordinal, text)
        text = re.sub(re.compile(r"[0-9]+"), self._expand_number, text)
        text = re.sub(r"([0-9][0-9,]+[0-9])", self._remove_commas, text)
        text = re.sub(r"£([0-9,]*[0-9])", r"\1 pounds", text)
        text = re.sub(r"\$([0-9.,]*[0-9])", self._expand_dollars, text)
        text = re.sub(r"([0-9]++\.[0-9]+)", self._expand_decimal_point, text)
        text = re.sub(r"[0-9]++(st|nd|rd|th)", self._expand_ordinal, text)
        text = re.sub(r"[0-9]+", self._expand_number, text)
        return text

    def expand_abbreviations(self, text: str) -> str:
@ -34,7 +34,7 @@ VOCAB_FILES_NAMES = {
|
||||
}
|
||||
|
||||
|
||||
@lru_cache()
|
||||
@lru_cache
|
||||
# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
|
@ -42,7 +42,7 @@ VOCAB_FILES_NAMES = {
|
||||
}
|
||||
|
||||
|
||||
@lru_cache()
|
||||
@lru_cache
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
|
||||
|
@ -70,7 +70,7 @@ class ConvNextDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
class ConvNextLayerNorm(nn.Module):
|
||||
@ -149,7 +149,7 @@ class ConvNextLayer(nn.Module):
|
||||
self.act = ACT2FN[config.hidden_act]
|
||||
self.pwconv2 = nn.Linear(4 * dim, dim)
|
||||
self.layer_scale_parameter = (
|
||||
nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True)
|
||||
nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
|
||||
if config.layer_scale_init_value > 0
|
||||
else None
|
||||
)
|
||||
|
@ -70,7 +70,7 @@ class ConvNextV2DropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
class ConvNextV2GRN(nn.Module):
|
||||
|
@ -207,7 +207,7 @@ class CpmTokenizer(PreTrainedTokenizer):
|
||||
pieces = self.sp_model.encode(text, out_type=str)
|
||||
new_pieces = []
|
||||
for piece in pieces:
|
||||
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
|
||||
if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
|
||||
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
|
||||
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
|
||||
if len(cur_pieces[0]) == 1:
|
||||
|
@ -86,7 +86,7 @@ class CvtDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
class CvtEmbeddings(nn.Module):
|
||||
|
@ -187,7 +187,7 @@ class DFineMultiscaleDeformableAttention(nn.Module):
|
||||
sampling_locations = reference_points[:, :, None, :, :2] + offset
|
||||
else:
|
||||
raise ValueError(
|
||||
"Last dim of reference_points must be 2 or 4, but get {} instead.".format(reference_points.shape[-1])
|
||||
f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead."
|
||||
)
|
||||
|
||||
output = self.ms_deformable_attn_core(
|
||||
|
@ -517,7 +517,7 @@ class DFineMultiscaleDeformableAttention(nn.Module):
|
||||
sampling_locations = reference_points[:, :, None, :, :2] + offset
|
||||
else:
|
||||
raise ValueError(
|
||||
"Last dim of reference_points must be 2 or 4, but get {} instead.".format(reference_points.shape[-1])
|
||||
f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead."
|
||||
)
|
||||
|
||||
output = self.ms_deformable_attn_core(
|
||||
|
@ -384,7 +384,7 @@ def gen_sine_position_embeddings(pos_tensor, hidden_size=256):
|
||||
|
||||
pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
|
||||
else:
|
||||
raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
|
||||
raise ValueError(f"Unknown pos_tensor shape(-1):{pos_tensor.size(-1)}")
|
||||
return pos
|
||||
|
||||
|
||||
@ -1254,7 +1254,7 @@ class DabDetrModel(DabDetrPreTrainedModel):
|
||||
|
||||
self.num_patterns = config.num_patterns
|
||||
if not isinstance(self.num_patterns, int):
|
||||
logger.warning("num_patterns should be int but {}".format(type(self.num_patterns)))
|
||||
logger.warning(f"num_patterns should be int but {type(self.num_patterns)}")
|
||||
self.num_patterns = 0
|
||||
if self.num_patterns > 0:
|
||||
self.patterns = nn.Embedding(self.num_patterns, self.hidden_size)
|
||||
|
@ -157,24 +157,12 @@ def recursively_load_weights(orig_dict, hf_model, model_name):
|
||||
elif len(mapped_key) == 3:
|
||||
integers = re.findall(r"\b\d+\b", name)
|
||||
if mapped_key[0][0] == "d":
|
||||
mapped_key = "{}.{}.{}{}.{}".format(
|
||||
mapped_key[0],
|
||||
str(int(integers[0]) - 1),
|
||||
mapped_key[1],
|
||||
str(int(integers[1]) - 1),
|
||||
mapped_key[2],
|
||||
)
|
||||
mapped_key = f"{mapped_key[0]}.{str(int(integers[0]) - 1)}.{mapped_key[1]}{str(int(integers[1]) - 1)}.{mapped_key[2]}"
|
||||
else:
|
||||
mapped_key = "{}.{}.{}{}.{}".format(
|
||||
mapped_key[0],
|
||||
str(int(integers[0]) - 1),
|
||||
mapped_key[1],
|
||||
str(int(integers[1]) + 1),
|
||||
mapped_key[2],
|
||||
)
|
||||
mapped_key = f"{mapped_key[0]}.{str(int(integers[0]) - 1)}.{mapped_key[1]}{str(int(integers[1]) + 1)}.{mapped_key[2]}"
|
||||
elif len(mapped_key) == 2:
|
||||
integers = re.findall(r"\b\d+\b", name)
|
||||
mapped_key = "{}.{}.{}".format(mapped_key[0], str(int(integers[0]) - 1), mapped_key[1])
|
||||
mapped_key = f"{mapped_key[0]}.{str(int(integers[0]) - 1)}.{mapped_key[1]}"
|
||||
|
||||
is_used = True
|
||||
if "weight_g" in name:
|
||||
|
@ -226,7 +226,7 @@ def convert_wav2vec2_checkpoint(
|
||||
|
||||
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
|
||||
|
||||
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
input_audio = [x["array"] for x in ds[:4]["audio"]]
|
||||
|
||||
inputs = processor(input_audio, return_tensors="pt", padding=True)
|
||||
|
@ -185,18 +185,12 @@ def load_beit_model(args, is_finetuned, is_large):
|
||||
missing_keys = warn_missing_keys
|
||||
|
||||
if len(missing_keys) > 0:
|
||||
print(
|
||||
"Weights of {} not initialized from pretrained model: {}".format(
|
||||
model.__class__.__name__, missing_keys
|
||||
)
|
||||
)
|
||||
print(f"Weights of {model.__class__.__name__} not initialized from pretrained model: {missing_keys}")
|
||||
if len(unexpected_keys) > 0:
|
||||
print("Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys))
|
||||
print(f"Weights from pretrained model not used in {model.__class__.__name__}: {unexpected_keys}")
|
||||
if len(ignore_missing_keys) > 0:
|
||||
print(
|
||||
"Ignored weights of {} not initialized from pretrained model: {}".format(
|
||||
model.__class__.__name__, ignore_missing_keys
|
||||
)
|
||||
f"Ignored weights of {model.__class__.__name__} not initialized from pretrained model: {ignore_missing_keys}"
|
||||
)
|
||||
if len(error_msgs) > 0:
|
||||
print("\n".join(error_msgs))
|
||||
|
@ -1229,7 +1229,7 @@ class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel):
|
||||
|
||||
class AMSoftmaxLoss(nn.Module):
|
||||
def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
|
||||
super(AMSoftmaxLoss, self).__init__()
|
||||
super().__init__()
|
||||
self.scale = scale
|
||||
self.margin = margin
|
||||
self.num_labels = num_labels
|
||||
|
@ -101,7 +101,7 @@ class Data2VecVisionDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
# Copied from transformers.models.beit.modeling_beit.BeitEmbeddings with Beit->Data2VecVision
|
||||
@ -515,8 +515,8 @@ class Data2VecVisionLayer(nn.Module):
|
||||
|
||||
init_values = config.layer_scale_init_value
|
||||
if init_values > 0:
|
||||
self.lambda_1 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
|
||||
self.lambda_2 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
|
||||
self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
|
||||
self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
|
||||
else:
|
||||
self.lambda_1, self.lambda_2 = None, None
|
||||
|
||||
|
@ -306,7 +306,7 @@ class TFData2VecVisionSelfAttention(keras.layers.Layer):
|
||||
hidden_states: tf.Tensor,
|
||||
head_mask: tf.Tensor,
|
||||
output_attentions: bool,
|
||||
relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
|
||||
relative_position_bias: Optional[TFData2VecVisionRelativePositionBias] = None,
|
||||
training: bool = False,
|
||||
) -> tuple[tf.Tensor]:
|
||||
batch_size = shape_list(hidden_states)[0]
|
||||
@ -416,7 +416,7 @@ class TFData2VecVisionAttention(keras.layers.Layer):
|
||||
input_tensor: tf.Tensor,
|
||||
head_mask: tf.Tensor,
|
||||
output_attentions: bool,
|
||||
relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
|
||||
relative_position_bias: Optional[TFData2VecVisionRelativePositionBias] = None,
|
||||
training: bool = False,
|
||||
) -> tuple[tf.Tensor]:
|
||||
self_outputs = self.attention(
|
||||
@ -538,8 +538,8 @@ class TFData2VecVisionLayer(keras.layers.Layer):
|
||||
trainable=True,
|
||||
name="lambda_2",
|
||||
)
|
||||
self.lambda_1.assign(self.init_values * tf.ones((self.config.hidden_size)))
|
||||
self.lambda_2.assign(self.init_values * tf.ones((self.config.hidden_size)))
|
||||
self.lambda_1.assign(self.init_values * tf.ones(self.config.hidden_size))
|
||||
self.lambda_2.assign(self.init_values * tf.ones(self.config.hidden_size))
|
||||
else:
|
||||
self.lambda_1, self.lambda_2 = None, None
|
||||
|
||||
@ -570,7 +570,7 @@ class TFData2VecVisionLayer(keras.layers.Layer):
|
||||
hidden_states: tf.Tensor,
|
||||
head_mask: tf.Tensor,
|
||||
output_attentions: bool,
|
||||
relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
|
||||
relative_position_bias: Optional[TFData2VecVisionRelativePositionBias] = None,
|
||||
training: bool = False,
|
||||
) -> tuple[tf.Tensor]:
|
||||
self_attention_outputs = self.attention(
|
||||
|
@ -64,7 +64,7 @@ class DbrxRotaryEmbedding(nn.Module):
|
||||
# Force float32 since bfloat16 loses precision on long contexts
|
||||
# See https://github.com/huggingface/transformers/pull/29285
|
||||
device_type = x.device.type
|
||||
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
|
||||
device_type = device_type if device_type != "mps" else "cpu"
|
||||
with torch.autocast(device_type=device_type, enabled=False):
|
||||
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
@ -387,9 +387,14 @@ class DbrxFlashAttention2(DbrxAttention):
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (LlamaRMSNorm handles it correctly)
        input_dtype = query_states.dtype
        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
                target_dtype = (
                    torch.get_autocast_dtype(device_type)
                    if hasattr(torch, "get_autocast_dtype")
                    else torch.get_autocast_gpu_dtype()
                )
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
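Hunks of this kind (here and in the DiffLlama and DistilBert attention classes below) swap the CUDA-only `torch.get_autocast_gpu_dtype()` for the device-aware `torch.get_autocast_dtype(device_type)` when it exists. A hedged sketch of the resolution pattern, outside of any model code:

import torch

device_type = "cuda" if torch.cuda.is_available() else "cpu"
target_dtype = (
    torch.get_autocast_dtype(device_type)  # newer PyTorch: per-device autocast dtype
    if hasattr(torch, "get_autocast_dtype")
    else torch.get_autocast_gpu_dtype()    # older PyTorch: CUDA-specific getter
)
print(target_dtype)  # e.g. torch.float16 or torch.bfloat16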
@ -219,7 +219,7 @@ class DecisionTransformerGPT2Attention(nn.Module):
|
||||
scale_factor /= float(self.layer_idx + 1)
|
||||
|
||||
# Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
|
||||
with torch.amp.autocast(query.device.type, enabled=False):
|
||||
with torch.autocast(query.device.type, enabled=False):
|
||||
q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
|
||||
attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
|
||||
attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
|
||||
|
@ -113,7 +113,7 @@ class DeepseekV3TopkRouter(nn.Module):
|
||||
self.norm_topk_prob = config.norm_topk_prob
|
||||
|
||||
self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))
|
||||
self.register_buffer("e_score_correction_bias", torch.zeros((self.n_routed_experts)))
|
||||
self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts))
|
||||
|
||||
@torch.no_grad()
|
||||
def get_topk_indices(self, scores):
|
||||
|
@ -110,7 +110,7 @@ class DeepseekV3TopkRouter(nn.Module):
|
||||
self.norm_topk_prob = config.norm_topk_prob
|
||||
|
||||
self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))
|
||||
self.register_buffer("e_score_correction_bias", torch.zeros((self.n_routed_experts)))
|
||||
self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts))
|
||||
|
||||
@torch.no_grad()
|
||||
def get_topk_indices(self, scores):
|
||||
|
@ -270,7 +270,7 @@ class EfficientFormerDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
class EfficientFormerFlat(nn.Module):
|
||||
@ -303,8 +303,8 @@ class EfficientFormerMeta3D(nn.Module):
|
||||
self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
||||
self.use_layer_scale = config.use_layer_scale
|
||||
if config.use_layer_scale:
|
||||
self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True)
|
||||
self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True)
|
||||
self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
|
||||
self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
|
||||
self_attention_outputs = self.token_mixer(self.layernorm1(hidden_states), output_attentions)
|
||||
@ -370,8 +370,8 @@ class EfficientFormerMeta4D(nn.Module):
|
||||
self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
||||
self.use_layer_scale = config.use_layer_scale
|
||||
if config.use_layer_scale:
|
||||
self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True)
|
||||
self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True)
|
||||
self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
|
||||
self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
|
||||
outputs = self.token_mixer(hidden_states)
|
||||
|
@ -484,7 +484,7 @@ ERNIE_M_INPUTS_DOCSTRING = r"""
|
||||
)
|
||||
class ErnieMModel(ErnieMPreTrainedModel):
|
||||
def __init__(self, config, add_pooling_layer=True):
|
||||
super(ErnieMModel, self).__init__(config)
|
||||
super().__init__(config)
|
||||
self.initializer_range = config.initializer_range
|
||||
self.embeddings = ErnieMEmbeddings(config)
|
||||
self.encoder = ErnieMEncoder(config)
|
||||
@ -964,7 +964,7 @@ class ErnieMForQuestionAnswering(ErnieMPreTrainedModel):
|
||||
)
|
||||
class ErnieMForInformationExtraction(ErnieMPreTrainedModel):
|
||||
def __init__(self, config):
|
||||
super(ErnieMForInformationExtraction, self).__init__(config)
|
||||
super().__init__(config)
|
||||
self.ernie_m = ErnieMModel(config)
|
||||
self.linear_start = nn.Linear(config.hidden_size, 1)
|
||||
self.linear_end = nn.Linear(config.hidden_size, 1)
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for Ernie-M."""
|
||||
|
||||
import io
|
||||
import os
|
||||
import unicodedata
|
||||
from typing import Any, Optional
|
||||
@ -172,7 +171,7 @@ class ErnieMTokenizer(PreTrainedTokenizer):
|
||||
|
||||
def clean_text(self, text):
|
||||
"""Performs invalid character removal and whitespace cleanup on text."""
|
||||
return "".join((self.SP_CHAR_MAPPING.get(c, c) for c in text))
|
||||
return "".join(self.SP_CHAR_MAPPING.get(c, c) for c in text)
|
||||
|
||||
def _tokenize(self, text, enable_sampling=False, nbest_size=64, alpha=0.1):
|
||||
"""Tokenize a string."""
|
||||
@ -373,7 +372,7 @@ class ErnieMTokenizer(PreTrainedTokenizer):
|
||||
|
||||
def load_vocab(self, filepath):
|
||||
token_to_idx = {}
|
||||
with io.open(filepath, "r", encoding="utf-8") as f:
|
||||
with open(filepath, "r", encoding="utf-8") as f:
|
||||
for index, line in enumerate(f):
|
||||
token = line.rstrip("\n")
|
||||
token_to_idx[token] = int(index)
|
||||
|
@ -118,7 +118,7 @@ class MegaSimpleRelativePositionalBias(nn.Module):
|
||||
|
||||
def forward(self, seq_len):
|
||||
if seq_len > self.max_positions:
|
||||
raise ValueError("Sequence length {} going beyond max length {}".format(seq_len, self.max_positions))
|
||||
raise ValueError(f"Sequence length {seq_len} going beyond max length {self.max_positions}")
|
||||
|
||||
# seq_len * 2 - 1
|
||||
bias = self.rel_pos_bias[(self.max_positions - seq_len) : (self.max_positions + seq_len - 1)]
|
||||
@ -298,7 +298,7 @@ class MegaSequenceNorm(nn.Module):
|
||||
elif norm_type == "syncbatchnorm":
|
||||
self.norm = nn.SyncBatchNorm(embedding_dim, eps=eps, affine=affine)
|
||||
else:
|
||||
raise ValueError("Unknown norm type: {}".format(norm_type))
|
||||
raise ValueError(f"Unknown norm type: {norm_type}")
|
||||
|
||||
def forward(self, input):
|
||||
if isinstance(self.norm, nn.modules.batchnorm._BatchNorm):
|
||||
@ -563,7 +563,7 @@ class MegaGatedCrossAttention(nn.Module):
|
||||
elif self.config.relative_positional_bias == "rotary":
|
||||
self.rel_pos_bias = MegaRotaryRelativePositionalBias(config)
|
||||
else:
|
||||
raise ValueError("unknown relative position bias: {}".format(self.config.relative_positional_bias))
|
||||
raise ValueError(f"unknown relative position bias: {self.config.relative_positional_bias}")
|
||||
|
||||
self.softmax = nn.Softmax(dim=-1)
|
||||
|
||||
|
@ -287,7 +287,7 @@ class NatDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
class NeighborhoodAttention(nn.Module):
|
||||
|
@ -99,7 +99,7 @@ TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
|
||||
"""
|
||||
|
||||
|
||||
@lru_cache()
|
||||
@lru_cache
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
|
||||
|
@ -79,7 +79,7 @@ class VanDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
class VanOverlappingPatchEmbedder(nn.Module):
|
||||
@ -204,7 +204,7 @@ class VanLayerScaling(nn.Module):
|
||||
|
||||
def __init__(self, hidden_size: int, initial_value: float = 1e-2):
|
||||
super().__init__()
|
||||
self.weight = nn.Parameter(initial_value * torch.ones((hidden_size)), requires_grad=True)
|
||||
self.weight = nn.Parameter(initial_value * torch.ones(hidden_size), requires_grad=True)
|
||||
|
||||
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
|
||||
# unsqueezing for broadcasting
|
||||
|
@ -306,9 +306,14 @@ class DiffLlamaFlashAttention2(DiffLlamaAttention):
|
||||
# in fp32. (DiffLlamaRMSNorm handles it correctly)
|
||||
|
||||
input_dtype = query_states.dtype
|
||||
device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
|
||||
if input_dtype == torch.float32:
|
||||
if torch.is_autocast_enabled():
|
||||
target_dtype = torch.get_autocast_gpu_dtype()
|
||||
target_dtype = (
|
||||
torch.get_autocast_dtype(device_type)
|
||||
if hasattr(torch, "get_autocast_dtype")
|
||||
else torch.get_autocast_gpu_dtype()
|
||||
)
|
||||
# Handle the case where the model is quantized
|
||||
elif hasattr(self.config, "_pre_quantization_dtype"):
|
||||
target_dtype = self.config._pre_quantization_dtype
|
||||
|
@ -239,9 +239,14 @@ class DiffLlamaFlashAttention2(DiffLlamaAttention):
|
||||
# in fp32. (DiffLlamaRMSNorm handles it correctly)
|
||||
|
||||
input_dtype = query_states.dtype
|
||||
device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
|
||||
if input_dtype == torch.float32:
|
||||
if torch.is_autocast_enabled():
|
||||
target_dtype = torch.get_autocast_gpu_dtype()
|
||||
target_dtype = (
|
||||
torch.get_autocast_dtype(device_type)
|
||||
if hasattr(torch, "get_autocast_dtype")
|
||||
else torch.get_autocast_gpu_dtype()
|
||||
)
|
||||
# Handle the case where the model is quantized
|
||||
elif hasattr(self.config, "_pre_quantization_dtype"):
|
||||
target_dtype = self.config._pre_quantization_dtype
|
||||
|
@ -275,7 +275,7 @@ class DinatDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
class NeighborhoodAttention(nn.Module):
|
||||
|
@ -343,7 +343,7 @@ class Dinov2DropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
class Dinov2MLP(nn.Module):
|
||||
|
@ -360,7 +360,7 @@ class Dinov2WithRegistersDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
class Dinov2WithRegistersMLP(nn.Module):
|
||||
|
@ -289,9 +289,14 @@ class DistilBertFlashAttention2(MultiHeadSelfAttention):
|
||||
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
|
||||
# in fp32. (LlamaRMSNorm handles it correctly)
|
||||
|
||||
device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
|
||||
if query_states.dtype == torch.float32:
|
||||
if torch.is_autocast_enabled():
|
||||
target_dtype = torch.get_autocast_gpu_dtype()
|
||||
target_dtype = (
|
||||
torch.get_autocast_dtype(device_type)
|
||||
if hasattr(torch, "get_autocast_dtype")
|
||||
else torch.get_autocast_gpu_dtype()
|
||||
)
|
||||
# Handle the case where the model is quantized
|
||||
elif hasattr(self.config, "_pre_quantization_dtype"):
|
||||
target_dtype = self.config._pre_quantization_dtype
|
||||
|
@ -393,7 +393,7 @@ class DonutSwinDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->DonutSwin
|
||||
@ -625,7 +625,7 @@ class DonutSwinLayer(nn.Module):
|
||||
mask_windows = window_partition(img_mask, self.window_size)
|
||||
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
|
||||
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
||||
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
|
||||
attn_mask = attn_mask.masked_fill(attn_mask != 0, -100.0).masked_fill(attn_mask == 0, 0.0)
|
||||
else:
|
||||
attn_mask = None
|
||||
return attn_mask
|
||||
|
@ -21,6 +21,7 @@ if TYPE_CHECKING:
|
||||
from .configuration_dpt import *
|
||||
from .feature_extraction_dpt import *
|
||||
from .image_processing_dpt import *
|
||||
from .image_processing_dpt_fast import *
|
||||
from .modeling_dpt import *
|
||||
else:
|
||||
import sys
|
||||
|
474
src/transformers/models/dpt/image_processing_dpt_fast.py
Normal file
@ -0,0 +1,474 @@
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# This file was automatically generated from src/transformers/models/dpt/modular_dpt.py.
|
||||
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_dpt.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from collections.abc import Iterable
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
from transformers.image_processing_base import BatchFeature
|
||||
from transformers.image_transforms import group_images_by_shape, reorder_images
|
||||
|
||||
from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs
|
||||
from ...image_utils import (
|
||||
IMAGENET_STANDARD_MEAN,
|
||||
IMAGENET_STANDARD_STD,
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
SizeDict,
|
||||
is_torch_tensor,
|
||||
make_list_of_images,
|
||||
pil_torch_interpolation_mapping,
|
||||
validate_kwargs,
|
||||
)
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
is_torch_available,
|
||||
is_torchvision_available,
|
||||
is_torchvision_v2_available,
|
||||
requires_backends,
|
||||
)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ...modeling_outputs import DepthEstimatorOutput
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_torchvision_v2_available():
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
elif is_torchvision_available():
|
||||
from torchvision.transforms import functional as F
|
||||
|
||||
|
||||
class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
ensure_multiple_of (`int`, *optional*, defaults to 1):
|
||||
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
|
||||
by `ensure_multiple_of` in `preprocess`.
|
||||
do_pad (`bool`, *optional*, defaults to `False`):
|
||||
Whether to apply center padding. This was introduced in the DINOv2 paper, which uses the model in
|
||||
combination with DPT.
|
||||
size_divisor (`int`, *optional*):
|
||||
If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
|
||||
DINOv2 paper, which uses the model in combination with DPT.
|
||||
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
|
||||
be overridden by `keep_aspect_ratio` in `preprocess`.
|
||||
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
||||
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
||||
is used for background, and background itself is not included in all classes of a dataset (e.g.
|
||||
ADE20k). The background label will be replaced by 255.
|
||||
"""
|
||||
|
||||
ensure_multiple_of: Optional[int]
|
||||
size_divisor: Optional[int]
|
||||
do_pad: Optional[bool]
|
||||
keep_aspect_ratio: Optional[bool]
|
||||
do_reduce_labels: Optional[bool]
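A hypothetical instantiation showing how the kwargs documented above combine; the values are illustrative and not taken from any released checkpoint:

processor = DPTImageProcessorFast(
    size={"height": 384, "width": 384},
    keep_aspect_ratio=True,   # preserve the aspect ratio, scaling as little as possible
    ensure_multiple_of=32,    # snap both output sides to a multiple of 32
    do_pad=True,
    size_divisor=32,          # pad height and width up to a multiple of 32 (DINOv2-style)
    do_reduce_labels=False,
)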
def get_resize_output_image_size(
|
||||
input_image: "torch.Tensor",
|
||||
output_size: Union[int, Iterable[int]],
|
||||
keep_aspect_ratio: bool,
|
||||
multiple: int,
|
||||
) -> SizeDict:
|
||||
def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
|
||||
x = round(val / multiple) * multiple
|
||||
|
||||
if max_val is not None and x > max_val:
|
||||
x = math.floor(val / multiple) * multiple
|
||||
|
||||
if x < min_val:
|
||||
x = math.ceil(val / multiple) * multiple
|
||||
|
||||
return x
|
||||
|
||||
input_height, input_width = input_image.shape[-2:]
|
||||
output_height, output_width = output_size
|
||||
|
||||
# determine new height and width
|
||||
scale_height = output_height / input_height
|
||||
scale_width = output_width / input_width
|
||||
|
||||
if keep_aspect_ratio:
|
||||
# scale as little as possible
|
||||
if abs(1 - scale_width) < abs(1 - scale_height):
|
||||
# fit width
|
||||
scale_height = scale_width
|
||||
else:
|
||||
# fit height
|
||||
scale_width = scale_height
|
||||
|
||||
new_height = constrain_to_multiple_of(scale_height * input_height, multiple=multiple)
|
||||
new_width = constrain_to_multiple_of(scale_width * input_width, multiple=multiple)
|
||||
|
||||
return SizeDict(height=new_height, width=new_width)
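A worked example of the helper above with hypothetical sizes: a 3x480x640 image targeted at 384x384 with `keep_aspect_ratio=True` and the multiple set to 32:

import torch

image = torch.zeros(3, 480, 640)
size = get_resize_output_image_size(image, output_size=(384, 384), keep_aspect_ratio=True, multiple=32)
# scale_height = 384/480 = 0.8, scale_width = 384/640 = 0.6; the height scale is closer to 1,
# so both sides are scaled by 0.8 and then snapped to a multiple of 32
assert (size.height, size.width) == (384, 512)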
@auto_docstring
|
||||
class DPTImageProcessorFast(BaseImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
image_mean = IMAGENET_STANDARD_MEAN
|
||||
image_std = IMAGENET_STANDARD_STD
|
||||
size = {"height": 384, "width": 384}
|
||||
default_to_square = True
|
||||
crop_size = None
|
||||
do_resize = True
|
||||
do_center_crop = None
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
do_reduce_labels = None
|
||||
|
||||
valid_kwargs = DPTFastImageProcessorKwargs
|
||||
do_pad = False
|
||||
rescale_factor = 1 / 255
|
||||
ensure_multiple_of = 1
|
||||
keep_aspect_ratio = False
|
||||
|
||||
def __init__(self, **kwargs: Unpack[DPTFastImageProcessorKwargs]):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def reduce_label(self, labels: list["torch.Tensor"]):
|
||||
for idx in range(len(labels)):
|
||||
label = labels[idx]
|
||||
label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype), label)
|
||||
label = label - 1
|
||||
label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype), label)
|
||||
labels[idx] = label
|
||||
|
||||
return labels  # entries were modified in place; return the full list rather than the last label
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
images: list["torch.Tensor"],
|
||||
do_reduce_labels: bool,
|
||||
do_resize: bool,
|
||||
size: SizeDict,
|
||||
interpolation: Optional["F.InterpolationMode"],
|
||||
do_center_crop: bool,
|
||||
crop_size: SizeDict,
|
||||
do_rescale: bool,
|
||||
rescale_factor: float,
|
||||
do_normalize: bool,
|
||||
image_mean: Optional[Union[float, list[float]]],
|
||||
image_std: Optional[Union[float, list[float]]],
|
||||
return_tensors: Optional[Union[str, TensorType]],
|
||||
keep_aspect_ratio: bool,
|
||||
ensure_multiple_of: Optional[int],
|
||||
do_pad: bool,
|
||||
size_divisor: Optional[int],
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
if do_reduce_labels:
|
||||
images = self.reduce_label(images)
|
||||
|
||||
# Group images by size for batched resizing
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images)
|
||||
resized_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_resize:
|
||||
stacked_images = self.resize(
|
||||
image=stacked_images,
|
||||
size=size,
|
||||
interpolation=interpolation,
|
||||
ensure_multiple_of=ensure_multiple_of,
|
||||
keep_aspect_ratio=keep_aspect_ratio,
|
||||
)
|
||||
resized_images_grouped[shape] = stacked_images
|
||||
resized_images = reorder_images(resized_images_grouped, grouped_images_index)
|
||||
|
||||
# Group images by size for further processing
|
||||
# Needed in case do_resize is False, or resize returns images with different sizes
|
||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images)
|
||||
processed_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_center_crop:
|
||||
stacked_images = self.center_crop(stacked_images, crop_size)
|
||||
if do_pad:
|
||||
stacked_images = self.pad_image(stacked_images, size_divisor)
|
||||
# Fused rescale and normalize
|
||||
stacked_images = self.rescale_and_normalize(
|
||||
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
||||
)
|
||||
processed_images_grouped[shape] = stacked_images
|
||||
|
||||
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
|
||||
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
|
||||
return processed_images
|
||||
|
||||
def _preprocess_images(
|
||||
self,
|
||||
images,
|
||||
**kwargs,
|
||||
):
|
||||
"""Preprocesses images."""
|
||||
kwargs["do_reduce_labels"] = False
|
||||
processed_images = self._preprocess(images=images, **kwargs)
|
||||
return processed_images
|
||||
|
||||
def _preprocess_segmentation_maps(
|
||||
self,
|
||||
segmentation_maps,
|
||||
**kwargs,
|
||||
):
|
||||
"""Preprocesses segmentation maps."""
|
||||
processed_segmentation_maps = []
|
||||
for segmentation_map in segmentation_maps:
|
||||
segmentation_map = self._process_image(
|
||||
segmentation_map, do_convert_rgb=False, input_data_format=ChannelDimension.FIRST
|
||||
)
|
||||
|
||||
if segmentation_map.ndim == 2:
|
||||
segmentation_map = segmentation_map[None, ...]
|
||||
|
||||
processed_segmentation_maps.append(segmentation_map)
|
||||
|
||||
kwargs["do_normalize"] = False
|
||||
kwargs["do_rescale"] = False
|
||||
kwargs["input_data_format"] = ChannelDimension.FIRST
|
||||
processed_segmentation_maps = self._preprocess(images=processed_segmentation_maps, **kwargs)
|
||||
|
||||
processed_segmentation_maps = processed_segmentation_maps.squeeze(1)
|
||||
|
||||
processed_segmentation_maps = processed_segmentation_maps.to(torch.int64)
|
||||
return processed_segmentation_maps
|
||||
|
||||
@auto_docstring
|
||||
def preprocess(
|
||||
self,
|
||||
images: ImageInput,
|
||||
segmentation_maps: Optional[ImageInput] = None,
|
||||
**kwargs: Unpack[DPTFastImageProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
r"""
|
||||
segmentation_maps (`ImageInput`, *optional*):
|
||||
The segmentation maps to preprocess.
|
||||
"""
|
||||
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys())
|
||||
# Set default kwargs from self. This ensures that if a kwarg is not provided
|
||||
# by the user, it gets its default value from the instance, or is set to None.
|
||||
for kwarg_name in self.valid_kwargs.__annotations__:
|
||||
kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
|
||||
|
||||
# Extract parameters that are only used for preparing the input images
|
||||
do_convert_rgb = kwargs.pop("do_convert_rgb")
|
||||
input_data_format = kwargs.pop("input_data_format")
|
||||
device = kwargs.pop("device")
|
||||
# Prepare input images
|
||||
images = self._prepare_input_images(
|
||||
images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
|
||||
)
|
||||
|
||||
# Prepare segmentation maps
|
||||
if segmentation_maps is not None:
|
||||
segmentation_maps = make_list_of_images(images=segmentation_maps, expected_ndims=2)
|
||||
|
||||
# Update kwargs that need further processing before being validated
|
||||
kwargs = self._further_process_kwargs(**kwargs)
|
||||
|
||||
# Validate kwargs
|
||||
self._validate_preprocess_kwargs(**kwargs)
|
||||
|
||||
# torch resize uses interpolation instead of resample
|
||||
resample = kwargs.pop("resample")
|
||||
kwargs["interpolation"] = (
|
||||
pil_torch_interpolation_mapping[resample] if isinstance(resample, (PILImageResampling, int)) else resample
|
||||
)
|
||||
|
||||
# Pop kwargs that are not needed in _preprocess
|
||||
kwargs.pop("default_to_square")
|
||||
kwargs.pop("data_format")
|
||||
|
||||
images = self._preprocess_images(
|
||||
images=images,
|
||||
**kwargs,
|
||||
)
|
||||
data = {"pixel_values": images}
|
||||
|
||||
if segmentation_maps is not None:
|
||||
segmentation_maps = self._preprocess_segmentation_maps(
|
||||
segmentation_maps=segmentation_maps,
|
||||
**kwargs,
|
||||
)
|
||||
data["labels"] = segmentation_maps
|
||||
|
||||
return BatchFeature(data=data)
|
||||
|
||||
def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple]] = None):
|
||||
"""
|
||||
Converts the output of [`DPTForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch.
|
||||
|
||||
Args:
|
||||
outputs ([`DPTForSemanticSegmentation`]):
|
||||
Raw outputs of the model.
|
||||
target_sizes (`list[Tuple]` of length `batch_size`, *optional*):
|
||||
List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
|
||||
predictions will not be resized.
|
||||
|
||||
Returns:
|
||||
semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic
|
||||
segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
|
||||
specified). Each entry of each `torch.Tensor` corresponds to a semantic class id.
|
||||
"""
|
||||
# TODO: add support for other frameworks
|
||||
logits = outputs.logits
|
||||
|
||||
# Resize logits and compute semantic segmentation maps
|
||||
if target_sizes is not None:
|
||||
if len(logits) != len(target_sizes):
|
||||
raise ValueError(
|
||||
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
|
||||
)
|
||||
|
||||
if is_torch_tensor(target_sizes):
|
||||
target_sizes = target_sizes.numpy()
|
||||
|
||||
semantic_segmentation = []
|
||||
|
||||
for idx in range(len(logits)):
|
||||
resized_logits = torch.nn.functional.interpolate(
|
||||
logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
|
||||
)
|
||||
semantic_map = resized_logits[0].argmax(dim=0)
|
||||
semantic_segmentation.append(semantic_map)
|
||||
else:
|
||||
semantic_segmentation = logits.argmax(dim=1)
|
||||
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
|
||||
|
||||
return semantic_segmentation
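A minimal usage sketch for the semantic-segmentation path above (not part of the diff; the checkpoint name and image URL are illustrative assumptions):

import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, DPTForSemanticSegmentation

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade", use_fast=True)  # assumed checkpoint
model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

with torch.no_grad():
    outputs = model(**processor(images=image, return_tensors="pt"))
# one (height, width) map of class ids per image, resized back to the input resolution
segmentation = processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]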
|
||||
|
||||
def resize(
|
||||
self,
|
||||
image: "torch.Tensor",
|
||||
size: SizeDict,
|
||||
interpolation: "F.InterpolationMode" = None,
|
||||
antialias: bool = True,
|
||||
ensure_multiple_of: Optional[int] = 1,
|
||||
keep_aspect_ratio: bool = False,
|
||||
) -> "torch.Tensor":
|
||||
"""
|
||||
Resize an image to `(size["height"], size["width"])`.
|
||||
|
||||
Args:
|
||||
image (`torch.Tensor`):
|
||||
Image to resize.
|
||||
size (`SizeDict`):
|
||||
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
|
||||
interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
|
||||
`InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
|
||||
antialias (`bool`, *optional*, defaults to `True`):
|
||||
Whether to use antialiasing when resizing the image.
|
||||
ensure_multiple_of (`int`, *optional*):
|
||||
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value.
|
||||
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, and `do_resize` is `True`, the image is resized to the largest possible size such that the aspect ratio is preserved.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: The resized image.
|
||||
"""
|
||||
if not size.height or not size.width:
|
||||
raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
|
||||
|
||||
output_size = get_resize_output_image_size(
|
||||
image,
|
||||
output_size=(size.height, size.width),
|
||||
keep_aspect_ratio=keep_aspect_ratio,
|
||||
multiple=ensure_multiple_of,
|
||||
)
|
||||
return super().resize(image, output_size, interpolation=interpolation, antialias=antialias)
|
||||
|
||||
def pad_image(
|
||||
self,
|
||||
image: "torch.Tensor",
|
||||
size_divisor: int = 1,
|
||||
) -> "torch.Tensor":
|
||||
r"""
|
||||
Center pad a batch of images to be a multiple of `size_divisor`.
|
||||
|
||||
Args:
|
||||
image (`torch.Tensor`):
|
||||
Image to pad. Can be a batch of images of dimensions (N, C, H, W) or a single image of dimensions (C, H, W).
|
||||
size_divisor (`int`):
|
||||
The width and height of the image will be padded to a multiple of this number.
|
||||
"""
|
||||
height, width = image.shape[-2:]
|
||||
|
||||
def _get_pad(size, size_divisor):
|
||||
new_size = math.ceil(size / size_divisor) * size_divisor
|
||||
pad_size = new_size - size
|
||||
pad_size_left = pad_size // 2
|
||||
pad_size_right = pad_size - pad_size_left
|
||||
return pad_size_left, pad_size_right
|
||||
|
||||
pad_top, pad_bottom = _get_pad(height, size_divisor)
|
||||
pad_left, pad_right = _get_pad(width, size_divisor)
|
||||
padding = (pad_left, pad_top, pad_right, pad_bottom)
|
||||
return F.pad(image, padding)
|
||||
|
||||
def post_process_depth_estimation(
|
||||
self,
|
||||
outputs: "DepthEstimatorOutput",
|
||||
target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None,
|
||||
) -> list[dict[str, TensorType]]:
|
||||
"""
|
||||
Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images.
|
||||
Only supports PyTorch.
|
||||
|
||||
Args:
|
||||
outputs ([`DepthEstimatorOutput`]):
|
||||
Raw outputs of the model.
|
||||
target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*):
|
||||
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
||||
(height, width) of each image in the batch. If left to None, predictions will not be resized.
|
||||
|
||||
Returns:
|
||||
`List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
|
||||
predictions.
|
||||
"""
|
||||
requires_backends(self, "torch")
|
||||
|
||||
predicted_depth = outputs.predicted_depth
|
||||
|
||||
if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)):
|
||||
raise ValueError(
|
||||
"Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
|
||||
)
|
||||
|
||||
results = []
|
||||
target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes
|
||||
for depth, target_size in zip(predicted_depth, target_sizes):
|
||||
if target_size is not None:
|
||||
depth = torch.nn.functional.interpolate(
|
||||
depth.unsqueeze(0).unsqueeze(1), size=target_size, mode="bicubic", align_corners=False
|
||||
).squeeze()
|
||||
|
||||
results.append({"predicted_depth": depth})
|
||||
|
||||
return results
|
||||
|
||||
|
||||
__all__ = ["DPTImageProcessorFast"]
|
313
src/transformers/models/dpt/modular_dpt.py
Normal file
@ -0,0 +1,313 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from collections.abc import Iterable
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
from transformers.image_processing_base import BatchFeature
|
||||
from transformers.image_transforms import group_images_by_shape, reorder_images
|
||||
from transformers.models.beit.image_processing_beit_fast import BeitImageProcessorFast
|
||||
|
||||
from ...image_processing_utils_fast import (
|
||||
DefaultFastImageProcessorKwargs,
|
||||
)
|
||||
from ...image_utils import (
|
||||
IMAGENET_STANDARD_MEAN,
|
||||
IMAGENET_STANDARD_STD,
|
||||
PILImageResampling,
|
||||
SizeDict,
|
||||
)
|
||||
from ...utils import (
|
||||
TensorType,
|
||||
auto_docstring,
|
||||
is_torch_available,
|
||||
is_torchvision_available,
|
||||
is_torchvision_v2_available,
|
||||
requires_backends,
|
||||
)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ...modeling_outputs import DepthEstimatorOutput
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_torchvision_v2_available():
|
||||
from torchvision.transforms.v2 import functional as F
|
||||
elif is_torchvision_available():
|
||||
from torchvision.transforms import functional as F
|
||||
|
||||
|
||||
def get_resize_output_image_size(
|
||||
input_image: "torch.Tensor",
|
||||
output_size: Union[int, Iterable[int]],
|
||||
keep_aspect_ratio: bool,
|
||||
multiple: int,
|
||||
) -> SizeDict:
|
||||
def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
|
||||
x = round(val / multiple) * multiple
|
||||
|
||||
if max_val is not None and x > max_val:
|
||||
x = math.floor(val / multiple) * multiple
|
||||
|
||||
if x < min_val:
|
||||
x = math.ceil(val / multiple) * multiple
|
||||
|
||||
return x
|
||||
|
||||
input_height, input_width = input_image.shape[-2:]
|
||||
output_height, output_width = output_size
|
||||
|
||||
# determine new height and width
|
||||
scale_height = output_height / input_height
|
||||
scale_width = output_width / input_width
|
||||
|
||||
if keep_aspect_ratio:
|
||||
# scale as little as possible
|
||||
if abs(1 - scale_width) < abs(1 - scale_height):
|
||||
# fit width
|
||||
scale_height = scale_width
|
||||
else:
|
||||
# fit height
|
||||
scale_width = scale_height
|
||||
|
||||
new_height = constrain_to_multiple_of(scale_height * input_height, multiple=multiple)
|
||||
new_width = constrain_to_multiple_of(scale_width * input_width, multiple=multiple)
|
||||
|
||||
return SizeDict(height=new_height, width=new_width)
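A small worked example of the helper above, with illustrative sizes: for keep_aspect_ratio=True the scale closer to 1 is applied to both sides ("scale as little as possible") before snapping each side to the requested multiple.

import math

def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
    x = round(val / multiple) * multiple
    if max_val is not None and x > max_val:
        x = math.floor(val / multiple) * multiple
    if x < min_val:
        x = math.ceil(val / multiple) * multiple
    return x

# 480x640 input resized towards {"height": 384, "width": 384}, keep_aspect_ratio=True, multiple=32
scale_height, scale_width = 384 / 480, 384 / 640  # 0.8 and 0.6
scale = scale_width if abs(1 - scale_width) < abs(1 - scale_height) else scale_height  # -> 0.8
print(constrain_to_multiple_of(scale * 480, 32), constrain_to_multiple_of(scale * 640, 32))  # 384 512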
|
||||
|
||||
|
||||
class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
|
||||
"""
|
||||
ensure_multiple_of (`int`, *optional*, defaults to 1):
|
||||
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
|
||||
by `ensure_multiple_of` in `preprocess`.
|
||||
do_pad (`bool`, *optional*, defaults to `False`):
|
||||
Whether to apply center padding. This was introduced in the DINOv2 paper, which uses the model in
|
||||
combination with DPT.
|
||||
size_divisor (`int`, *optional*):
|
||||
If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
|
||||
DINOv2 paper, which uses the model in combination with DPT.
|
||||
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
|
||||
be overridden by `keep_aspect_ratio` in `preprocess`.
|
||||
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
||||
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
||||
is used for background, and background itself is not included in all classes of a dataset (e.g.
|
||||
ADE20k). The background label will be replaced by 255.
|
||||
"""
|
||||
|
||||
ensure_multiple_of: Optional[int]
|
||||
size_divisor: Optional[int]
|
||||
do_pad: Optional[bool]
|
||||
keep_aspect_ratio: Optional[bool]
|
||||
do_reduce_labels: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class DPTImageProcessorFast(BeitImageProcessorFast):
|
||||
resample = PILImageResampling.BICUBIC
|
||||
image_mean = IMAGENET_STANDARD_MEAN
|
||||
image_std = IMAGENET_STANDARD_STD
|
||||
size = {"height": 384, "width": 384}
|
||||
do_resize = True
|
||||
do_rescale = True
|
||||
do_normalize = True
|
||||
do_pad = False
|
||||
rescale_factor = 1 / 255
|
||||
ensure_multiple_of = 1
|
||||
keep_aspect_ratio = False
|
||||
do_reduce_labels = False
|
||||
crop_size = None
|
||||
do_center_crop = None
|
||||
|
||||
valid_kwargs = DPTFastImageProcessorKwargs
|
||||
|
||||
def from_dict():
|
||||
raise NotImplementedError("No need to override this method")
|
||||
|
||||
def resize(
|
||||
self,
|
||||
image: "torch.Tensor",
|
||||
size: SizeDict,
|
||||
interpolation: "F.InterpolationMode" = None,
|
||||
antialias: bool = True,
|
||||
ensure_multiple_of: Optional[int] = 1,
|
||||
keep_aspect_ratio: bool = False,
|
||||
) -> "torch.Tensor":
|
||||
"""
|
||||
Resize an image to `(size["height"], size["width"])`.
|
||||
|
||||
Args:
|
||||
image (`torch.Tensor`):
|
||||
Image to resize.
|
||||
size (`SizeDict`):
|
||||
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
|
||||
interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
|
||||
`InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
|
||||
antialias (`bool`, *optional*, defaults to `True`):
|
||||
Whether to use antialiasing when resizing the image.
|
||||
ensure_multiple_of (`int`, *optional*):
|
||||
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value.
|
||||
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, and `do_resize` is `True`, the image is resized to the largest possible size such that the aspect ratio is preserved.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: The resized image.
|
||||
"""
|
||||
if not size.height or not size.width:
|
||||
raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
|
||||
|
||||
output_size = get_resize_output_image_size(
|
||||
image,
|
||||
output_size=(size.height, size.width),
|
||||
keep_aspect_ratio=keep_aspect_ratio,
|
||||
multiple=ensure_multiple_of,
|
||||
)
|
||||
return BeitImageProcessorFast().resize(image, output_size, interpolation=interpolation, antialias=antialias)
|
||||
|
||||
def pad_image(
|
||||
self,
|
||||
image: "torch.Tensor",
|
||||
size_divisor: int = 1,
|
||||
) -> "torch.Tensor":
|
||||
r"""
|
||||
Center pad a batch of images to be a multiple of `size_divisor`.
|
||||
|
||||
Args:
|
||||
image (`torch.Tensor`):
|
||||
Image to pad. Can be a batch of images of dimensions (N, C, H, W) or a single image of dimensions (C, H, W).
|
||||
size_divisor (`int`):
|
||||
The width and height of the image will be padded to a multiple of this number.
|
||||
"""
|
||||
height, width = image.shape[-2:]
|
||||
|
||||
def _get_pad(size, size_divisor):
|
||||
new_size = math.ceil(size / size_divisor) * size_divisor
|
||||
pad_size = new_size - size
|
||||
pad_size_left = pad_size // 2
|
||||
pad_size_right = pad_size - pad_size_left
|
||||
return pad_size_left, pad_size_right
|
||||
|
||||
pad_top, pad_bottom = _get_pad(height, size_divisor)
|
||||
pad_left, pad_right = _get_pad(width, size_divisor)
|
||||
padding = (pad_left, pad_top, pad_right, pad_bottom)
|
||||
return F.pad(image, padding)
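A quick check of the centred-padding arithmetic above, with arbitrary illustrative sizes (the odd pixel goes to the right/bottom):

import math

def _get_pad(size, size_divisor):
    new_size = math.ceil(size / size_divisor) * size_divisor
    pad = new_size - size
    return pad // 2, pad - pad // 2

print(_get_pad(518, 14))  # (0, 0) -> already a multiple of 14
print(_get_pad(384, 14))  # (4, 4) -> padded up to 392
print(_get_pad(385, 14))  # (3, 4) -> odd deficit, extra pixel on the right/bottom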
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
images: list["torch.Tensor"],
|
||||
do_reduce_labels: bool,
|
||||
do_resize: bool,
|
||||
size: SizeDict,
|
||||
interpolation: Optional["F.InterpolationMode"],
|
||||
do_center_crop: bool,
|
||||
crop_size: SizeDict,
|
||||
do_rescale: bool,
|
||||
rescale_factor: float,
|
||||
do_normalize: bool,
|
||||
image_mean: Optional[Union[float, list[float]]],
|
||||
image_std: Optional[Union[float, list[float]]],
|
||||
return_tensors: Optional[Union[str, TensorType]],
|
||||
keep_aspect_ratio: bool,
|
||||
ensure_multiple_of: Optional[int],
|
||||
do_pad: bool,
|
||||
size_divisor: Optional[int],
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
if do_reduce_labels:
|
||||
images = self.reduce_label(images)
|
||||
|
||||
# Group images by size for batched resizing
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images)
|
||||
resized_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_resize:
|
||||
stacked_images = self.resize(
|
||||
image=stacked_images,
|
||||
size=size,
|
||||
interpolation=interpolation,
|
||||
ensure_multiple_of=ensure_multiple_of,
|
||||
keep_aspect_ratio=keep_aspect_ratio,
|
||||
)
|
||||
resized_images_grouped[shape] = stacked_images
|
||||
resized_images = reorder_images(resized_images_grouped, grouped_images_index)
|
||||
|
||||
# Group images by size for further processing
|
||||
# Needed in case do_resize is False, or resize returns images with different sizes
|
||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images)
|
||||
processed_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_center_crop:
|
||||
stacked_images = self.center_crop(stacked_images, crop_size)
|
||||
if do_pad:
|
||||
stacked_images = self.pad_image(stacked_images, size_divisor)
|
||||
# Fused rescale and normalize
|
||||
stacked_images = self.rescale_and_normalize(
|
||||
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
||||
)
|
||||
processed_images_grouped[shape] = stacked_images
|
||||
|
||||
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
|
||||
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
|
||||
return processed_images
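The batching pattern in _preprocess (stack same-shaped images, transform each stack once, then restore the original order) can be sketched in a few lines; this is a simplified stand-in under that assumption, not the group_images_by_shape/reorder_images implementation:

from collections import defaultdict

import torch

def process_by_shape(images: list[torch.Tensor], fn) -> list[torch.Tensor]:
    groups, index = defaultdict(list), []
    for image in images:
        groups[image.shape].append(image)
        index.append((image.shape, len(groups[image.shape]) - 1))
    # transform each same-shape stack in one call, then restore the original order
    processed = {shape: fn(torch.stack(batch)) for shape, batch in groups.items()}
    return [processed[shape][i] for shape, i in index]

# e.g. apply a fused rescale-and-normalize to each same-shape stack:
# images = process_by_shape(images, lambda batch: (batch / 255.0 - 0.5) / 0.5)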
|
||||
|
||||
def post_process_depth_estimation(
|
||||
self,
|
||||
outputs: "DepthEstimatorOutput",
|
||||
target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None,
|
||||
) -> list[dict[str, TensorType]]:
|
||||
"""
|
||||
Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images.
|
||||
Only supports PyTorch.
|
||||
|
||||
Args:
|
||||
outputs ([`DepthEstimatorOutput`]):
|
||||
Raw outputs of the model.
|
||||
target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*):
|
||||
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
|
||||
(height, width) of each image in the batch. If left to None, predictions will not be resized.
|
||||
|
||||
Returns:
|
||||
`List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
|
||||
predictions.
|
||||
"""
|
||||
requires_backends(self, "torch")
|
||||
|
||||
predicted_depth = outputs.predicted_depth
|
||||
|
||||
if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)):
|
||||
raise ValueError(
|
||||
"Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
|
||||
)
|
||||
|
||||
results = []
|
||||
target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes
|
||||
for depth, target_size in zip(predicted_depth, target_sizes):
|
||||
if target_size is not None:
|
||||
depth = torch.nn.functional.interpolate(
|
||||
depth.unsqueeze(0).unsqueeze(1), size=target_size, mode="bicubic", align_corners=False
|
||||
).squeeze()
|
||||
|
||||
results.append({"predicted_depth": depth})
|
||||
|
||||
return results
|
||||
|
||||
|
||||
__all__ = ["DPTImageProcessorFast"]
|
@ -459,9 +459,14 @@ class EsmFlashAttention2(EsmSelfAttention):
|
||||
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
|
||||
# in fp32.
|
||||
input_dtype = query_layer.dtype
|
||||
device_type = query_layer.device.type if query_layer.device.type != "mps" else "cpu"
|
||||
if input_dtype == torch.float32:
|
||||
if torch.is_autocast_enabled():
|
||||
target_dtype = torch.get_autocast_gpu_dtype()
|
||||
target_dtype = (
|
||||
torch.get_autocast_dtype(device_type)
|
||||
if hasattr(torch, "get_autocast_dtype")
|
||||
else torch.get_autocast_gpu_dtype()
|
||||
)
|
||||
# Handle the case where the model is quantized
|
||||
elif hasattr(self.config, "_pre_quantization_dtype"):
|
||||
target_dtype = self.config._pre_quantization_dtype
|
||||
|
@ -133,9 +133,14 @@ class EsmForProteinFoldingOutput(ModelOutput):
|
||||
max_predicted_aligned_error: Optional[torch.FloatTensor] = None
|
||||
|
||||
|
||||
def is_fp16_enabled():
|
||||
def is_fp16_enabled(device_type):
|
||||
# Autocast world
|
||||
fp16_enabled = torch.get_autocast_gpu_dtype() == torch.float16
|
||||
autocast_dtype = (
|
||||
torch.get_autocast_dtype(device_type)
|
||||
if hasattr(torch, "get_autocast_dtype")
|
||||
else torch.get_autocast_gpu_dtype()
|
||||
)
|
||||
fp16_enabled = autocast_dtype == torch.float16
|
||||
fp16_enabled = fp16_enabled and torch.is_autocast_enabled()
|
||||
|
||||
return fp16_enabled
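The same device-aware dtype lookup recurs in the FlashAttention hunks throughout this diff; a standalone sketch of the fallback, assuming only public torch APIs:

import torch

def autocast_dtype(device_type: str) -> torch.dtype:
    # torch.get_autocast_dtype(device_type) is the device-agnostic API on newer PyTorch;
    # older releases only expose the CUDA-specific torch.get_autocast_gpu_dtype()
    if hasattr(torch, "get_autocast_dtype"):
        return torch.get_autocast_dtype(device_type)
    return torch.get_autocast_gpu_dtype()

# e.g. inside an autocast region this reports the dtype matmuls will run in:
# with torch.autocast("cuda", dtype=torch.bfloat16):
#     assert autocast_dtype("cuda") == torch.bfloat16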
|
||||
@ -885,8 +890,9 @@ class EsmFoldTriangleMultiplicativeUpdate(nn.Module):
|
||||
b = b * self.sigmoid(self.linear_b_g(z))
|
||||
b = b * self.linear_b_p(z)
|
||||
|
||||
if is_fp16_enabled():
|
||||
with torch.cuda.amp.autocast(enabled=False):
|
||||
device_type = a.device.type if a.device.type != "mps" else "cpu"
|
||||
if is_fp16_enabled(device_type):
|
||||
with torch.autocast(device_type=device_type, enabled=False):
|
||||
x = self._combine_projections(a.float(), b.float())
|
||||
else:
|
||||
x = self._combine_projections(a, b)
|
||||
@ -1414,7 +1420,7 @@ class EsmFoldInvariantPointAttention(nn.Module):
|
||||
|
||||
self.linear_b = EsmFoldLinear(c_z, config.num_heads_ipa)
|
||||
|
||||
self.head_weights = nn.Parameter(torch.zeros((config.num_heads_ipa)))
|
||||
self.head_weights = nn.Parameter(torch.zeros(config.num_heads_ipa))
|
||||
|
||||
concat_out_dim = config.num_heads_ipa * (c_z + config.ipa_dim + config.num_v_points * 4)
|
||||
self.linear_out = EsmFoldLinear(concat_out_dim, c_s, init="final")
|
||||
@ -1499,8 +1505,9 @@ class EsmFoldInvariantPointAttention(nn.Module):
|
||||
z[0] = z[0].cpu()
|
||||
|
||||
# [*, H, N_res, N_res]
|
||||
if is_fp16_enabled():
|
||||
with torch.cuda.amp.autocast(enabled=False):
|
||||
device_type = q.device.type if q.device.type != "mps" else "cpu"
|
||||
if is_fp16_enabled(device_type):
|
||||
with torch.autocast(device_type=device_type, enabled=False):
|
||||
a = torch.matmul(
|
||||
permute_final_dims(q.float(), (1, 0, 2)), # [*, H, N_res, C_hidden]
|
||||
permute_final_dims(k.float(), (1, 2, 0)), # [*, H, C_hidden, N_res]
|
||||
|
@ -398,7 +398,7 @@ def map_structure_with_atom_order(in_list: list, first_call: bool = True) -> lis
|
||||
return in_list
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=None)
|
||||
@functools.cache
|
||||
def load_stereo_chemical_props() -> tuple[
|
||||
Mapping[str, list[Bond]],
|
||||
Mapping[str, list[Bond]],
|
||||
|
@ -16,7 +16,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Sequence
|
||||
from functools import lru_cache
|
||||
from functools import cache
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import numpy as np
|
||||
@ -75,7 +75,7 @@ def rot_vec_mul(r: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
@cache
|
||||
def identity_rot_mats(
|
||||
batch_dims: tuple[int, ...],
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
@ -90,7 +90,7 @@ def identity_rot_mats(
|
||||
return rots
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
@cache
|
||||
def identity_trans(
|
||||
batch_dims: tuple[int, ...],
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
@ -101,7 +101,7 @@ def identity_trans(
|
||||
return trans
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
@cache
|
||||
def identity_quats(
|
||||
batch_dims: tuple[int, ...],
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
@ -220,7 +220,7 @@ _CACHED_QUATS: dict[str, np.ndarray] = {
|
||||
}
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
@cache
|
||||
def _get_quat(quat_key: str, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
|
||||
return torch.tensor(_CACHED_QUATS[quat_key], dtype=dtype, device=device)
|
||||
|
||||
@ -1070,7 +1070,7 @@ class Rigid:
|
||||
e0 = [c / denom for c in e0]
|
||||
dot = sum((c1 * c2 for c1, c2 in zip(e0, e1)))
|
||||
e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)]
|
||||
denom = torch.sqrt(sum((c * c for c in e1)) + eps * torch.ones_like(e1[0]))
|
||||
denom = torch.sqrt(sum(c * c for c in e1) + eps * torch.ones_like(e1[0]))
|
||||
e1 = [c / denom for c in e1]
|
||||
e2 = [
|
||||
e0[1] * e1[2] - e0[2] * e1[1],
|
||||
|
@ -488,9 +488,14 @@ class FalconFlashAttention2(FalconAttention):
|
||||
# therefore the input hidden states gets silently casted in float32. Hence, we need
|
||||
# cast them back in float16 just to be sure everything works as expected.
|
||||
input_dtype = query_layer.dtype
|
||||
device_type = query_layer.device.type if query_layer.device.type != "mps" else "cpu"
|
||||
if input_dtype == torch.float32:
|
||||
if torch.is_autocast_enabled():
|
||||
target_dtype = torch.get_autocast_gpu_dtype()
|
||||
target_dtype = (
|
||||
torch.get_autocast_dtype(device_type)
|
||||
if hasattr(torch, "get_autocast_dtype")
|
||||
else torch.get_autocast_gpu_dtype()
|
||||
)
|
||||
# Handle the case where the model is quantized
|
||||
elif hasattr(self.config, "_pre_quantization_dtype"):
|
||||
target_dtype = self.config._pre_quantization_dtype
|
||||
|
@ -949,7 +949,7 @@ class FalconH1Mixer(nn.Module):
|
||||
|
||||
# 2. Compute the state for each intra-chunk
|
||||
# (right term of low-rank factorization of off-diagonal blocks; B terms)
|
||||
decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum))
|
||||
decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
|
||||
B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None]
|
||||
states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2)
|
||||
|
||||
|
@ -748,7 +748,7 @@ class FalconH1Mixer(nn.Module):
|
||||
|
||||
# 2. Compute the state for each intra-chunk
|
||||
# (right term of low-rank factorization of off-diagonal blocks; B terms)
|
||||
decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum))
|
||||
decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
|
||||
B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None]
|
||||
states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2)
|
||||
|
||||
|
@ -318,7 +318,7 @@ class FlavaImageProcessor(BaseImageProcessor):
|
||||
image_processor_dict["codebook_crop_size"] = kwargs.pop("codebook_crop_size")
|
||||
return super().from_dict(image_processor_dict, **kwargs)
|
||||
|
||||
@lru_cache()
|
||||
@lru_cache
|
||||
def masking_generator(
|
||||
self,
|
||||
input_size_patches,
|
||||
|
@ -273,7 +273,7 @@ class FlavaImageProcessorFast(BaseImageProcessorFast):
|
||||
image_processor_dict["codebook_crop_size"] = kwargs.pop("codebook_crop_size")
|
||||
return super().from_dict(image_processor_dict, **kwargs)
|
||||
|
||||
@lru_cache()
|
||||
@lru_cache
|
||||
def masking_generator(
|
||||
self,
|
||||
input_size_patches,
|
||||
|
@ -1446,7 +1446,7 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
|
||||
param.requires_grad = False
|
||||
|
||||
def get_codebook_indices(self, pixel_values: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
f"""
|
||||
Args:
|
||||
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
|
||||
@ -1458,8 +1458,8 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
|
||||
>>> import requests
|
||||
>>> from transformers import AutoImageProcessor, FlavaImageCodebook
|
||||
|
||||
>>> model = FlavaImageCodebook.from_pretrained("{0}")
|
||||
>>> image_processor = AutoImageProcessor.from_pretrained("{0}")
|
||||
>>> model = FlavaImageCodebook.from_pretrained("{_CHECKPOINT_FOR_CODEBOOK_DOC}")
|
||||
>>> image_processor = AutoImageProcessor.from_pretrained("{_CHECKPOINT_FOR_CODEBOOK_DOC}")
|
||||
|
||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
@ -1469,7 +1469,7 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
|
||||
|
||||
>>> outputs = model.get_codebook_indices(**inputs)
|
||||
```
|
||||
""".format(_CHECKPOINT_FOR_CODEBOOK_DOC)
|
||||
"""
|
||||
z_logits = self.blocks(pixel_values)
|
||||
return torch.argmax(z_logits, axis=1)
|
||||
|
||||
@ -1478,7 +1478,7 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
|
||||
return nn.Softmax(dim=1)(z_logits)
|
||||
|
||||
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
|
||||
"""
|
||||
f"""
|
||||
Args:
|
||||
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
|
||||
@ -1491,8 +1491,8 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
|
||||
>>> import requests
|
||||
>>> from transformers import AutoImageProcessor, FlavaImageCodebook
|
||||
|
||||
>>> model = FlavaImageCodebook.from_pretrained("{0}")
|
||||
>>> image_processor = AutoImageProcessor.from_pretrained("{0}")
|
||||
>>> model = FlavaImageCodebook.from_pretrained("{_CHECKPOINT_FOR_CODEBOOK_DOC}")
|
||||
>>> image_processor = AutoImageProcessor.from_pretrained("{_CHECKPOINT_FOR_CODEBOOK_DOC}")
|
||||
|
||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
@ -1504,7 +1504,7 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
|
||||
>>> print(outputs.shape)
|
||||
(1, 196)
|
||||
```
|
||||
""".format(_CHECKPOINT_FOR_CODEBOOK_DOC)
|
||||
"""
|
||||
if len(pixel_values.shape) != 4:
|
||||
raise ValueError(f"input shape {pixel_values.shape} is not 4d")
|
||||
if pixel_values.shape[1] != self.input_channels:
|
||||
|
@ -177,7 +177,7 @@ class FNetTokenizer(PreTrainedTokenizer):
|
||||
pieces = self.sp_model.encode(text, out_type=str)
|
||||
new_pieces = []
|
||||
for piece in pieces:
|
||||
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
|
||||
if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
|
||||
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
|
||||
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
|
||||
if len(cur_pieces[0]) == 1:
|
||||
|
@ -293,7 +293,7 @@ class FocalNetDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
class FocalNetModulation(nn.Module):
|
||||
@ -431,8 +431,8 @@ class FocalNetLayer(nn.Module):
|
||||
self.gamma_1 = 1.0
|
||||
self.gamma_2 = 1.0
|
||||
if config.use_layerscale:
|
||||
self.gamma_1 = nn.Parameter(config.layerscale_value * torch.ones((dim)), requires_grad=True)
|
||||
self.gamma_2 = nn.Parameter(config.layerscale_value * torch.ones((dim)), requires_grad=True)
|
||||
self.gamma_1 = nn.Parameter(config.layerscale_value * torch.ones(dim), requires_grad=True)
|
||||
self.gamma_2 = nn.Parameter(config.layerscale_value * torch.ones(dim), requires_grad=True)
|
||||
|
||||
def forward(self, hidden_state, input_dimensions):
|
||||
height, width = input_dimensions
|
||||
|
@ -65,7 +65,7 @@ class GLPNDropPath(nn.Module):
|
||||
return drop_path(hidden_states, self.drop_prob, self.training)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return "p={}".format(self.drop_prob)
|
||||
return f"p={self.drop_prob}"
|
||||
|
||||
|
||||
# Copied from transformers.models.segformer.modeling_segformer.SegformerOverlapPatchEmbeddings
|
||||
|
@ -229,7 +229,7 @@ class GPT2Attention(nn.Module):
|
||||
scale_factor /= float(self.layer_idx + 1)
|
||||
|
||||
# Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
|
||||
with torch.amp.autocast(query.device.type, enabled=False):
|
||||
with torch.autocast(query.device.type, enabled=False):
|
||||
q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
|
||||
attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
|
||||
attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
|
||||
|
@ -33,7 +33,7 @@ VOCAB_FILES_NAMES = {
|
||||
}
|
||||
|
||||
|
||||
@lru_cache()
|
||||
@lru_cache
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
|
||||
|
@ -343,9 +343,14 @@ class GPTBigCodeFlashAttention2(GPTBigCodeAttention):
|
||||
# therefore the input hidden states gets silently casted in float32. Hence, we need
|
||||
# cast them back in float16 just to be sure everything works as expected.
|
||||
input_dtype = query.dtype
|
||||
device_type = query.device.type if query.device.type != "mps" else "cpu"
|
||||
if input_dtype == torch.float32:
|
||||
if torch.is_autocast_enabled():
|
||||
target_dtype = torch.get_autocast_gpu_dtype()
|
||||
target_dtype = (
|
||||
torch.get_autocast_dtype(device_type)
|
||||
if hasattr(torch, "get_autocast_dtype")
|
||||
else torch.get_autocast_gpu_dtype()
|
||||
)
|
||||
# Handle the case where the model is quantized
|
||||
elif hasattr(self.config, "_pre_quantization_dtype"):
|
||||
target_dtype = self.config._pre_quantization_dtype
|
||||
|
@ -323,9 +323,14 @@ class GPTNeoFlashAttention2(GPTNeoSelfAttention):
|
||||
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
|
||||
# in fp32. (LlamaRMSNorm handles it correctly)
|
||||
|
||||
device_type = query.device.type if query.device.type != "mps" else "cpu"
|
||||
if query.dtype == torch.float32:
|
||||
if torch.is_autocast_enabled():
|
||||
target_dtype = torch.get_autocast_gpu_dtype()
|
||||
target_dtype = (
|
||||
torch.get_autocast_dtype(device_type)
|
||||
if hasattr(torch, "get_autocast_dtype")
|
||||
else torch.get_autocast_gpu_dtype()
|
||||
)
|
||||
# Handle the case where the model is quantized
|
||||
elif hasattr(self.config, "_pre_quantization_dtype"):
|
||||
target_dtype = self.config._pre_quantization_dtype
|
||||
|
@ -355,9 +355,14 @@ class GPTJFlashAttention2(GPTJAttention):
|
||||
# in fp32. (LlamaRMSNorm handles it correctly)
|
||||
|
||||
input_dtype = query.dtype
|
||||
device_type = query.device.type if query.device.type != "mps" else "cpu"
|
||||
if input_dtype == torch.float32:
|
||||
if torch.is_autocast_enabled():
|
||||
target_dtype = torch.get_autocast_gpu_dtype()
|
||||
target_dtype = (
|
||||
torch.get_autocast_dtype(device_type)
|
||||
if hasattr(torch, "get_autocast_dtype")
|
||||
else torch.get_autocast_gpu_dtype()
|
||||
)
|
||||
# Handle the case where the model is quantized
|
||||
elif hasattr(self.config, "_pre_quantization_dtype"):
|
||||
target_dtype = self.config._pre_quantization_dtype
|
||||
|
@ -324,7 +324,7 @@ class GraniteMoeMoE(nn.Module):
|
||||
"""
|
||||
|
||||
def __init__(self, config: GraniteMoeConfig):
|
||||
super(GraniteMoeMoE, self).__init__()
|
||||
super().__init__()
|
||||
|
||||
self.input_size = config.hidden_size
|
||||
self.hidden_size = config.intermediate_size
|
||||
|
Some files were not shown because too many files have changed in this diff.