Mirror of https://github.com/huggingface/transformers.git (synced 2025-11-14 06:14:50 +08:00)

Compare commits: `model-list` ... `cleanup_tf` (33 commits)

Commits: d3a87f9de3, 80134e6e66, ce40ca0d4c, 6408d3b01a, f40ef03214, 5150dac727, 27c3807991, ffb35fe142, 1fd63dd532, 240d19f4a3, ba938fa590, 6744ebe745, 1709ed96e4, fd36275be2, 922e85487b, f9e668abf3, 7951105d69, 58a3f8caac, fcea1e1fe0, 563f2ffb21, 6f479d5d75, d012f34e0d, e76364d5c1, 2b8068c306, 33c60a5254, fa22b56903, f30c22500b, 496c283615, df45a92cea, 3ff0e69f84, 31839d741a, 2072f3059e, 3760afb21c

.github/workflows/benchmark.yml (vendored, 1 change)

@@ -40,7 +40,6 @@ jobs:
      run: python3 -m pip install -r benchmark_v2/requirements.txt kernels

    - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
      working-directory: /transformers
      run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]"

    - name: Run benchmark

@@ -117,8 +117,6 @@ def flush_memory():
    # Clear CUDA cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
    gc.collect()

@@ -1008,6 +1008,8 @@
        title: AltCLIP
      - local: model_doc/aria
        title: Aria
      - local: model_doc/audioflamingo3
        title: AudioFlamingo3
      - local: model_doc/aya_vision
        title: AyaVision
      - local: model_doc/blip

docs/source/en/model_doc/audioflamingo3.md (new file, 402 lines)

@@ -0,0 +1,402 @@

<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

*This model was released on 2025-07-10 and added to Hugging Face Transformers on 2025-11-11.*

# Audio Flamingo 3

<div class="flex flex-wrap space-x-1">
    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
    <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
    <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>

## Overview

Audio Flamingo 3 (AF3) is a fully open large audio–language model designed for robust understanding and reasoning over speech, environmental sounds, and music. AF3 pairs a Whisper-style audio encoder with a causal language model and performs replace-in-place audio–text fusion: the processor aligns post-pool audio frames to a dedicated placeholder token, and the model replaces those token slots with projected audio embeddings during the forward pass.

The model checkpoint is available at [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf).

Highlights:

- Unified audio encoder across speech, sound, and music.
- **Long-audio support via windowing and post-pool alignment (up to 10 minutes).** The model processes audio in 30-second windows with a hard limit of 20 windows (10 minutes total); audio longer than 10 minutes is truncated.
- Deterministic fusion that preserves sequence length by replacing audio placeholder tokens with audio embeddings.

This model was contributed by [Lasha Koroshinadze](https://huggingface.co/lashahub) and [Eric Bezzam](https://huggingface.co/bezzam).

### Paper

[Audio Flamingo 3](https://huggingface.co/papers/2507.08128): Advancing Audio Intelligence with Fully Open Large Audio Language Models
A. Goel, S. Ghosh, J. Kim, S. Kumar, Z. Kong, S. Lee, C.-H. H. Yang, R. Duraiswami, D. Manocha, R. Valle, B. Catanzaro
NVIDIA and University of Maryland
Project: https://research.nvidia.com/labs/adlr/AF3/

## Usage

### Audio Instruct Mode

The model supports audio-text instructions, including multi-turn interactions, and all of them can be processed in batches.

➡️ audio + text instruction

```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Transcribe the input speech."},
            {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
        ],
    }
]

inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=500)

decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs)
```

➡️ multi-turn:

```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Instruction: How does the tone of female speech change throughout the audio? Choose the correct option among the options below: (A) Sad to happy (B) Happy to sad (C) Neutral to happy (D) Happy to neutral.",
            },
            {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/000000786159.31.wav"},
        ],
    },
    {
        "role": "assistant",
        "content": [{"type": "text", "text": "(A) Sad to happy"}],
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Why do you think so?"},
        ],
    },
]

inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=500)

decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs)
```

➡️ text only:

```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is the capital of France?"},
        ],
    }
]

inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=500)

decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs)
```

➡️ audio only:

```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
        ],
    }
]

inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=500)

decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs)
```

➡️ batched inference!

```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

conversations = [
    [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Transcribe the input speech."},
                {
                    "type": "audio",
                    "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
                },
            ],
        }
    ],
    [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
                },
                {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
            ],
        }
    ],
]

inputs = processor.apply_chat_template(
    conversations,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=500)

decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs)
```

➡️ Training:

```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
model.train()

conversation = [
    [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Transcribe the input speech."},
                {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "The transcription of the audio is 'summer follows spring the days grow longer and the nights are warm'."}],
        }
    ],
    [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
                },
                {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "The transcription of the audio is 'some transcription of the audio'."}],
        }
    ]
]

inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    output_labels=True,
).to(model.device)

loss = model(**inputs).loss
loss.backward()
```

➡️ transcription shortcut

```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

inputs = processor.apply_transcription_request(audio="https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True, strip_prefix=True)

print(decoded_outputs)
```

The model is trained to emit transcriptions prefixed with assistant framing such as `The spoken content of the audio is "<text>".`. Use `strip_prefix=True` (as shown above) to remove the fixed assistant sentence and surrounding quotes so that only the transcription remains.

## How the model works

### Architecture

* **AudioFlamingo3Encoder**
  Whisper-style feature extractor + encoder → average-pool over time (stride 2) → LayerNorm.
  Produces per-frame hidden states at the post-pool rate.

* **AudioFlamingo3MultiModalProjector**
  A small MLP that maps encoder features to the language model's hidden size.

* **AudioFlamingo3ForConditionalGeneration**
  A causal language model that accepts text embeddings where each audio placeholder token slot is replaced, in place, by an audio frame embedding (see the sketch below). No sequence-length change is introduced by fusion.
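
The replace-in-place step can be pictured with a short illustrative sketch. This is not the actual implementation; the function name, tensor shapes, and the `audio_token_id` lookup are assumptions for illustration only:

```python
import torch

def fuse_audio_into_text(inputs_embeds, input_ids, audio_embeds, audio_token_id):
    # inputs_embeds: (batch, seq_len, hidden) text embeddings from the language model
    # audio_embeds:  (total_post_pool_frames, hidden) projected encoder outputs
    mask = input_ids == audio_token_id          # placeholder positions prepared by the processor
    fused = inputs_embeds.clone()
    fused[mask] = audio_embeds.to(fused.dtype)  # one projected frame per placeholder slot
    return fused                                # sequence length is unchanged
```
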
### Processor-level alignment

1. Each raw waveform is split into fixed-length windows based on the feature extractor's `chunk_length` (seconds) and `sampling_rate` (Hz).
2. For each window, the processor computes the number of post-pool frames `post_pool_len` that the encoder will output (matching the conv/pool schedule).
3. The processor expands the audio placeholder token by the total number of post-pool frames across all windows.
4. The model later replaces those token positions with the corresponding projected audio embeddings.

## Usage patterns

### Transcription shortcut

For automatic speech recognition you can skip writing the default instruction each time and call
[`~transformers.AudioFlamingo3Processor.apply_transcription_request`]:

```python
inputs = processor.apply_transcription_request(audio=audio_array)
```

Pass `prompt="Transcribe the input speech."` (or a list of prompts for batch audio) to customize the instruction while
keeping the audio placeholder handling.

`audio` accepts in-memory arrays, local file paths, or URLs. Any processor kwargs (`text_kwargs`, `audio_kwargs`, etc.)
are forwarded, so you can tweak padding or tensor formats just like when calling `processor(...)`.
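
As a sketch of those options combined (assuming the `processor` and `model` from the examples above; the prompt list pairs one instruction with each clip, and the audio URLs are the same assets used earlier on this page):

```python
inputs = processor.apply_transcription_request(
    prompt=[
        "Transcribe the input speech.",
        "Transcribe the input speech.",
    ],
    audio=[
        "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
        "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3",
    ],
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=500)
print(processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True, strip_prefix=True))
```
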
## Long audio and windowing

**Important: the maximum audio length is 10 minutes.** Audio longer than this is truncated.

* The default setup processes 30-second windows at 16 kHz mono.
* **The processor enforces a hard limit of 20 windows per sample, i.e. a maximum of 10 minutes of audio (20 windows × 30 seconds).**
* For each window (see the sketch after this list):

  * `mel_len` is the padded mel length.
  * A conv stack reduces time as `conv_output_len = (mel_len - 1) // 2 + 1`.
  * Post-pool frames per window: `post_pool_len = (conv_output_len - 2) // 2 + 1`.
  * An audio placeholder token is expanded to the sum of `post_pool_len` across all windows.
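
A minimal sketch of this arithmetic, assuming every window is padded to the full 30-second mel length with a 160-sample hop (the helper name and those padding assumptions are illustrative, not part of the library API):

```python
def estimated_placeholder_tokens(num_samples, sampling_rate=16000, chunk_length=30, hop_length=160):
    # Number of 30-second windows, capped at the hard limit of 20.
    window_samples = chunk_length * sampling_rate
    num_windows = min(-(-num_samples // window_samples), 20)
    # Per-window frame counts, following the formulas above.
    mel_len = window_samples // hop_length            # 3000 mel frames for a full window
    conv_output_len = (mel_len - 1) // 2 + 1          # 1500 after the conv stack
    post_pool_len = (conv_output_len - 2) // 2 + 1    # 750 after average pooling (stride 2)
    return num_windows * post_pool_len

print(estimated_placeholder_tokens(10 * 60 * 16000))  # 10 minutes -> 20 windows -> 15000 placeholder slots
```
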
## Padding, attention, and caching

* **Left padding vs right padding**
  For generation with mixed prompt lengths in a batch, left padding is usually preferable (see the sketch after this list).
  For training, right padding is common; AF3's fusion mechanism itself is padding-agnostic because it replaces tokens in place.
* **Attention masks**
  The processor returns `attention_mask` (text) and `input_features_mask` (audio). The model builds an internal 4-D mask on the encoder's pre-pool axis with negative infinity at pad positions.
* **Caching**
  During generation, `input_features` and `input_features_mask` are only passed on the first step. Subsequent steps use cached keys/values from the language model.
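
A minimal sketch of left-padded batched generation, assuming the processor exposes its tokenizer as `processor.tokenizer` and forwards the `padding` kwarg to it (both are common Transformers conventions rather than something documented on this page):

```python
processor.tokenizer.padding_side = "left"  # pad prompts on the left so generated tokens line up on the right

inputs = processor.apply_chat_template(
    conversations,                 # a list of conversations, as in the batched example above
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    padding=True,                  # pad the batch to a common length
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=500)
decoded = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
```
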
## Troubleshooting

* Empty or truncated outputs when batching
  Use left padding for batched generation and decode only the new tokens after the prompt length, as shown in the quickstart.

## AudioFlamingo3Config

[[autodoc]] AudioFlamingo3Config

## AudioFlamingo3EncoderConfig

[[autodoc]] AudioFlamingo3EncoderConfig

## AudioFlamingo3Processor

[[autodoc]] AudioFlamingo3Processor

## AudioFlamingo3Encoder

[[autodoc]] AudioFlamingo3Encoder
    - forward

## AudioFlamingo3ForConditionalGeneration

[[autodoc]] AudioFlamingo3ForConditionalGeneration
    - forward

@@ -169,6 +169,9 @@ print("Pooled output shape:", pooled_output.shape)
[[autodoc]] DINOv3ViTModel
    - forward

## DINOv3ViTBackbone

[[autodoc]] DINOv3ViTBackbone

## DINOv3ConvNextModel

[[autodoc]] DINOv3ConvNextModel

@@ -329,7 +329,7 @@ from torchao.dtypes import Int4XPULayout
from torchao.quantization.quant_primitives import ZeroPointDomain

-quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT)
+quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT, int4_packing_format="plain_int32")
quantization_config = TorchAoConfig(quant_type=quant_config)

# Load and quantize the model

@@ -342,7 +342,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device)

# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")

@@ -395,7 +395,7 @@ from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int4WeightOnlyConfig
from torchao.dtypes import Int4CPULayout

-quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4CPULayout())
+quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4CPULayout(), int4_packing_format="opaque")
quantization_config = TorchAoConfig(quant_type=quant_config)

# Load and quantize the model

@@ -422,7 +422,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))

#### 1. Skip quantization for certain layers

-With `ModuleFqnToConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.
+With `FqnToConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.

```py
import torch

@@ -430,11 +430,11 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

model_id = "meta-llama/Llama-3.1-8B-Instruct"

-from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig
+from torchao.quantization import Int4WeightOnlyConfig, FqnToConfig
config = Int4WeightOnlyConfig(group_size=128)

# set default to int4 (for linears), and skip quantizing `model.layers.0.self_attn.q_proj`
-quant_config = ModuleFqnToConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
+quant_config = FqnToConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
quantization_config = TorchAoConfig(quant_type=quant_config)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16, quantization_config=quantization_config)
# lm_head is not quantized and model.layers.0.self_attn.q_proj is not quantized

@@ -459,7 +459,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

model_id = "facebook/opt-125m"

-from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType
+from torchao.quantization import Int4WeightOnlyConfig, FqnToConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType

weight_dtype = torch.int8
granularity = PerAxis(0)

@@ -470,7 +470,7 @@ embedding_config = IntxWeightOnlyConfig(
    mapping_type=mapping_type,
)
linear_config = Int8DynamicActivationInt4WeightConfig(group_size=128)
-quant_config = ModuleFqnToConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
+quant_config = FqnToConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
# set `include_embedding` to True in order to include embedding in quantization
# when `include_embedding` is True, we'll remove input embedding from `modules_not_to_convert` as well
quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True)

@@ -521,7 +521,7 @@ from torchao.quantization import (
    IntxWeightOnlyConfig,
    PerRow,
    PerAxis,
-    ModuleFqnToConfig,
+    FqnToConfig,
    Float8Tensor,
    Int4TilePackedTo4dTensor,
    IntxUnpackedToInt8Tensor,

@@ -550,7 +550,7 @@ qconfig_dict = {

    "_default": intxwo,
}
-quant_config = ModuleFqnToConfig(qconfig_dict)
+quant_config = FqnToConfig(qconfig_dict)
quantization_config = TorchAoConfig(quant_type=quant_config)
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,

@ -127,7 +127,7 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--use_slow_tokenizer",
|
||||
action="store_true",
|
||||
help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
|
||||
help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_train_batch_size",
|
||||
|
||||
@ -132,7 +132,7 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--use_slow_tokenizer",
|
||||
action="store_true",
|
||||
help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
|
||||
help="If passed, will use a slow tokenizer (not backed by the Tokenizers library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_train_batch_size",
|
||||
|
||||
@ -130,7 +130,7 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--use_slow_tokenizer",
|
||||
action="store_true",
|
||||
help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
|
||||
help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_train_batch_size",
|
||||
|
||||
@ -128,7 +128,7 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--use_slow_tokenizer",
|
||||
action="store_true",
|
||||
help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
|
||||
help="If passed, will use a slow tokenizer (not backed by the HuggingFace Tokenizers library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_train_batch_size",
|
||||
|
||||
@ -151,7 +151,7 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--use_slow_tokenizer",
|
||||
action="store_true",
|
||||
help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
|
||||
help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_train_batch_size",
|
||||
|
||||
@ -223,7 +223,7 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--use_slow_tokenizer",
|
||||
action="store_true",
|
||||
help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
|
||||
help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_train_batch_size",
|
||||
|
||||
@ -74,6 +74,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
|
||||
def tearDownClass(cls):
|
||||
shutil.rmtree(cls.tmpdir)
|
||||
|
||||
@slow
|
||||
@mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
|
||||
def test_run_glue_no_trainer(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
@ -147,6 +148,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
|
||||
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
|
||||
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "mlm_no_trainer")))
|
||||
|
||||
@slow
|
||||
@mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
|
||||
def test_run_ner_no_trainer(self):
|
||||
# with so little data distributed training needs more epochs to get the score on par with 0/1 gpu
|
||||
@ -175,6 +177,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
|
||||
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
|
||||
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "ner_no_trainer")))
|
||||
|
||||
@slow
|
||||
@mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
|
||||
def test_run_squad_no_trainer(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
@ -203,6 +206,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
|
||||
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
|
||||
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "qa_no_trainer")))
|
||||
|
||||
@slow
|
||||
@mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
|
||||
def test_run_swag_no_trainer(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
@ -305,6 +309,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
|
||||
result = get_results(tmp_dir)
|
||||
self.assertGreaterEqual(result["eval_overall_accuracy"], 0.10)
|
||||
|
||||
@slow
|
||||
@mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
|
||||
def test_run_image_classification_no_trainer(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
|
||||
@ -374,6 +374,7 @@ class ExamplesTests(TestCasePlus):
|
||||
result = get_results(tmp_dir)
|
||||
self.assertGreaterEqual(result["eval_bleu"], 30)
|
||||
|
||||
@slow
|
||||
def test_run_image_classification(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
testargs = f"""
|
||||
@ -403,6 +404,7 @@ class ExamplesTests(TestCasePlus):
|
||||
result = get_results(tmp_dir)
|
||||
self.assertGreaterEqual(result["eval_accuracy"], 0.8)
|
||||
|
||||
@slow
|
||||
def test_run_speech_recognition_ctc(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
testargs = f"""
|
||||
@ -573,6 +575,7 @@ class ExamplesTests(TestCasePlus):
|
||||
model = ViTMAEForPreTraining.from_pretrained(tmp_dir)
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
@slow
|
||||
def test_run_semantic_segmentation(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
testargs = f"""
|
||||
@ -597,6 +600,7 @@ class ExamplesTests(TestCasePlus):
|
||||
result = get_results(tmp_dir)
|
||||
self.assertGreaterEqual(result["eval_overall_accuracy"], 0.1)
|
||||
|
||||
@slow
|
||||
@patch.dict(os.environ, {"WANDB_DISABLED": "true"})
|
||||
def test_run_object_detection(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
@ -624,6 +628,7 @@ class ExamplesTests(TestCasePlus):
|
||||
result = get_results(tmp_dir)
|
||||
self.assertGreaterEqual(result["test_map"], 0.1)
|
||||
|
||||
@slow
|
||||
@patch.dict(os.environ, {"WANDB_DISABLED": "true"})
|
||||
def test_run_instance_segmentation(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
|
||||
@ -120,7 +120,7 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--use_slow_tokenizer",
|
||||
action="store_true",
|
||||
help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
|
||||
help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_train_batch_size",
|
||||
|
||||
@ -212,7 +212,7 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--use_slow_tokenizer",
|
||||
action="store_true",
|
||||
help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
|
||||
help="If passed, will use a slow tokenizer (not backed by the Hugging Face Tokenizers library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_train_batch_size",
|
||||
|
||||
@ -50,6 +50,7 @@ checkpoint: 检查点
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://huggingface.co/models"><img alt="Checkpoints on Hub" src="https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen"></a>
|
||||
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>
|
||||
<a href="https://huggingface.co/docs/transformers/index"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online"></a>
|
||||
@ -60,7 +61,7 @@ checkpoint: 检查点
|
||||
|
||||
<h4 align="center">
|
||||
<p>
|
||||
<a href="https://github.com/huggingface/transformers/">English</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README.md">English</a> |
|
||||
<b>简体中文</b> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_zh-hant.md">繁體中文</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ko.md">한국어</a> |
|
||||
@ -68,7 +69,7 @@ checkpoint: 检查点
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Рortuguês</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
|
||||
@ -81,182 +82,258 @@ checkpoint: 检查点
|
||||
</h4>
|
||||
|
||||
<h3 align="center">
|
||||
<p>为 Jax、PyTorch 和 TensorFlow 打造的先进的自然语言处理函数库</p>
|
||||
<p>为文本、视觉、音频、视频与多模态提供推理与训练的先进预训练模型</p>
|
||||
</h3>
|
||||
|
||||
<h3 align="center">
|
||||
<a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
|
||||
</h3>
|
||||
|
||||
🤗 Transformers 提供了数以千计的预训练模型,支持 100 多种语言的文本分类、信息抽取、问答、摘要、翻译、文本生成。它的宗旨是让最先进的 NLP 技术人人易用。
|
||||
Transformers 充当跨文本、计算机视觉、音频、视频与多模态的最先进机器学习模型的「模型定义框架」,同时覆盖推理与训练。
|
||||
|
||||
🤗 Transformers 提供了便于快速下载和使用的API,让你可以把预训练模型用在给定文本、在你的数据集上微调然后通过 [model hub](https://huggingface.co/models) 与社区共享。同时,每个定义的 Python 模块都是完全独立的,便于修改和快速进行研究实验。
|
||||
它将模型的定义集中化,使整个生态系统对该定义达成一致。`transformers` 是跨框架的枢纽:一旦某模型定义被支持,它通常就能兼容多数训练框架(如 Axolotl、Unsloth、DeepSpeed、FSDP、PyTorch‑Lightning 等)、推理引擎(如 vLLM、SGLang、TGI 等),以及依赖 `transformers` 模型定义的相关库(如 llama.cpp、mlx 等)。
|
||||
|
||||
🤗 Transformers 支持三个最热门的深度学习库: [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) 以及 [TensorFlow](https://www.tensorflow.org/) — 并与之无缝整合。你可以直接使用一个框架训练你的模型然后用另一个加载和推理。
|
||||
我们的目标是持续支持新的最先进模型,并通过让模型定义保持简单、可定制且高效来普及其使用。
|
||||
|
||||
## 在线演示
|
||||
|
||||
你可以直接在模型页面上测试大多数 [model hub](https://huggingface.co/models) 上的模型。 我们也提供了 [私有模型托管、模型版本管理以及推理API](https://huggingface.co/pricing)。
|
||||
|
||||
这里是一些例子:
|
||||
- [用 BERT 做掩码填词](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
|
||||
- [用 Electra 做命名实体识别](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
|
||||
- [用 GPT-2 做文本生成](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+)
|
||||
- [用 RoBERTa 做自然语言推理](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
|
||||
- [用 BART 做文本摘要](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
|
||||
- [用 DistilBERT 做问答](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
|
||||
- [用 T5 做翻译](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
|
||||
|
||||
**[Write With Transformer](https://transformer.huggingface.co)**,由 Hugging Face 团队打造,是一个文本生成的官方 demo。
|
||||
|
||||
## 如果你在寻找由 Hugging Face 团队提供的定制化支持服务
|
||||
|
||||
<a target="_blank" href="https://huggingface.co/support">
|
||||
<img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
|
||||
</a><br>
|
||||
|
||||
## 快速上手
|
||||
|
||||
我们为快速使用模型提供了 `pipeline` API。Pipeline 聚合了预训练模型和对应的文本预处理。下面是一个快速使用 pipeline 去判断正负面情绪的例子:
|
||||
|
||||
```python
|
||||
>>> from transformers import pipeline
|
||||
|
||||
# 使用情绪分析 pipeline
|
||||
>>> classifier = pipeline('sentiment-analysis')
|
||||
>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
|
||||
[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
|
||||
```
|
||||
|
||||
第二行代码下载并缓存了 pipeline 使用的预训练模型,而第三行代码则在给定的文本上进行了评估。这里的答案"正面" (positive) 具有 99 的置信度。
|
||||
|
||||
许多的 NLP 任务都有开箱即用的预训练 `pipeline`。比如说,我们可以轻松的从给定文本中抽取问题答案:
|
||||
|
||||
``` python
|
||||
>>> from transformers import pipeline
|
||||
|
||||
# 使用问答 pipeline
|
||||
>>> question_answerer = pipeline('question-answering')
|
||||
>>> question_answerer({
|
||||
... 'question': 'What is the name of the repository ?',
|
||||
... 'context': 'Pipeline has been included in the huggingface/transformers repository'
|
||||
... })
|
||||
{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}
|
||||
|
||||
```
|
||||
|
||||
除了给出答案,预训练模型还给出了对应的置信度分数、答案在词符化 (tokenized) 后的文本中开始和结束的位置。你可以从[这个教程](https://huggingface.co/docs/transformers/task_summary)了解更多 `pipeline` API 支持的任务。
|
||||
|
||||
要在你的任务上下载和使用任意预训练模型也很简单,只需三行代码。这里是 PyTorch 版的示例:
|
||||
```python
|
||||
>>> from transformers import AutoTokenizer, AutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
|
||||
|
||||
>>> inputs = tokenizer("Hello world!", return_tensors="pt")
|
||||
>>> outputs = model(**inputs)
|
||||
```
|
||||
这里是等效的 TensorFlow 代码:
|
||||
```python
|
||||
>>> from transformers import AutoTokenizer, TFAutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased")
|
||||
|
||||
>>> inputs = tokenizer("Hello world!", return_tensors="tf")
|
||||
>>> outputs = model(**inputs)
|
||||
```
|
||||
|
||||
词符化器 (tokenizer) 为所有的预训练模型提供了预处理,并可以直接对单个字符串进行调用(比如上面的例子)或对列表 (list) 调用。它会输出一个你可以在下游代码里使用或直接通过 `**` 解包表达式传给模型的词典 (dict)。
|
||||
|
||||
模型本身是一个常规的 [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) 或 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)(取决于你的后端),可以常规方式使用。 [这个教程](https://huggingface.co/transformers/training.html)解释了如何将这样的模型整合到经典的 PyTorch 或 TensorFlow 训练循环中,或是如何使用我们的 `Trainer` (训练器)API 来在一个新的数据集上快速微调。
|
||||
|
||||
## 为什么要用 transformers?
|
||||
|
||||
1. 便于使用的先进模型:
|
||||
- NLU 和 NLG 上表现优越
|
||||
- 对教学和实践友好且低门槛
|
||||
- 高级抽象,只需了解三个类
|
||||
- 对所有模型统一的API
|
||||
|
||||
1. 更低计算开销,更少的碳排放:
|
||||
- 研究人员可以分享已训练的模型而非每次从头开始训练
|
||||
- 工程师可以减少计算用时和生产环境开销
|
||||
- 数十种模型架构、两千多个预训练模型、100多种语言支持
|
||||
|
||||
1. 对于模型生命周期的每一个部分都面面俱到:
|
||||
- 训练先进的模型,只需 3 行代码
|
||||
- 模型在不同深度学习框架间任意转移,随你心意
|
||||
- 为训练、评估和生产选择最适合的框架,衔接无缝
|
||||
|
||||
1. 为你的需求轻松定制专属模型和用例:
|
||||
- 我们为每种模型架构提供了多个用例来复现原论文结果
|
||||
- 模型内部结构保持透明一致
|
||||
- 模型文件可单独使用,方便修改和快速实验
|
||||
|
||||
## 什么情况下我不该用 transformers?
|
||||
|
||||
- 本库并不是模块化的神经网络工具箱。模型文件中的代码特意呈若璞玉,未经额外抽象封装,以便研究人员快速迭代修改而不致溺于抽象和文件跳转之中。
|
||||
- `Trainer` API 并非兼容任何模型,只为本库之模型优化。若是在寻找适用于通用机器学习的训练循环实现,请另觅他库。
|
||||
- 尽管我们已尽力而为,[examples 目录](https://github.com/huggingface/transformers/tree/main/examples)中的脚本也仅为用例而已。对于你的特定问题,它们并不一定开箱即用,可能需要改几行代码以适之。
|
||||
目前在 [Hugging Face Hub](https://huggingface.com/models) 上有超过 1M+ 使用 `transformers` 的[模型检查点](https://huggingface.co/models?library=transformers&sort=trending),可随取随用。
|
||||
|
||||
今天就去探索 Hub,找到一个模型,并用 Transformers 立刻开始吧。
|
||||
|
||||
## 安装
|
||||
|
||||
### 使用 pip
|
||||
Transformers 支持 Python 3.9+,以及 [PyTorch](https://pytorch.org/get-started/locally/) 2.1+。
|
||||
|
||||
这个仓库已在 Python 3.9+、Flax 0.4.1+、PyTorch 2.1+ 和 TensorFlow 2.6+ 下经过测试。
|
||||
使用 [venv](https://docs.python.org/3/library/venv.html) 或 [uv](https://docs.astral.sh/uv/)(一个基于 Rust 的快速 Python 包与项目管理器)创建并激活虚拟环境:
|
||||
|
||||
你可以在[虚拟环境](https://docs.python.org/3/library/venv.html)中安装 🤗 Transformers。如果你还不熟悉 Python 的虚拟环境,请阅此[用户说明](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。
|
||||
|
||||
首先,用你打算使用的版本的 Python 创建一个虚拟环境并激活。
|
||||
|
||||
然后,你需要安装 Flax、PyTorch 或 TensorFlow 其中之一。关于在你使用的平台上安装这些框架,请参阅 [TensorFlow 安装页](https://www.tensorflow.org/install/), [PyTorch 安装页](https://pytorch.org/get-started/locally/#start-locally) 或 [Flax 安装页](https://github.com/google/flax#quick-install)。
|
||||
|
||||
当这些后端之一安装成功后, 🤗 Transformers 可依此安装:
|
||||
|
||||
```bash
|
||||
pip install transformers
|
||||
```py
|
||||
# venv
|
||||
python -m venv .my-env
|
||||
source .my-env/bin/activate
|
||||
# uv
|
||||
uv venv .my-env
|
||||
source .my-env/bin/activate
|
||||
```
|
||||
|
||||
如果你想要试试用例或者想在正式发布前使用最新的开发中代码,你得[从源代码安装](https://huggingface.co/docs/transformers/installation#installing-from-source)。
|
||||
在虚拟环境中安装 Transformers:
|
||||
|
||||
### 使用 conda
|
||||
```py
|
||||
# pip
|
||||
pip install "transformers[torch]"
|
||||
|
||||
🤗 Transformers 可以通过 conda 依此安装:
|
||||
|
||||
```shell script
|
||||
conda install conda-forge::transformers
|
||||
# uv
|
||||
uv pip install "transformers[torch]"
|
||||
```
|
||||
|
||||
> **_笔记:_** 从 `huggingface` 渠道安装 `transformers` 已被废弃。
|
||||
如果你需要库中的最新改动或计划参与贡献,可从源码安装(注意:最新版可能不稳定;如遇错误,欢迎在 [issues](https://github.com/huggingface/transformers/issues) 中反馈):
|
||||
|
||||
要通过 conda 安装 Flax、PyTorch 或 TensorFlow 其中之一,请参阅它们各自安装页的说明。
|
||||
```shell
|
||||
git clone https://github.com/huggingface/transformers.git
|
||||
cd transformers
|
||||
|
||||
## 模型架构
|
||||
# pip
|
||||
pip install '.[torch]'
|
||||
|
||||
🤗 Transformers 支持的[**所有的模型检查点**](https://huggingface.co/models)由[用户](https://huggingface.co/users)和[组织](https://huggingface.co/organizations)上传,均与 huggingface.co [model hub](https://huggingface.co) 无缝整合。
|
||||
# uv
|
||||
uv pip install '.[torch]'
|
||||
```
|
||||
|
||||
目前的检查点数量: 
|
||||
## 快速上手
|
||||
|
||||
🤗 Transformers 目前支持如下的架构: 模型概述请阅[这里](https://huggingface.co/docs/transformers/model_summary).
|
||||
使用 [Pipeline](https://huggingface.co/docs/transformers/pipeline_tutorial) API 一步上手。`Pipeline` 是一个高级推理类,支持文本、音频、视觉与多模态任务,负责输入预处理并返回适配的输出。
|
||||
|
||||
要检查某个模型是否已有 Flax、PyTorch 或 TensorFlow 的实现,或其是否在 🤗 Tokenizers 库中有对应词符化器(tokenizer),敬请参阅[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。
|
||||
实例化一个用于文本生成的 pipeline,指定使用的模型。模型会被下载并缓存,方便复用。最后传入文本作为提示:
|
||||
|
||||
这些实现均已于多个数据集测试(请参看用例脚本)并应于原版实现表现相当。你可以在用例文档的[此节](https://huggingface.co/docs/transformers/examples)中了解表现的细节。
|
||||
```py
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(task="text-generation", model="Qwen/Qwen2.5-1.5B")
|
||||
pipeline("the secret to baking a really good cake is ")
|
||||
[{'generated_text': 'the secret to baking a really good cake is 1) to use the right ingredients and 2) to follow the recipe exactly. the recipe for the cake is as follows: 1 cup of sugar, 1 cup of flour, 1 cup of milk, 1 cup of butter, 1 cup of eggs, 1 cup of chocolate chips. if you want to make 2 cakes, how much sugar do you need? To make 2 cakes, you will need 2 cups of sugar.'}]
|
||||
```
|
||||
|
||||
要与模型进行「聊天」,用法也一致。唯一不同是需要构造一段「聊天历史」(即 `Pipeline` 的输入):
|
||||
|
||||
> [!TIP]
|
||||
> 你也可以直接在命令行与模型聊天:
|
||||
> ```shell
|
||||
> transformers chat Qwen/Qwen2.5-0.5B-Instruct
|
||||
> ```
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
chat = [
|
||||
{"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
|
||||
{"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
|
||||
]
|
||||
|
||||
pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, device_map="auto")
|
||||
response = pipeline(chat, max_new_tokens=512)
|
||||
print(response[0]["generated_text"][-1]["content"])
|
||||
```
|
||||
|
||||
展开下方示例,查看 `Pipeline` 在不同模态与任务中的用法。
|
||||
|
||||
<details>
|
||||
<summary>自动语音识别</summary>
|
||||
|
||||
```py
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3")
|
||||
pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
|
||||
{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>图像分类</summary>
|
||||
|
||||
<h3 align="center">
|
||||
<a><img src="https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"></a>
|
||||
</h3>
|
||||
|
||||
```py
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(task="image-classification", model="facebook/dinov2-small-imagenet1k-1-layer")
|
||||
pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
|
||||
[{"label": "macaw", "score": 0.997848391532898},
|
||||
{"label": "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita",
|
||||
"score": 0.0016551691805943847},
|
||||
{"label": "lorikeet", "score": 0.00018523589824326336},
|
||||
{"label": "African grey, African gray, Psittacus erithacus",
|
||||
"score": 7.85409429227002e-05},
|
||||
{"label": "quail", "score": 5.502637941390276e-05}]
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>视觉问答</summary>
|
||||
|
||||
<h3 align="center">
|
||||
<a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg"></a>
|
||||
</h3>
|
||||
|
||||
```py
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base")
|
||||
pipeline(
|
||||
image="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg",
|
||||
question="What is in the image?",
|
||||
)
|
||||
[{"answer": "statue of liberty"}]
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## 为什么要用 Transformers?
|
||||
|
||||
1. 易于使用的最先进模型:
|
||||
- 在自然语言理解与生成、计算机视觉、音频、视频与多模态任务上表现优越。
|
||||
- 对研究者、工程师与开发者友好且低门槛。
|
||||
- 少量用户侧抽象,仅需学习三个类。
|
||||
- 统一的 API,使用所有预训练模型体验一致。
|
||||
|
||||
1. 更低计算开销与更小碳足迹:
|
||||
- 共享已训练的模型,而非每次从零开始训练。
|
||||
- 减少计算时间与生产环境成本。
|
||||
- 覆盖数十种模型架构,跨所有模态提供 1M+ 预训练检查点。
|
||||
|
||||
1. 在模型生命周期的每个阶段都可以选用合适的框架:
|
||||
- 3 行代码即可训练最先进模型。
|
||||
- 在 PyTorch/JAX/TF2.0 间自由迁移同一个模型。
|
||||
- 为训练、评估与生产挑选最合适的框架。
|
||||
|
||||
1. 轻松定制模型或用例:
|
||||
- 为每个架构提供示例以复现原论文结果。
|
||||
- 尽可能一致地暴露模型内部。
|
||||
- 模型文件可独立于库使用,便于快速实验。
|
||||
|
||||
<a target="_blank" href="https://huggingface.co/enterprise">
|
||||
<img alt="Hugging Face Enterprise Hub" src="https://github.com/user-attachments/assets/247fb16d-d251-4583-96c4-d3d76dda4925">
|
||||
</a><br>
|
||||
|
||||
## 为什么我不该用 Transformers?
|
||||
|
||||
- 该库不是一个可自由拼搭的神经网络模块化工具箱。模型文件中的代码刻意减少额外抽象,以便研究者能快速在各个模型上迭代,而无需深入更多抽象或文件跳转。
|
||||
- 训练 API 优化用于 Transformers 提供的 PyTorch 模型。若需要通用的机器学习训练循环,请使用其它库,如 [Accelerate](https://huggingface.co/docs/accelerate)。
|
||||
- [示例脚本](https://github.com/huggingface/transformers/tree/main/examples)只是「示例」。它们不一定能直接适配你的具体用例,需要你进行必要的改动。
|
||||
|
||||
|
||||
## 了解更多
|
||||
## 100 个使用 Transformers 的项目
|
||||
|
||||
| 章节 | 描述 |
|
||||
|-|-|
|
||||
| [文档](https://huggingface.co/docs/transformers/) | 完整的 API 文档和教程 |
|
||||
| [任务总结](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers 支持的任务 |
|
||||
| [预处理教程](https://huggingface.co/docs/transformers/preprocessing) | 使用 `Tokenizer` 来为模型准备数据 |
|
||||
| [训练和微调](https://huggingface.co/docs/transformers/training) | 在 PyTorch/TensorFlow 的训练循环或 `Trainer` API 中使用 🤗 Transformers 提供的模型 |
|
||||
| [快速上手:微调和用例脚本](https://github.com/huggingface/transformers/tree/main/examples) | 为各种任务提供的用例脚本 |
|
||||
| [模型分享和上传](https://huggingface.co/docs/transformers/model_sharing) | 和社区上传和分享你微调的模型 |
|
||||
| [迁移](https://huggingface.co/docs/transformers/migration) | 从 `pytorch-transformers` 或 `pytorch-pretrained-bert` 迁移到 🤗 Transformers |
|
||||
Transformers 不止是一个使用预训练模型的工具包,它还是围绕 Hugging Face Hub 构建的项目社区。我们希望 Transformers 能助力开发者、研究人员、学生、老师、工程师与任何人构建理想项目。
|
||||
|
||||
为庆祝 Transformers 获得 100,000 颗星,我们制作了 [awesome-transformers](./awesome-transformers.md) 页面,展示了 100 个由社区构建的优秀项目。
|
||||
|
||||
如果你拥有或使用某个项目,认为它应该在列表中出现,欢迎提交 PR 添加它!
|
||||
|
||||
## 示例模型
|
||||
|
||||
你可以直接在它们的 [Hub 模型页](https://huggingface.co/models) 上测试我们的多数模型。
|
||||
|
||||
展开每个模态以查看不同用例中的部分示例模型。
|
||||
|
||||
<details>
|
||||
<summary>音频</summary>
|
||||
|
||||
- 使用 [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo) 进行音频分类
|
||||
- 使用 [Moonshine](https://huggingface.co/UsefulSensors/moonshine) 进行自动语音识别
|
||||
- 使用 [Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks) 进行关键词检索
|
||||
- 使用 [Moshi](https://huggingface.co/kyutai/moshiko-pytorch-bf16) 进行语音到语音生成
|
||||
- 使用 [MusicGen](https://huggingface.co/facebook/musicgen-large) 文本到音频生成
|
||||
- 使用 [Bark](https://huggingface.co/suno/bark) 文本到语音生成
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>计算机视觉</summary>
|
||||
|
||||
- 使用 [SAM](https://huggingface.co/facebook/sam-vit-base) 自动生成掩码
|
||||
- 使用 [DepthPro](https://huggingface.co/apple/DepthPro-hf) 进行深度估计
|
||||
- 使用 [DINO v2](https://huggingface.co/facebook/dinov2-base) 进行图像分类
|
||||
- 使用 [SuperPoint](https://huggingface.co/magic-leap-community/superpoint) 进行关键点检测
|
||||
- 使用 [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor) 进行关键点匹配
|
||||
- 使用 [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd) 进行目标检测
|
||||
- 使用 [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple) 进行姿态估计
|
||||
- 使用 [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large) 进行通用分割
|
||||
- 使用 [VideoMAE](https://huggingface.co/MCG-NJU/videomae-large) 进行视频分类
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>多模态</summary>
|
||||
|
||||
- 使用 [Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B) 实现音频或文本到文本
|
||||
- 使用 [LayoutLMv3](https://huggingface.co/microsoft/layoutlmv3-base) 进行文档问答
|
||||
- 使用 [Qwen-VL](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) 实现图像或文本到文本
|
||||
- 使用 [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) 进行图文描述
|
||||
- 使用 [GOT-OCR2](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf) 进行基于 OCR 的文档理解
|
||||
- 使用 [TAPAS](https://huggingface.co/google/tapas-base) 进行表格问答
|
||||
- 使用 [Emu3](https://huggingface.co/BAAI/Emu3-Gen) 进行统一的多模态理解与生成
|
||||
- 使用 [Llava-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) 视觉到文本
|
||||
- 使用 [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf) 进行视觉问答
|
||||
- 使用 [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224) 进行视觉指代表达分割
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>NLP</summary>
|
||||
|
||||
- 使用 [ModernBERT](https://huggingface.co/answerdotai/ModernBERT-base) 进行掩码词填充
|
||||
- 使用 [Gemma](https://huggingface.co/google/gemma-2-2b) 进行命名实体识别(NER)
|
||||
- 使用 [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) 进行问答
|
||||
- 使用 [BART](https://huggingface.co/facebook/bart-large-cnn) 进行摘要
|
||||
- 使用 [T5](https://huggingface.co/google-t5/t5-base) 进行翻译
|
||||
- 使用 [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B) 进行文本生成
|
||||
- 使用 [Qwen](https://huggingface.co/Qwen/Qwen2.5-0.5B) 进行文本分类
|
||||
|
||||
</details>
|
||||
|
||||
## 引用
|
||||
|
||||
|
||||
@ -14,43 +14,6 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!---
|
||||
A useful guide for English-Traditional Chinese translation of Hugging Face documentation
|
||||
- Add space around English words and numbers when they appear between Chinese characters. E.g., 共 100 多種語言; 使用 transformers 函式庫。
|
||||
- Use square quotes, e.g.,「引用」
|
||||
- Some of terms in the file can be found at National Academy for Educational Research (https://terms.naer.edu.tw/), an official website providing bilingual translations between English and Traditional Chinese.
|
||||
|
||||
Dictionary
|
||||
|
||||
API: API (不翻譯)
|
||||
add: 加入
|
||||
checkpoint: 檢查點
|
||||
code: 程式碼
|
||||
community: 社群
|
||||
confidence: 信賴度
|
||||
dataset: 資料集
|
||||
documentation: 文件
|
||||
example: 基本翻譯為「範例」,或依語意翻為「例子」
|
||||
finetune: 微調
|
||||
Hugging Face: Hugging Face(不翻譯)
|
||||
implementation: 實作
|
||||
inference: 推論
|
||||
library: 函式庫
|
||||
module: 模組
|
||||
NLP/Natural Language Processing: 以 NLP 出現時不翻譯,以 Natural Language Processing 出現時翻譯為自然語言處理
|
||||
online demos: 線上Demo
|
||||
pipeline: pipeline(不翻譯)
|
||||
pretrained/pretrain: 預訓練
|
||||
Python data structures (e.g., list, set, dict): 翻譯為串列,集合,字典,並用括號標註原英文
|
||||
repository: repository(不翻譯)
|
||||
summary: 概覽
|
||||
token-: token-(不翻譯)
|
||||
Trainer: Trainer(不翻譯)
|
||||
transformer: transformer(不翻譯)
|
||||
tutorial: 教學
|
||||
user: 使用者
|
||||
-->
|
||||
|
||||
<p align="center">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
|
||||
@ -62,6 +25,7 @@ user: 使用者
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://huggingface.com/models"><img alt="Checkpoints on Hub" src="https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen"></a>
|
||||
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>
|
||||
<a href="https://huggingface.co/docs/transformers/index"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online"></a>
|
||||
@ -72,7 +36,7 @@ user: 使用者
|
||||
|
||||
<h4 align="center">
|
||||
<p>
|
||||
<a href="https://github.com/huggingface/transformers/">English</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/README.md">English</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_zh-hans.md">简体中文</a> |
|
||||
<b>繁體中文</b> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ko.md">한국어</a> |
|
||||
@ -80,7 +44,7 @@ user: 使用者
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Рortuguês</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
|
||||
@ -93,186 +57,261 @@ user: 使用者
|
||||
</h4>
|
||||
|
||||
<h3 align="center">
|
||||
<p>為 Jax、PyTorch 以及 TensorFlow 打造的先進自然語言處理函式庫</p>
|
||||
<p>最先進的預訓練模型,專為推理與訓練而生</p>
|
||||
</h3>
|
||||
|
||||
<h3 align="center">
|
||||
<a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
|
||||
</h3>
|
||||
|
||||
🤗 Transformers 提供了數以千計的預訓練模型,支援 100 多種語言的文本分類、資訊擷取、問答、摘要、翻譯、文本生成。它的宗旨是讓最先進的 NLP 技術人人易用。
|
||||
Transformers 是一個為最先進的機器學習模型(涵蓋文字、電腦視覺、音訊、影片及多模態)提供推理和訓練支援的模型定義框架。
|
||||
|
||||
🤗 Transformers 提供了便於快速下載和使用的API,讓你可以將預訓練模型用在給定文本、在你的資料集上微調然後經由 [model hub](https://huggingface.co/models) 與社群共享。同時,每個定義的 Python 模組架構均完全獨立,方便修改和快速研究實驗。
|
||||
它將模型定義集中化,使得該定義在整個生態系中能夠達成共識。`transformers` 是貫穿各個框架的樞紐:如果一個模型定義受到支援,它將與大多數訓練框架(如 Axolotl、Unsloth、DeepSpeed、FSDP、PyTorch-Lightning 等)、推理引擎(如 vLLM、SGLang、TGI 等)以及利用 `transformers` 模型定義的周邊建模函式庫(如 llama.cpp、mlx 等)相容。
|
||||
|
||||
🤗 Transformers 支援三個最熱門的深度學習函式庫: [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) 以及 [TensorFlow](https://www.tensorflow.org/) — 並與之完美整合。你可以直接使用其中一個框架訓練你的模型,然後用另一個載入和推論。
|
||||
我們致力於支援最新的頂尖模型,並透過使其模型定義變得簡單、可客製化且高效,來普及它們的應用。
|
||||
|
||||
## 線上Demo
|
||||
在 [Hugging Face Hub](https://huggingface.com/models) 上,有超過 100 萬個 Transformers [模型檢查點](https://huggingface.co/models?library=transformers&sort=trending) 供您使用。
|
||||
|
||||
你可以直接在 [model hub](https://huggingface.co/models) 上測試大多數的模型。我們也提供了 [私有模型託管、模型版本管理以及推論API](https://huggingface.co/pricing)。
|
||||
|
||||
這裡是一些範例:
|
||||
- [用 BERT 做遮蓋填詞](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
|
||||
- [用 Electra 做專有名詞辨識](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
|
||||
- [用 GPT-2 做文本生成](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+)
|
||||
- [用 RoBERTa 做自然語言推論](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
|
||||
- [用 BART 做文本摘要](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
|
||||
- [用 DistilBERT 做問答](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
|
||||
- [用 T5 做翻譯](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
|
||||
|
||||
**[Write With Transformer](https://transformer.huggingface.co)**,由 Hugging Face 團隊所打造,是一個文本生成的官方 demo。
|
||||
|
||||
## 如果你在尋找由 Hugging Face 團隊所提供的客製化支援服務
|
||||
|
||||
<a target="_blank" href="https://huggingface.co/support">
|
||||
<img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
|
||||
</a><br>
|
||||
|
||||
## 快速上手
|
||||
|
||||
我們為快速使用模型提供了 `pipeline` API。 Pipeline 包含了預訓練模型和對應的文本預處理。下面是一個快速使用 pipeline 去判斷正負面情緒的例子:
|
||||
|
||||
```python
|
||||
>>> from transformers import pipeline
|
||||
|
||||
# 使用情緒分析 pipeline
|
||||
>>> classifier = pipeline('sentiment-analysis')
|
||||
>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
|
||||
[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
|
||||
```
|
||||
|
||||
第二行程式碼下載並快取 pipeline 使用的預訓練模型,而第三行程式碼則在給定的文本上進行了評估。這裡的答案「正面」(positive) 具有 99.97% 的信賴度。
|
||||
|
||||
許多的 NLP 任務都有隨選即用的預訓練 `pipeline`。例如,我們可以輕鬆地從給定文本中擷取問題答案:
|
||||
|
||||
``` python
|
||||
>>> from transformers import pipeline
|
||||
|
||||
# 使用問答 pipeline
|
||||
>>> question_answerer = pipeline('question-answering')
|
||||
>>> question_answerer({
|
||||
... 'question': 'What is the name of the repository ?',
|
||||
... 'context': 'Pipeline has been included in the huggingface/transformers repository'
|
||||
... })
|
||||
{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}
|
||||
|
||||
```
|
||||
|
||||
除了提供問題解答,預訓練模型還提供了對應的信賴度分數以及解答在 tokenized 後的文本中開始和結束的位置。你可以從[這個教學](https://huggingface.co/docs/transformers/task_summary)了解更多 `pipeline` API支援的任務。
|
||||
|
||||
要在你的任務中下載和使用任何預訓練模型很簡單,只需三行程式碼。這裡是 PyTorch 版的範例:
|
||||
```python
|
||||
>>> from transformers import AutoTokenizer, AutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
|
||||
|
||||
>>> inputs = tokenizer("Hello world!", return_tensors="pt")
|
||||
>>> outputs = model(**inputs)
|
||||
```
|
||||
這裡是對應的 TensorFlow 程式碼:
|
||||
```python
|
||||
>>> from transformers import AutoTokenizer, TFAutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased")
|
||||
|
||||
>>> inputs = tokenizer("Hello world!", return_tensors="tf")
|
||||
>>> outputs = model(**inputs)
|
||||
```
|
||||
|
||||
Tokenizer 為所有的預訓練模型提供了預處理,並可以直接轉換單一字串(比如上面的例子)或串列 (list)。它會輸出一個字典 (dict),讓你可以在下游程式碼裡使用,或直接藉由 `**` 運算式傳給模型。
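
下面是一個小示意,展示如何直接對字串串列 (list) 進行預處理,並以 `**` 運算式將結果傳給模型(模型名稱僅為示範用途):

```python
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased")

# 直接轉換一個字串串列 (list),padding=True 會將較短的句子補齊
batch = tokenizer(["Hello world!", "Transformers is great."], padding=True, return_tensors="pt")

# 輸出是一個字典 (dict),可直接以 ** 運算式展開後傳給模型
outputs = model(**batch)
```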
模型本身是一個常規的 [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) 或 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)(取決於你的後端),可依常規方式使用。 [這個教學](https://huggingface.co/transformers/training.html)解釋了如何將這樣的模型整合到一般的 PyTorch 或 TensorFlow 訓練迴圈中,或是如何使用我們的 `Trainer` API 在一個新的資料集上快速進行微調。
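
以下是一個使用 `Trainer` 進行微調的最小示意(假設已安裝 `datasets` 函式庫;資料集與超參數僅為示範用途):

```python
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2)

# 建立一個極小的示意資料集並完成 tokenize
dataset = Dataset.from_dict({"text": ["I love this!", "This is terrible."], "label": [1, 0]})
dataset = dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=32))

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="tmp_trainer", num_train_epochs=1, per_device_train_batch_size=2),
    train_dataset=dataset,
)
trainer.train()
```
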
## 為什麼要用 transformers?
|
||||
|
||||
1. 便於使用的先進模型:
|
||||
- NLU 和 NLG 上性能卓越
|
||||
- 對教學和實作友好且低門檻
|
||||
- 高度抽象,使用者只須學習 3 個類別
|
||||
- 對所有模型使用的制式化API
|
||||
|
||||
1. 更低的運算成本,更少的碳排放:
|
||||
- 研究人員可以分享已訓練的模型而非每次從頭開始訓練
|
||||
- 工程師可以減少計算時間以及生產成本
|
||||
- 數十種模型架構、兩千多個預訓練模型、100多種語言支援
|
||||
|
||||
1. 對於模型生命週期的每一個部分都面面俱到:
|
||||
- 訓練先進的模型,只需 3 行程式碼
|
||||
- 模型可以在不同深度學習框架之間任意轉換
|
||||
- 為訓練、評估和生產選擇最適合的框架,並完美銜接
|
||||
|
||||
1. 為你的需求輕鬆客製化專屬模型和範例:
|
||||
- 我們為每種模型架構提供了多個範例來重現原論文結果
|
||||
- 一致的模型內部架構
|
||||
- 模型檔案可單獨使用,便於修改和快速實驗
|
||||
|
||||
## 什麼情況下我不該用 transformers?
|
||||
|
||||
- 本函式庫並不是模組化的神經網絡工具箱。模型文件中的程式碼並未做額外的抽象封裝,以便研究人員快速地翻閱及修改程式碼,而不會深陷複雜的類別包裝之中。
|
||||
- `Trainer` API 並非相容任何模型,它只為本函式庫中的模型最佳化。對於一般的機器學習用途,請使用其他函式庫。
|
||||
- 儘管我們已盡力而為,[examples 目錄](https://github.com/huggingface/transformers/tree/main/examples)中的腳本也僅為範例而已。對於特定問題,它們並不一定隨選即用,可能需要修改幾行程式碼以符合需求。
|
||||
立即探索 [Hub](https://huggingface.com/),尋找合適的模型,並使用 Transformers 幫助您快速上手。
|
||||
|
||||
## 安裝
|
||||
|
||||
### 使用 pip
|
||||
Transformers 支援 Python 3.9+ 和 [PyTorch](https://pytorch.org/get-started/locally/) 2.1+。
|
||||
|
||||
這個 Repository 已在 Python 3.9+、Flax 0.4.1+、PyTorch 2.1+ 和 TensorFlow 2.6+ 下經過測試。
|
||||
使用 [venv](https://docs.python.org/3/library/venv.html) 或基於 Rust 的高速 Python 套件及專案管理器 [uv](https://docs.astral.sh/uv/) 來建立並啟用虛擬環境。
|
||||
|
||||
你可以在[虛擬環境](https://docs.python.org/3/library/venv.html)中安裝 🤗 Transformers。如果你還不熟悉 Python 的虛擬環境,請閱此[使用者指引](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。
|
||||
|
||||
首先,用你打算使用的版本的 Python 創建一個虛擬環境並進入。
|
||||
|
||||
然後,你需要安裝 Flax、PyTorch 或 TensorFlow 其中之一。對於該如何在你使用的平台上安裝這些框架,請參閱 [TensorFlow 安裝頁面](https://www.tensorflow.org/install/), [PyTorch 安裝頁面](https://pytorch.org/get-started/locally/#start-locally) 或 [Flax 安裝頁面](https://github.com/google/flax#quick-install)。
|
||||
|
||||
當其中一個後端安裝成功後,🤗 Transformers 可依此安裝:
|
||||
|
||||
```bash
|
||||
pip install transformers
|
||||
```py
|
||||
# venv
|
||||
python -m venv .my-env
|
||||
source .my-env/bin/activate
|
||||
# uv
|
||||
uv venv .my-env
|
||||
source .my-env/bin/activate
|
||||
```
|
||||
|
||||
如果你想要試試範例或者想在正式發布前使用最新開發中的程式碼,你必須[從原始碼安裝](https://huggingface.co/docs/transformers/installation#installing-from-source)。
|
||||
在您的虛擬環境中安裝 Transformers。
|
||||
|
||||
### 使用 conda
|
||||
```py
|
||||
# pip
|
||||
pip install "transformers[torch]"
|
||||
|
||||
🤗 Transformers 可以藉由 conda 依此安裝:
|
||||
|
||||
```shell script
|
||||
conda install conda-forge::transformers
|
||||
# uv
|
||||
uv pip install "transformers[torch]"
|
||||
```
|
||||
|
||||
> **_筆記:_** 從 `huggingface` 頻道安裝 `transformers` 已被淘汰。
|
||||
如果您想使用函式庫的最新變更或有興趣參與貢獻,可以從原始碼安裝 Transformers。然而,*最新*版本可能不穩定。如果您遇到任何錯誤,歡迎隨時提交一個 [issue](https://github.com/huggingface/transformers/issues)。
|
||||
|
||||
要藉由 conda 安裝 Flax、PyTorch 或 TensorFlow 其中之一,請參閱它們各自安裝頁面的說明。
|
||||
```shell
|
||||
git clone https://github.com/huggingface/transformers.git
|
||||
cd transformers
|
||||
|
||||
## 模型架構
|
||||
# pip
|
||||
pip install '.[torch]'
|
||||
|
||||
**🤗 Transformers 支援的[所有的模型檢查點](https://huggingface.co/models)**,由[使用者](https://huggingface.co/users)和[組織](https://huggingface.co/organizations)上傳,均與 huggingface.co [model hub](https://huggingface.co) 完美結合。
|
||||
# uv
|
||||
uv pip install '.[torch]'
|
||||
```
|
||||
|
||||
目前的檢查點數量: 
|
||||
## 快速入門
|
||||
|
||||
🤗 Transformers 目前支援以下的架構: 模型概覽請參閱[這裡](https://huggingface.co/docs/transformers/model_summary).
|
||||
透過 [Pipeline](https://huggingface.co/docs/transformers/pipeline_tutorial) API 快速開始使用 Transformers。`Pipeline` 是一個高階的推理類別,支援文字、音訊、視覺和多模態任務。它負責處理輸入資料的預處理,並回傳適當的輸出。
|
||||
|
||||
要檢查某個模型是否已有 Flax、PyTorch 或 TensorFlow 的實作,或其是否在🤗 Tokenizers 函式庫中有對應的 tokenizer,敬請參閱[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。
|
||||
實例化一個 pipeline 並指定用於文字生成的模型。該模型會被下載並快取,方便您之後輕鬆複用。最後,傳入一些文字來提示模型。
|
||||
|
||||
這些實作均已於多個資料集測試(請參閱範例腳本)並應與原版實作表現相當。你可以在範例文件的[此節](https://huggingface.co/docs/transformers/examples)中了解實作的細節。
|
||||
```py
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(task="text-generation", model="Qwen/Qwen2.5-1.5B")
|
||||
pipeline("the secret to baking a really good cake is ")
|
||||
[{'generated_text': 'the secret to baking a really good cake is 1) to use the right ingredients and 2) to follow the recipe exactly. the recipe for the cake is as follows: 1 cup of sugar, 1 cup of flour, 1 cup of milk, 1 cup of butter, 1 cup of eggs, 1 cup of chocolate chips. if you want to make 2 cakes, how much sugar do you need? To make 2 cakes, you will need 2 cups of sugar.'}]
|
||||
```
|
||||
|
||||
## 了解更多
|
||||
與模型進行聊天,使用模式是相同的。唯一的區別是您需要建構一個您與系統之間的聊天歷史(作為 `Pipeline` 的輸入)。
|
||||
|
||||
| 章節 | 描述 |
|
||||
|-|-|
|
||||
| [文件](https://huggingface.co/transformers/) | 完整的 API 文件和教學 |
|
||||
| [任務概覽](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers 支援的任務 |
|
||||
| [預處理教學](https://huggingface.co/docs/transformers/preprocessing) | 使用 `Tokenizer` 來為模型準備資料 |
|
||||
| [訓練和微調](https://huggingface.co/docs/transformers/training) | 使用 PyTorch/TensorFlow 的內建的訓練方式或於 `Trainer` API 中使用 🤗 Transformers 提供的模型 |
|
||||
| [快速上手:微調和範例腳本](https://github.com/huggingface/transformers/tree/main/examples) | 為各種任務提供的範例腳本 |
|
||||
| [模型分享和上傳](https://huggingface.co/docs/transformers/model_sharing) | 上傳並與社群分享你微調的模型 |
|
||||
| [遷移](https://huggingface.co/docs/transformers/migration) | 從 `pytorch-transformers` 或 `pytorch-pretrained-bert` 遷移到 🤗 Transformers |
|
||||
> [!TIP]
|
||||
> 你也可以直接在命令列中與模型聊天。
|
||||
> ```shell
|
||||
> transformers chat Qwen/Qwen2.5-0.5B-Instruct
|
||||
> ```
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
chat = [
|
||||
{"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
|
||||
{"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
|
||||
]
|
||||
|
||||
pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, device_map="auto")
|
||||
response = pipeline(chat, max_new_tokens=512)
|
||||
print(response[0]["generated_text"][-1]["content"])
|
||||
```
|
||||
|
||||
展開下面的範例,查看 `Pipeline` 如何在不同模態和任務上運作。
|
||||
|
||||
<details>
|
||||
<summary>自動語音辨識</summary>
|
||||
|
||||
```py
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3")
|
||||
pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
|
||||
{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>影像分類</summary>
|
||||
|
||||
<h3 align="center">
|
||||
<a><img src="https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"></a>
|
||||
</h3>
|
||||
|
||||
```py
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(task="image-classification", model="facebook/dinov2-small-imagenet1k-1-layer")
|
||||
pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
|
||||
[{'label': 'macaw', 'score': 0.997848391532898},
|
||||
{'label': 'sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita',
|
||||
'score': 0.0016551691805943847},
|
||||
{'label': 'lorikeet', 'score': 0.00018523589824326336},
|
||||
{'label': 'African grey, African gray, Psittacus erithacus',
|
||||
'score': 7.85409429227002e-05},
|
||||
{'label': 'quail', 'score': 5.502637941390276e-05}]
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>視覺問答</summary>
|
||||
|
||||
<h3 align="center">
|
||||
<a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg"></a>
|
||||
</h3>
|
||||
|
||||
```py
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base")
|
||||
pipeline(
|
||||
image="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg",
|
||||
question="What is in the image?",
|
||||
)
|
||||
[{'answer': 'statue of liberty'}]
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## 為什麼我應該使用 Transformers?
|
||||
|
||||
1. 易於使用的最先進模型:
|
||||
* 在自然語言理解與生成、電腦視覺、音訊、影片和多模態任務上表現卓越。
|
||||
* 為研究人員、工程師與開發者提供了低門檻的入門途徑。
|
||||
* 面向使用者的抽象層級少,只需學習三個核心類別。
|
||||
* 為所有預訓練模型提供了統一的 API 介面。
|
||||
|
||||
2. 更低的運算成本,更小的碳足跡:
|
||||
* 分享訓練好的模型,而不是從零開始訓練。
|
||||
* 減少運算時間和生產成本。
|
||||
* 擁有數十種模型架構和超過100萬個橫跨所有模態的預訓練檢查點。
|
||||
|
||||
3. 為模型的每個生命週期階段選擇合適的框架:
|
||||
* 僅用3行程式碼即可訓練最先進的模型。
|
||||
* 在PyTorch/JAX/TF2.0框架之間輕鬆切換單一模型。
|
||||
* 為訓練、評估和生產選擇最合適的框架。
|
||||
|
||||
4. 輕鬆根據您的需求客製化模型或範例:
|
||||
* 我們為每個架構提供了範例,以重現其原作者發表的結果。
|
||||
* 模型內部結構盡可能保持一致地暴露給使用者。
|
||||
* 模型檔案可以獨立於函式庫使用,便於快速實驗。
|
||||
|
||||
<a target="_blank" href="https://huggingface.co/enterprise">
|
||||
<img alt="Hugging Face Enterprise Hub" src="https://github.com/user-attachments/assets/247fb16d-d251-4583-96c4-d3d76dda4925">
|
||||
</a><br>
|
||||
|
||||
## 為什麼我不應該使用 Transformers?
|
||||
|
||||
- 本函式庫並非一個用於建構神經網路的模組化工具箱。模型檔案中的程式碼為了讓研究人員能快速在模型上迭代,而沒有進行過度的抽象重構,避免了深入額外的抽象層/檔案。
|
||||
- 訓練 API 針對 Transformers 提供的 PyTorch 模型進行了最佳化。對於通用的機器學習迴圈,您應該使用像 [Accelerate](https://huggingface.co/docs/accelerate) 這樣的其他函式庫。
|
||||
- [範例指令稿](https://github.com/huggingface/transformers/tree/main/examples)僅僅是*範例*。它們不一定能在您的特定用例上開箱即用,您可能需要修改程式碼才能使其正常運作。
|
||||
|
||||
## 100個使用 Transformers 的專案
|
||||
|
||||
Transformers 不僅僅是一個使用預訓練模型的工具包,它還是一個圍繞它和 Hugging Face Hub 建構的專案社群。我們希望 Transformers 能夠賦能開發者、研究人員、學生、教授、工程師以及其他任何人,去建構他們夢想中的專案。
|
||||
|
||||
為了慶祝 Transformers 獲得 10 萬顆星標,我們希望透過 [awesome-transformers](./awesome-transformers.md) 頁面來聚焦社群,該頁面列出了100個基於 Transformers 建構的精彩專案。
|
||||
|
||||
如果您擁有或使用一個您認為應該被列入其中的專案,請隨時提交 PR 將其加入!
|
||||
|
||||
## 範例模型
|
||||
|
||||
您可以在我們大多數模型的 [Hub 模型頁面](https://huggingface.co/models) 上直接進行測試。
|
||||
|
||||
展開下面的每個模態,查看一些用於不同用例的範例模型。
|
||||
|
||||
<details>
|
||||
<summary>音訊</summary>
|
||||
|
||||
- Audio classification with [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo)
|
||||
- Automatic speech recognition with [Moonshine](https://huggingface.co/UsefulSensors/moonshine)
|
||||
- Keyword spotting with [Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks)
|
||||
- Speech to speech generation with [Moshi](https://huggingface.co/kyutai/moshiko-pytorch-bf16)
|
||||
- Text to audio with [MusicGen](https://huggingface.co/facebook/musicgen-large)
|
||||
- Text to speech with [Bark](https://huggingface.co/suno/bark)
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>電腦視覺</summary>
|
||||
|
||||
- Automatic mask generation with [SAM](https://huggingface.co/facebook/sam-vit-base)
|
||||
- Depth estimation with [DepthPro](https://huggingface.co/apple/DepthPro-hf)
|
||||
- Image classification with [DINO v2](https://huggingface.co/facebook/dinov2-base)
|
||||
- Keypoint detection with [SuperPoint](https://huggingface.co/magic-leap-community/superpoint)
|
||||
- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor)
|
||||
- Object detection with [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd)
|
||||
- Pose Estimation with [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple)
|
||||
- Universal segmentation with [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large)
|
||||
- Video classification with [VideoMAE](https://huggingface.co/MCG-NJU/videomae-large)
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>多模態</summary>
|
||||
|
||||
- Audio or text to text with [Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B)
|
||||
- Document question answering with [LayoutLMv3](https://huggingface.co/microsoft/layoutlmv3-base)
|
||||
- Image or text to text with [Qwen-VL](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
|
||||
- Image captioning [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b)
|
||||
- OCR-based document understanding with [GOT-OCR2](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf)
|
||||
- Table question answering with [TAPAS](https://huggingface.co/google/tapas-base)
|
||||
- Unified multimodal understanding and generation with [Emu3](https://huggingface.co/BAAI/Emu3-Gen)
|
||||
- Vision to text with [Llava-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)
|
||||
- Visual question answering with [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
|
||||
- Visual referring expression segmentation with [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224)
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>自然語言處理 (NLP)</summary>
|
||||
|
||||
- Masked word completion with [ModernBERT](https://huggingface.co/answerdotai/ModernBERT-base)
|
||||
- Named entity recognition with [Gemma](https://huggingface.co/google/gemma-2-2b)
|
||||
- Question answering with [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
|
||||
- Summarization with [BART](https://huggingface.co/facebook/bart-large-cnn)
|
||||
- Translation with [T5](https://huggingface.co/google-t5/t5-base)
|
||||
- Text generation with [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B)
|
||||
- Text classification with [Qwen](https://huggingface.co/Qwen/Qwen2.5-0.5B)
|
||||
|
||||
</details>
|
||||
|
||||
## 引用
|
||||
|
||||
我們已將此函式庫的[論文](https://www.aclweb.org/anthology/2020.emnlp-demos.6/)正式發表。如果你使用了 🤗 Transformers 函式庫,可以引用:
|
||||
現在我們有一篇可供您引用的關於 🤗 Transformers 函式庫的 [論文](https://www.aclweb.org/anthology/2020.emnlp-demos.6/):
|
||||
```bibtex
|
||||
@inproceedings{wolf-etal-2020-transformers,
|
||||
title = "Transformers: State-of-the-Art Natural Language Processing",
|
||||
@ -285,4 +324,4 @@ conda install conda-forge::transformers
|
||||
url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
|
||||
pages = "38--45"
|
||||
}
|
||||
```
|
||||
```
|
||||
setup.py
@ -137,7 +137,7 @@ _deps = [
|
||||
"psutil",
|
||||
"pyyaml>=5.1",
|
||||
"pydantic>=2",
|
||||
"pytest>=7.2.0",
|
||||
"pytest>=7.2.0,<9.0.0",
|
||||
"pytest-asyncio>=1.2.0",
|
||||
"pytest-rerunfailures<16.0",
|
||||
"pytest-timeout",
|
||||
|
||||
@ -14,7 +14,9 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import copy
|
||||
import datetime
|
||||
import enum
|
||||
import functools
|
||||
import gc
|
||||
import io
|
||||
import json
|
||||
@ -30,7 +32,8 @@ from threading import Thread
|
||||
from typing import Annotated, Optional, TypedDict, Union
|
||||
|
||||
import typer
|
||||
from huggingface_hub import scan_cache_dir
|
||||
from huggingface_hub import model_info
|
||||
from huggingface_hub.constants import HF_HUB_OFFLINE
|
||||
from openai.types.chat.chat_completion import Choice
|
||||
from tokenizers.decoders import DecodeStream
|
||||
|
||||
@ -717,45 +720,51 @@ class Serve:
|
||||
"""
|
||||
return f"data: {chunk.model_dump_json(exclude_none=True)}\n\n"
|
||||
|
||||
@staticmethod
|
||||
def get_gen_models(cache_dir: Optional[str] = None) -> list[dict[str, any]]:
|
||||
@functools.cache
|
||||
def get_gen_models(self) -> list[dict[str, any]]:
|
||||
"""
|
||||
List generative models in the cache.
|
||||
This is by no means a limit to which models may be instantiated with `transformers serve`: any chat-based
model that works with generate can be used.
|
||||
|
||||
This is a limited list of models to ensure we have a discoverable /v1/models endpoint for third-party
|
||||
integrations.
|
||||
"""
|
||||
generative_models = []
|
||||
models = [
|
||||
"Menlo/Jan-nano",
|
||||
"Menlo/Jan-nano-128k",
|
||||
"Qwen/Qwen2.5-0.5B-Instruct",
|
||||
"Qwen/Qwen2.5-3B-Instruct",
|
||||
"Qwen/Qwen2.5-7B-Instruct",
|
||||
"Qwen/Qwen2.5-14B-Instruct",
|
||||
"meta-llama/Llama-3.1-8B-Instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"meta-llama/Llama-3.3-70B-Instruct",
|
||||
"HuggingFaceTB/SmolVLM-Instruct",
|
||||
"ibm-granite/granite-vision-3.2-2b",
|
||||
"Qwen/Qwen2.5-VL-7B-Instruct",
|
||||
]
|
||||
|
||||
for repo in scan_cache_dir(cache_dir).repos:
|
||||
if repo.repo_type != "model":
|
||||
continue
|
||||
|
||||
refs = repo.refs
|
||||
for ref, revision_info in refs.items():
|
||||
files = revision_info.files
|
||||
config_path = next((f.file_path for f in files if f.file_name == "config.json"), None)
|
||||
|
||||
if not config_path:
|
||||
continue
|
||||
|
||||
config = json.loads(config_path.open().read())
|
||||
|
||||
if "architectures" not in config:
|
||||
continue
|
||||
|
||||
architectures = config["architectures"]
|
||||
if any(arch for arch in architectures if arch in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()):
|
||||
print(repo.repo_id, ref)
|
||||
author = repo.repo_id.split("/") if "/" in repo.repo_id else ""
|
||||
repo_id = repo.repo_id + (f"@{ref}" if ref != "main" else "")
|
||||
generative_models.append(
|
||||
{
|
||||
"owned_by": author,
|
||||
"id": repo_id,
|
||||
"object": "model",
|
||||
"created": repo.last_modified,
|
||||
}
|
||||
)
|
||||
|
||||
return generative_models
|
||||
if HF_HUB_OFFLINE:
|
||||
return [
|
||||
{
|
||||
"id": model,
|
||||
"object": "model",
|
||||
"created": datetime.datetime.now().timestamp(),
|
||||
"owned_by": model.split("/")[0],
|
||||
}
|
||||
for model in models
|
||||
]
|
||||
else:
|
||||
model_infos = [model_info(model) for model in models]
|
||||
return [
|
||||
{
|
||||
"id": model.id,
|
||||
"object": "model",
|
||||
"created": model.created_at.timestamp(),
|
||||
"owned_by": model.author,
|
||||
}
|
||||
for model in model_infos
|
||||
]
|
||||
|
||||
def continuous_batching_chat_completion(self, req: dict, request_id: str) -> StreamingResponse | JSONResponse:
|
||||
"""
|
||||
|
||||
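For context on the third-party integrations this `/v1/models` endpoint is meant to serve, a discovery call from an OpenAI-compatible client could look roughly like this (a sketch only; the base URL and port are assumptions about a locally running `transformers serve`, not values taken from this diff):

```python
from openai import OpenAI

# Assumes `transformers serve` is reachable locally; adjust base_url to your deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

for model in client.models.list():
    print(model.id, model.owned_by, model.created)
```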
@ -723,7 +723,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
|
||||
|
||||
if self.mask_replace_prob < 1:
|
||||
warnings.warn(
|
||||
"Random token replacement is not supported with whole word masking.",
|
||||
"Random token replacement is not supported with whole word masking. "
|
||||
"Setting mask_replace_prob to 1.",
|
||||
)
|
||||
self.mask_replace_prob = 1
|
||||
|
||||
@ -82,7 +82,7 @@ class GlueDataset(Dataset):
|
||||
cache_dir: Optional[str] = None,
|
||||
):
|
||||
warnings.warn(
|
||||
"This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
|
||||
"This dataset will be removed from the library soon, preprocessing should be handled with the Hugging Face Datasets "
|
||||
"library. You can have a look at this example script for pointers: "
|
||||
"https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py",
|
||||
FutureWarning,
|
||||
|
||||
@ -21,7 +21,7 @@ if is_sklearn_available():
|
||||
|
||||
|
||||
DEPRECATION_WARNING = (
|
||||
"This metric will be removed from the library soon, metrics should be handled with the 🤗 Evaluate "
|
||||
"This metric will be removed from the library soon, metrics should be handled with the Hugging Face Evaluate "
|
||||
"library. You can have a look at this example script for pointers: "
|
||||
"https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
|
||||
)
|
||||
|
||||
@ -28,7 +28,7 @@ from .utils import DataProcessor, InputExample, InputFeatures
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
DEPRECATION_WARNING = (
|
||||
"This {0} will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
|
||||
"This {0} will be removed from the library soon, preprocessing should be handled with the Hugging Face Datasets "
|
||||
"library. You can have a look at this example script for pointers: "
|
||||
"https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
|
||||
)
|
||||
|
||||
@ -47,7 +47,7 @@ deps = {
|
||||
"psutil": "psutil",
|
||||
"pyyaml": "pyyaml>=5.1",
|
||||
"pydantic": "pydantic>=2",
|
||||
"pytest": "pytest>=7.2.0",
|
||||
"pytest": "pytest>=7.2.0,<9.0.0",
|
||||
"pytest-asyncio": "pytest-asyncio>=1.2.0",
|
||||
"pytest-rerunfailures": "pytest-rerunfailures<16.0",
|
||||
"pytest-timeout": "pytest-timeout",
|
||||
|
||||
@ -39,6 +39,7 @@ from .utils import (
|
||||
is_torch_dtype,
|
||||
logging,
|
||||
requires_backends,
|
||||
safe_load_json_file,
|
||||
)
|
||||
from .utils.hub import cached_file
|
||||
|
||||
@ -427,35 +428,42 @@ class FeatureExtractionMixin(PushToHubMixin):
|
||||
feature_extractor_file = os.path.join(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME)
|
||||
if os.path.isfile(pretrained_model_name_or_path):
|
||||
resolved_feature_extractor_file = pretrained_model_name_or_path
|
||||
resolved_processor_file = None
|
||||
is_local = True
|
||||
elif is_remote_url(pretrained_model_name_or_path):
|
||||
feature_extractor_file = pretrained_model_name_or_path
|
||||
resolved_processor_file = None
|
||||
resolved_feature_extractor_file = download_url(pretrained_model_name_or_path)
|
||||
else:
|
||||
feature_extractor_file = FEATURE_EXTRACTOR_NAME
|
||||
try:
|
||||
# Load from local folder or from cache or download from model Hub and cache
|
||||
resolved_feature_extractor_files = [
|
||||
resolved_file
|
||||
for filename in [feature_extractor_file, PROCESSOR_NAME]
|
||||
if (
|
||||
resolved_file := cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
filename=filename,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
local_files_only=local_files_only,
|
||||
subfolder=subfolder,
|
||||
token=token,
|
||||
user_agent=user_agent,
|
||||
revision=revision,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
)
|
||||
)
|
||||
is not None
|
||||
]
|
||||
resolved_feature_extractor_file = resolved_feature_extractor_files[0]
|
||||
resolved_processor_file = cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
filename=PROCESSOR_NAME,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
local_files_only=local_files_only,
|
||||
token=token,
|
||||
user_agent=user_agent,
|
||||
revision=revision,
|
||||
subfolder=subfolder,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
)
|
||||
resolved_feature_extractor_file = cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
filename=feature_extractor_file,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
local_files_only=local_files_only,
|
||||
token=token,
|
||||
user_agent=user_agent,
|
||||
revision=revision,
|
||||
subfolder=subfolder,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
)
|
||||
except OSError:
|
||||
# Raise any environment error raised by `cached_file`. It will have a helpful error message adapted to
# the original exception.
|
||||
@ -469,19 +477,24 @@ class FeatureExtractionMixin(PushToHubMixin):
|
||||
f" directory containing a {FEATURE_EXTRACTOR_NAME} file"
|
||||
)
|
||||
|
||||
try:
|
||||
# Load feature_extractor dict
|
||||
with open(resolved_feature_extractor_file, encoding="utf-8") as reader:
|
||||
text = reader.read()
|
||||
feature_extractor_dict = json.loads(text)
|
||||
if "audio_processor" in feature_extractor_dict:
|
||||
feature_extractor_dict = feature_extractor_dict["audio_processor"]
|
||||
else:
|
||||
feature_extractor_dict = feature_extractor_dict.get("feature_extractor", feature_extractor_dict)
|
||||
# Load feature_extractor dict. Priority goes as (nested config if found -> feature extractor config)
# We are downloading both configs because almost all models have a `processor_config.json` but
# not all of these are nested. We need to check if it was saved recently as nested or if it is legacy style
|
||||
feature_extractor_dict = None
|
||||
if resolved_processor_file is not None:
|
||||
processor_dict = safe_load_json_file(resolved_processor_file)
|
||||
if "feature_extractor" in processor_dict or "audio_processor" in processor_dict:
|
||||
feature_extractor_dict = processor_dict.get("feature_extractor", processor_dict.get("audio_processor"))
|
||||
|
||||
except json.JSONDecodeError:
|
||||
if resolved_feature_extractor_file is not None and feature_extractor_dict is None:
|
||||
feature_extractor_dict = safe_load_json_file(resolved_feature_extractor_file)
|
||||
|
||||
if feature_extractor_dict is None:
|
||||
raise OSError(
|
||||
f"It looks like the config file at '{resolved_feature_extractor_file}' is not a valid JSON file."
|
||||
f"Can't load feature extractor for '{pretrained_model_name_or_path}'. If you were trying to load"
|
||||
" it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
|
||||
f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
|
||||
f" directory containing a {feature_extractor_file} file"
|
||||
)
|
||||
|
||||
if is_local:
|
||||
|
||||
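For orientation, the nested and legacy layouts that the loading code above distinguishes look roughly like this (illustrative Python dicts, not taken from any real checkpoint):

```python
# Legacy style: the feature extractor fields sit at the top level of preprocessor_config.json.
legacy_dict = {"feature_extractor_type": "WhisperFeatureExtractor", "feature_size": 128}

# Nested style: processor_config.json wraps each sub-component under its own key,
# which is why the code checks for "feature_extractor" / "audio_processor" / "image_processor".
nested_dict = {
    "feature_extractor": {"feature_extractor_type": "WhisperFeatureExtractor", "feature_size": 128},
}
```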
@ -189,7 +189,9 @@ class PagedAttentionCache:
|
||||
num_blocks, max_batch_tokens = memory_handler.infer_num_blocks_and_max_batch_tokens(
|
||||
num_blocks=getattr(generation_config, "num_blocks", None),
|
||||
max_batch_tokens=getattr(generation_config, "max_batch_tokens", None),
|
||||
max_memory_percent=getattr(generation_config, "max_memory", 0.9),
|
||||
max_memory_percent=getattr(
|
||||
generation_config, "max_memory", 0.8
|
||||
), # FIXME: it seems we overcommit memory, was changed from 0.9 which caused OOMs in our benchmarking CI
|
||||
cache_dtype=self.dtype,
|
||||
)
|
||||
|
||||
@ -414,7 +416,7 @@ class PagedAttentionMemoryHandler:
|
||||
self,
|
||||
num_blocks: Optional[int] = None,
|
||||
max_batch_tokens: Optional[int] = None,
|
||||
max_memory_percent: float = 0.9,
|
||||
max_memory_percent: float = 0.8, # FIXME: it seems we overcommit memory, was changed from 0.9 which caused OOMs in our benchmarking CI
|
||||
cache_dtype: torch.dtype = torch.float16,
|
||||
) -> tuple[int, int]:
|
||||
"""Determine optimal number of blocks and maximum number of tokens per batch based on available memory and
|
||||
@ -454,7 +456,7 @@ class PagedAttentionMemoryHandler:
|
||||
|
||||
def compute_num_blocks_and_max_batch_tokens(
|
||||
self,
|
||||
max_memory_percent: float = 0.9,
|
||||
max_memory_percent: float,
|
||||
cache_dtype: torch.dtype = torch.float16,
|
||||
m: float = 0.01,
|
||||
) -> tuple[int, int]:
|
||||
@ -503,7 +505,7 @@ class PagedAttentionMemoryHandler:
|
||||
def compute_max_batch_tokens(
|
||||
self,
|
||||
num_blocks: int,
|
||||
max_memory_percent: float = 0.9,
|
||||
max_memory_percent: float,
|
||||
cache_dtype: torch.dtype = torch.float16,
|
||||
) -> int:
|
||||
"""Calculate maximum batch tokens M given a fixed number of cache blocks. The formula for M is given by:
|
||||
@ -531,7 +533,7 @@ class PagedAttentionMemoryHandler:
|
||||
def compute_num_blocks(
|
||||
self,
|
||||
max_batch_tokens: int,
|
||||
max_memory_percent: float = 0.9,
|
||||
max_memory_percent: float,
|
||||
cache_dtype: torch.dtype = torch.float16,
|
||||
) -> int:
|
||||
"""Calculate number of cache blocks N given a fixed maximum token per token M. The formula for N is given by:
|
||||
|
||||
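If the 0.8 default needs tuning for a particular setup, the fraction is read off the generation config via `getattr(generation_config, "max_memory", 0.8)`, so setting a plain attribute should be enough (a sketch; whether your workflow exposes the generation config this way depends on how you drive continuous batching):

```python
from transformers import GenerationConfig

generation_config = GenerationConfig()
# Read with getattr(generation_config, "max_memory", 0.8) above, so a plain attribute works.
generation_config.max_memory = 0.7  # cap the paged KV cache at ~70% of the usable device memory
```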
@ -826,6 +826,8 @@ class ContinuousBatchingManager:
|
||||
if block:
|
||||
self.join(stop_trigger_time, timeout)
|
||||
|
||||
self.batch_processor = None
|
||||
|
||||
def join(self, stop_trigger_time: float, timeout: Optional[float] = None) -> None:
|
||||
"""Wait for the background thread to finish.
|
||||
|
||||
|
||||
@ -608,7 +608,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
use_cache = kwargs.get("use_cache")
|
||||
if use_cache is None:
|
||||
use_cache = getattr(self.config, "use_cache", False)
|
||||
if past_key_values is None or use_cache:
|
||||
if past_key_values is not None or use_cache:
|
||||
# TODO (joao): handle the case where cache length == input_ids length. The function below results in an
|
||||
# exception because we get empty input_ids after slicing. In essence, we need to roll back the cache 1
|
||||
# token to recompute the logits for the first token to be generated (but not all caches support roll backs)
|
||||
@ -2170,7 +2170,7 @@ class GenerationMixin(ContinuousMixin):
|
||||
return False
|
||||
|
||||
# Base logic
|
||||
valid_hardware = self.device.type == "cuda" or bool(
|
||||
valid_hardware = self.device.type in ["cuda", "xpu"] or bool(
|
||||
generation_config.compile_config is not None and generation_config.compile_config._compile_all_devices
|
||||
)
|
||||
using_compilable_cache = (
|
||||
|
||||
@ -32,6 +32,7 @@ from .utils import (
|
||||
is_offline_mode,
|
||||
is_remote_url,
|
||||
logging,
|
||||
safe_load_json_file,
|
||||
)
|
||||
from .utils.hub import cached_file
|
||||
|
||||
@ -280,35 +281,41 @@ class ImageProcessingMixin(PushToHubMixin):
|
||||
image_processor_file = os.path.join(pretrained_model_name_or_path, image_processor_filename)
|
||||
if os.path.isfile(pretrained_model_name_or_path):
|
||||
resolved_image_processor_file = pretrained_model_name_or_path
|
||||
resolved_processor_file = None
|
||||
is_local = True
|
||||
elif is_remote_url(pretrained_model_name_or_path):
|
||||
image_processor_file = pretrained_model_name_or_path
|
||||
resolved_processor_file = None
|
||||
resolved_image_processor_file = download_url(pretrained_model_name_or_path)
|
||||
else:
|
||||
image_processor_file = image_processor_filename
|
||||
try:
|
||||
# Load from local folder or from cache or download from model Hub and cache
|
||||
resolved_image_processor_files = [
|
||||
resolved_file
|
||||
for filename in [image_processor_file, PROCESSOR_NAME]
|
||||
if (
|
||||
resolved_file := cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
filename=filename,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
local_files_only=local_files_only,
|
||||
token=token,
|
||||
user_agent=user_agent,
|
||||
revision=revision,
|
||||
subfolder=subfolder,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
)
|
||||
)
|
||||
is not None
|
||||
]
|
||||
resolved_image_processor_file = resolved_image_processor_files[0]
|
||||
resolved_processor_file = cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
filename=PROCESSOR_NAME,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
local_files_only=local_files_only,
|
||||
token=token,
|
||||
user_agent=user_agent,
|
||||
revision=revision,
|
||||
subfolder=subfolder,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
)
|
||||
resolved_image_processor_file = cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
filename=image_processor_file,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
local_files_only=local_files_only,
|
||||
token=token,
|
||||
user_agent=user_agent,
|
||||
revision=revision,
|
||||
subfolder=subfolder,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
)
|
||||
except OSError:
|
||||
# Raise any environment error raised by `cached_file`. It will have a helpful error message adapted to
# the original exception.
|
||||
@ -322,16 +329,24 @@ class ImageProcessingMixin(PushToHubMixin):
|
||||
f" directory containing a {image_processor_filename} file"
|
||||
)
|
||||
|
||||
try:
|
||||
# Load image_processor dict
|
||||
with open(resolved_image_processor_file, encoding="utf-8") as reader:
|
||||
text = reader.read()
|
||||
image_processor_dict = json.loads(text)
|
||||
image_processor_dict = image_processor_dict.get("image_processor", image_processor_dict)
|
||||
# Load image_processor dict. Priority goes as (nested config if found -> image processor config)
# We are downloading both configs because almost all models have a `processor_config.json` but
# not all of these are nested. We need to check if it was saved recently as nested or if it is legacy style
|
||||
image_processor_dict = None
|
||||
if resolved_processor_file is not None:
|
||||
processor_dict = safe_load_json_file(resolved_processor_file)
|
||||
if "image_processor" in processor_dict:
|
||||
image_processor_dict = processor_dict["image_processor"]
|
||||
|
||||
except json.JSONDecodeError:
|
||||
if resolved_image_processor_file is not None and image_processor_dict is None:
|
||||
image_processor_dict = safe_load_json_file(resolved_image_processor_file)
|
||||
|
||||
if image_processor_dict is None:
|
||||
raise OSError(
|
||||
f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
|
||||
f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
|
||||
" it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
|
||||
f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
|
||||
f" directory containing a {image_processor_filename} file"
|
||||
)
|
||||
|
||||
if is_local:
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
import re
|
||||
from collections.abc import Callable
|
||||
from functools import partial
|
||||
@ -18,7 +19,7 @@ from types import ModuleType
|
||||
from typing import Optional, Union
|
||||
|
||||
from ..modeling_flash_attention_utils import lazy_import_flash_attention
|
||||
from ..utils import logging
|
||||
from ..utils import ENV_VARS_TRUE_VALUES, logging
|
||||
from ..utils.import_utils import is_kernels_available
|
||||
from .flash_attention import flash_attention_forward
|
||||
|
||||
@ -33,10 +34,22 @@ try:
|
||||
get_kernel,
|
||||
register_kernel_mapping,
|
||||
replace_kernel_forward_from_hub,
|
||||
use_kernel_forward_from_hub,
|
||||
)
|
||||
|
||||
_TRANSFORMERS_USE_HUB_KERNELS = os.environ.get("USE_HUB_KERNELS", "YES").upper()
|
||||
_kernels_available = True
|
||||
_kernels_enabled = _TRANSFORMERS_USE_HUB_KERNELS in ENV_VARS_TRUE_VALUES
|
||||
|
||||
def use_kernel_forward_from_hub(layer_name: str):
|
||||
if _kernels_enabled:
|
||||
from kernels import use_kernel_forward_from_hub as _kernels_use_kernel_forward_from_hub
|
||||
|
||||
return _kernels_use_kernel_forward_from_hub(layer_name)
|
||||
else:
|
||||
logger.warning_once(
|
||||
f"kernels hub usage is disabled through the environment USE_HUB_KERNELS={_TRANSFORMERS_USE_HUB_KERNELS}"
|
||||
)
|
||||
return lambda cls: cls
|
||||
|
||||
_KERNEL_MAPPING: dict[str, dict[Union[Device, str], LayerRepository]] = {
|
||||
"MultiScaleDeformableAttention": {
|
||||
@ -71,6 +84,12 @@ try:
|
||||
layer_name="RMSNorm",
|
||||
)
|
||||
},
|
||||
"npu": {
|
||||
Mode.INFERENCE: LayerRepository(
|
||||
repo_id="kernels-community/liger_kernels",
|
||||
layer_name="LigerRMSNorm",
|
||||
)
|
||||
},
|
||||
},
|
||||
"MLP": {
|
||||
"cuda": LayerRepository(
|
||||
@ -161,6 +180,7 @@ try:
|
||||
|
||||
except ImportError:
|
||||
_kernels_available = False
|
||||
_kernels_enabled = False
|
||||
|
||||
# Stub to make decorators in transformers work when `kernels`
# is not installed.
|
||||
|
||||
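Given the environment check above, kernels from the Hub can be disabled for a whole process along these lines (the variable has to be set before the kernels integration is imported):

```python
import os

# Any value outside ENV_VARS_TRUE_VALUES (e.g. "NO") disables fetching layer kernels from the Hub.
os.environ["USE_HUB_KERNELS"] = "NO"

import transformers  # noqa: E402  # decorated layers now keep their native forward implementations
```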
@ -38,7 +38,7 @@ from transformers.utils.import_utils import _is_package_available
|
||||
|
||||
|
||||
if os.getenv("WANDB_MODE") == "offline":
|
||||
print("⚙️ Running in WANDB offline mode")
|
||||
print("[INFO] Running in WANDB offline mode")
|
||||
|
||||
from .. import PreTrainedModel, TrainingArguments
|
||||
from .. import __version__ as version
|
||||
|
||||
@ -63,9 +63,6 @@ def sdpa_attention_forward(
|
||||
else:
|
||||
sdpa_kwargs = {"enable_gqa": True}
|
||||
|
||||
if attention_mask is not None and attention_mask.ndim == 4:
|
||||
attention_mask = attention_mask[:, :, :, : key.shape[-2]]
|
||||
|
||||
# Instead of relying on the value set in the module directly, we use the is_causal passed in kwargs if it is presented
|
||||
is_causal = is_causal if is_causal is not None else getattr(module, "is_causal", True)
|
||||
|
||||
|
||||
@ -177,6 +177,42 @@ def prepare_padding_mask(
|
||||
return local_padding_mask
|
||||
|
||||
|
||||
def _can_skip_causal_mask_xpu(
|
||||
padding_mask: Optional[torch.Tensor],
|
||||
query_length: int,
|
||||
kv_length: int,
|
||||
local_attention_size: Optional[int],
|
||||
) -> bool:
|
||||
"""
|
||||
XPU-specific logic for determining if we can skip causal mask creation.
|
||||
|
||||
For XPU devices, we have special handling:
|
||||
- Single query tokens (query_length == 1) use the same logic as CUDA
|
||||
- Multi-query tokens can skip if padding_mask is provided and correctly structured:
  the mask must have all True values in the query window and all False values after it
|
||||
"""
|
||||
|
||||
if is_tracing(padding_mask):
|
||||
return False
|
||||
|
||||
# Check local attention constraint (same as CUDA)
|
||||
if local_attention_size is not None and kv_length >= local_attention_size:
|
||||
return False
|
||||
|
||||
if padding_mask is None:
|
||||
# Without padding mask, can skip if single query token or full causal attention
|
||||
return query_length == 1 or kv_length == query_length
|
||||
|
||||
# XPU allows skipping under additional conditions when padding_mask is provided
|
||||
if query_length == 1:
|
||||
# Single query token: skip only if no padding tokens present
|
||||
return padding_mask.all()
|
||||
|
||||
# XPU-specific: check if query window is all True and rest is all False
|
||||
# This allows XPU to optimize the 1st token in static cache
|
||||
return padding_mask[:, :query_length].all() and not padding_mask[:, query_length:].any()
|
||||
|
||||
|
||||
def _ignore_causal_mask_sdpa(
|
||||
padding_mask: Optional[torch.Tensor],
|
||||
query_length: int,
|
||||
@ -197,6 +233,12 @@ def _ignore_causal_mask_sdpa(
|
||||
mask_indices += kv_offset
|
||||
padding_mask = padding_mask[:, mask_indices]
|
||||
|
||||
if _is_torch_xpu_available:
|
||||
# XPU devices have special handling for mask skipping:
|
||||
# - Single query tokens use the same logic as CUDA
|
||||
# - Multi-query tokens can skip if padding_mask is provided and correctly structured
|
||||
# (all True in query window, all False after)
|
||||
return _can_skip_causal_mask_xpu(padding_mask, query_length, kv_length, local_attention_size)
|
||||
# When using `torch.export` or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is
|
||||
# hard-coded to the forward. If a user exports a model with query_length > 1, the exported model will hard-code `is_causal=True`
|
||||
# which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108). Thus, we only set
|
||||
@ -204,18 +246,11 @@ def _ignore_causal_mask_sdpa(
|
||||
if (
|
||||
not is_tracing(padding_mask)
|
||||
# only cases when lower and upper diags are the same, see https://github.com/pytorch/pytorch/issues/108108
|
||||
and (query_length == 1 or (kv_length == query_length or _is_torch_xpu_available))
|
||||
and (query_length == 1 or kv_length == query_length)
|
||||
# in this case we need to add special patterns to the mask so cannot be skipped otherwise
|
||||
and (local_attention_size is None or kv_length < local_attention_size)
|
||||
# In this case, we need to add padding to the mask, so cannot be skipped otherwise
|
||||
and (
|
||||
padding_mask is None
|
||||
or (
|
||||
padding_mask.all()
|
||||
if not _is_torch_xpu_available or query_length == 1
|
||||
else padding_mask[:, :query_length].all()
|
||||
)
|
||||
)
|
||||
and (padding_mask is None or padding_mask.all())
|
||||
):
|
||||
return True
|
||||
|
||||
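As a tiny illustration of the padding-mask structure that lets XPU skip mask creation for a multi-token query (values are made up):

```python
import torch

query_length = 3
# All True inside the query window, all False afterwards -> the causal mask can be skipped on XPU.
padding_mask = torch.tensor([[True, True, True, False, False]])

can_skip = bool(padding_mask[:, :query_length].all()) and not bool(padding_mask[:, query_length:].any())
print(can_skip)  # True
```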
@ -698,9 +733,19 @@ def _preprocess_mask_arguments(
|
||||
# If using a cache, it can give all information about mask sizes based on seen tokens
|
||||
if past_key_values is not None:
|
||||
kv_length, kv_offset = past_key_values.get_mask_sizes(cache_position, layer_idx)
|
||||
# Otherwise, the sizes are simply the input sizes
|
||||
# Otherwise, we infer based on our input
|
||||
else:
|
||||
kv_length, kv_offset = input_embeds.shape[1], 0
|
||||
# 1. Rely on input directly
|
||||
if attention_mask is None:
|
||||
kv_length, kv_offset = input_embeds.shape[1], 0
|
||||
# 2. Rely on the mask instead - needed for special cases like prefix tuning in PEFT
|
||||
#
|
||||
# This is a very unique and special case where an encoder utilizes a cache and expects its length
|
||||
# to be accounted for (usually, they should never use a cache). In general, the mask should always
|
||||
# match with the input sizes nonetheless (i.e. it does not affect others).
|
||||
# Conclusion: "prefix tuning is evil"
|
||||
else:
|
||||
kv_length, kv_offset = attention_mask.shape[-1], 0
|
||||
|
||||
# We check the position_ids for potential packed sequence format (only if the 2D attention mask is explicitly None,
|
||||
# and we don't have past_key_values, i.e. generally a training setup)
|
||||
|
||||
@ -25,6 +25,7 @@ if TYPE_CHECKING:
|
||||
from .arcee import *
|
||||
from .aria import *
|
||||
from .audio_spectrogram_transformer import *
|
||||
from .audioflamingo3 import *
|
||||
from .auto import *
|
||||
from .autoformer import *
|
||||
from .aya_vision import *
|
||||
|
||||
@ -272,7 +272,9 @@ if __name__ == "__main__":
|
||||
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
src/transformers/models/audioflamingo3/__init__.py
@ -0,0 +1,31 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
|
||||
# reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...utils import _LazyModule
|
||||
from ...utils.import_utils import define_import_structure
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_audioflamingo3 import *
|
||||
from .modeling_audioflamingo3 import *
|
||||
from .processing_audioflamingo3 import *
|
||||
else:
|
||||
import sys
|
||||
|
||||
_file = globals()["__file__"]
|
||||
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
|
||||
@ -0,0 +1,210 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
|
||||
# reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ..auto import CONFIG_MAPPING, AutoConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class AudioFlamingo3EncoderConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of an [`AudioFlamingo3Encoder`]. It is used to instantiate an
|
||||
AudioFlamingo3 audio encoder according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of the audio encoder of the AudioFlamingo3
|
||||
architecture.
|
||||
|
||||
e.g. [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf)
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
num_mel_bins (`int`, *optional*, defaults to 128):
|
||||
Number of mel features used per input features. Should correspond to the value used in the
|
||||
`AudioFlamingo3Processor` class.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of encoder layers.
|
||||
num_attention_heads (`int`, *optional*, defaults to 20):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
intermediate_size (`int`, *optional*, defaults to 5120):
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
|
||||
layerdrop (`float`, *optional*, defaults to 0.0):
|
||||
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
|
||||
for more details.
|
||||
activation_function (`str`, *optional*, defaults to `"gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"silu"` and `"gelu_new"` are supported.
|
||||
hidden_size (`int`, *optional*, defaults to 1280):
|
||||
Dimensionality of the layers.
|
||||
dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
activation_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for activations inside the fully connected layer.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
scale_embedding (`bool`, *optional*, defaults to `False`):
|
||||
Scale embeddings by dividing by sqrt(hidden_size).
|
||||
max_source_positions (`int`, *optional*, defaults to 1500):
|
||||
The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import AudioFlamingo3EncoderConfig, AudioFlamingo3Encoder
|
||||
|
||||
>>> # Initializing an AudioFlamingo3EncoderConfig
|
||||
>>> configuration = AudioFlamingo3EncoderConfig()
|
||||
|
||||
>>> # Initializing an AudioFlamingo3Encoder (with random weights)
|
||||
>>> model = AudioFlamingo3Encoder(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "audioflamingo3_encoder"
|
||||
|
||||
attribute_map = {
|
||||
"d_model": "hidden_size",
|
||||
"encoder_layers": "num_hidden_layers",
|
||||
"encoder_attention_heads": "num_attention_heads",
|
||||
"encoder_ffn_dim": "intermediate_size",
|
||||
"encoder_layerdrop": "layerdrop",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_mel_bins=128,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=20,
|
||||
intermediate_size=5120,
|
||||
layerdrop=0.0,
|
||||
activation_function="gelu",
|
||||
hidden_size=1280,
|
||||
dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
activation_dropout=0.0,
|
||||
initializer_range=0.02,
|
||||
scale_embedding=False,
|
||||
max_source_positions=1500,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.num_mel_bins = num_mel_bins
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.intermediate_size = intermediate_size
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.activation_dropout = activation_dropout
|
||||
self.activation_function = activation_function
|
||||
self.initializer_range = initializer_range
|
||||
self.layerdrop = layerdrop
|
||||
self.scale_embedding = scale_embedding
|
||||
self.max_source_positions = max_source_positions
|
||||
|
||||
|
||||
class AudioFlamingo3Config(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of an [`AudioFlamingo3ForConditionalGeneration`]. It is used to instantiate an
|
||||
AudioFlamingo3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
||||
with the defaults will yield a configuration similar to that of AudioFlamingo3.
|
||||
|
||||
e.g. [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf)
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
audio_config (`Union[AudioFlamingo3EncoderConfig, dict]`, *optional*, defaults to `AudioFlamingo3EncoderConfig`):
|
||||
The config object or dictionary of the audio backbone.
|
||||
text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
|
||||
The config object or dictionary of the text backbone.
|
||||
audio_token_id (`int`, *optional*, defaults to 151669):
|
||||
The audio token index to encode the audio prompt.
|
||||
projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
|
||||
Activation function used in the projector.
|
||||
projector_bias (`bool`, *optional*, defaults to `True`):
|
||||
Whether to include bias terms in the projector.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import AudioFlamingo3ForConditionalGeneration, AudioFlamingo3Config, AudioFlamingo3EncoderConfig, Qwen2Config
|
||||
|
||||
>>> # Initializing an AudioFlamingo3Encoder config
|
||||
>>> audio_config = AudioFlamingo3EncoderConfig()
|
||||
|
||||
>>> # Initializing a Qwen2 config
|
||||
>>> text_config = Qwen2Config()
|
||||
|
||||
>>> # Initializing an AudioFlamingo3 configuration
|
||||
>>> configuration = AudioFlamingo3Config(audio_config, text_config)
|
||||
|
||||
>>> # Initializing a model from the audioflamingo3 style configuration
|
||||
>>> model = AudioFlamingo3ForConditionalGeneration(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "audioflamingo3"
|
||||
sub_configs = {
|
||||
"audio_config": AudioFlamingo3EncoderConfig,
|
||||
"text_config": AutoConfig,
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
audio_config=None,
|
||||
text_config=None,
|
||||
audio_token_id=151669,
|
||||
projector_hidden_act="gelu",
|
||||
projector_bias=True,
|
||||
**kwargs,
|
||||
):
|
||||
self.audio_token_id = audio_token_id
|
||||
|
||||
if isinstance(audio_config, dict):
|
||||
audio_config["model_type"] = audio_config.get("model_type", "audioflamingo3_encoder")
|
||||
audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config)
|
||||
elif audio_config is None:
|
||||
audio_config = CONFIG_MAPPING["audioflamingo3_encoder"]()
|
||||
|
||||
self.audio_config = audio_config
|
||||
|
||||
if isinstance(text_config, dict):
|
||||
text_config["model_type"] = text_config.get("model_type", "qwen2")
|
||||
text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
|
||||
elif text_config is None:
|
||||
text_config = CONFIG_MAPPING["qwen2"]()
|
||||
|
||||
self.text_config = text_config
|
||||
self.projector_hidden_act = projector_hidden_act
|
||||
self.projector_bias = projector_bias
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
||||
__all__ = ["AudioFlamingo3Config", "AudioFlamingo3EncoderConfig"]
|
||||
286
src/transformers/models/audioflamingo3/convert_audioflamingo3_to_hf.py
Normal file
@@ -0,0 +1,286 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
|
||||
# reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Convert AudioFlamingo3 checkpoints into a Hugging Face repository layout."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
from safetensors.torch import safe_open
|
||||
|
||||
from transformers import (
|
||||
AudioFlamingo3Config,
|
||||
AudioFlamingo3ForConditionalGeneration,
|
||||
AudioFlamingo3Processor,
|
||||
AutoTokenizer,
|
||||
GenerationConfig,
|
||||
Qwen2Config,
|
||||
WhisperFeatureExtractor,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
|
||||
|
||||
|
||||
def _load_json(p: Path):
|
||||
if not p.is_file():
|
||||
raise FileNotFoundError(f"Missing JSON: {p}")
|
||||
with p.open("r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
def write_processor(src_root: Path, dst_root: Path):
|
||||
llm_dir = src_root / "llm"
|
||||
|
||||
# fmt: off
|
||||
tokenizer_chat_template = (
|
||||
"{% if messages[0]['role'] != 'system' %}"
|
||||
"{{ '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}"
|
||||
"{% endif %}"
|
||||
"{% for message in messages if message['content'] is not none %}"
|
||||
"{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}"
|
||||
"{% endfor %}"
|
||||
"{% if add_generation_prompt %}"
|
||||
"{{ '<|im_start|>assistant\\n' }}"
|
||||
"{% endif %}"
|
||||
)
|
||||
# fmt: on
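# Note: the tokenizer-level template above handles plain-text messages only; the processor-level
# template below additionally flattens structured content, emitting a single `<sound>` placeholder
# whenever a user turn contains audio.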
|
||||
|
||||
# fmt: off
|
||||
processor_chat_template = (
|
||||
"{% if messages[0]['role'] != 'system' %}"
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
||||
"{% endif %}"
|
||||
"{% for m in messages if m['content'] is not none %}"
|
||||
"<|im_start|>{{ m['role'] }}\n"
|
||||
"{% if m['content'] is string %}"
|
||||
"{{ m['content'] }}"
|
||||
"{% else %}"
|
||||
"{% set audio = namespace(found=False) %}"
|
||||
"{% set text_buf = namespace(v='') %}"
|
||||
"{% for c in m['content'] %}"
|
||||
"{% if c.get('type') == 'audio' or 'audio' in c %}"
|
||||
"{% set audio.found = True %}"
|
||||
"{% elif c.get('type') == 'text' or 'text' in c %}"
|
||||
"{% set text_buf.v = text_buf.v + c['text'] %}"
|
||||
"{% endif %}"
|
||||
"{% endfor %}"
|
||||
"{% if audio.found %}{{ '<sound>' }}{% endif %}{{ text_buf.v }}"
|
||||
"{% endif %}"
|
||||
"<|im_end|>\n"
|
||||
"{% endfor %}"
|
||||
"{% if add_generation_prompt %}"
|
||||
"<|im_start|>assistant\n"
|
||||
"{% endif %}"
|
||||
)
|
||||
# fmt: on
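# feature_size=128 matches the encoder's default `num_mel_bins`; `return_attention_mask=True` is set
# so that padded mel frames can later be masked out via `input_features_mask` (assumption based on
# the processor defaults).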
|
||||
|
||||
processor = AudioFlamingo3Processor(
|
||||
feature_extractor=WhisperFeatureExtractor(feature_size=128, return_attention_mask=True),
|
||||
tokenizer=AutoTokenizer.from_pretrained(str(llm_dir), chat_template=tokenizer_chat_template, use_fast=True),
|
||||
chat_template=processor_chat_template,
|
||||
)
|
||||
processor.save_pretrained(str(dst_root))
|
||||
|
||||
logger.info("processor (tokenizer + preprocessor)")
|
||||
return processor
|
||||
|
||||
|
||||
PREFIX_MAP = {
|
||||
"llm": "language_model",
|
||||
"sound_tower": "audio_tower",
|
||||
"sound_mm_projector": "multi_modal_projector",
|
||||
}
|
||||
|
||||
|
||||
def _resolve_component_dir(dirpath: Path):
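# Resolution order: a sharded checkpoint (model.safetensors.index.json), then a monolithic
# model.safetensors, then a single stray .safetensors file; returns None if nothing is found.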
|
||||
if not dirpath.is_dir():
|
||||
return None
|
||||
idx = dirpath / "model.safetensors.index.json"
|
||||
mono = dirpath / "model.safetensors"
|
||||
if idx.exists():
|
||||
wm = _load_json(idx).get("weight_map") or {}
|
||||
by_shard: dict[str, list[str]] = defaultdict(list)
|
||||
for k, shard in wm.items():
|
||||
by_shard[shard].append(k)
|
||||
return ("sharded", dirpath, {k: sorted(v) for k, v in sorted(by_shard.items())})
|
||||
if mono.exists():
|
||||
return ("file", mono)
|
||||
cands = sorted([x for x in dirpath.iterdir() if x.suffix == ".safetensors"])
|
||||
return ("file", cands[0]) if len(cands) == 1 else None
|
||||
|
||||
|
||||
def merge_and_shard_weights(src_root: Path, dst_root: Path, processor: AudioFlamingo3Processor):
|
||||
state: dict[str, Any] = {}
|
||||
for tag in PREFIX_MAP.keys():
|
||||
comp = _resolve_component_dir(src_root / tag)
|
||||
if not comp:
|
||||
continue
|
||||
|
||||
out_prefix = PREFIX_MAP.get(tag, tag)
|
||||
|
||||
if comp[0] == "file":
|
||||
fp: Path = comp[1]
|
||||
with safe_open(str(fp), framework="pt", device="cpu") as f:
|
||||
for k in f.keys():
|
||||
if k == "__metadata__":
|
||||
continue
|
||||
state[f"{out_prefix}.{k}"] = f.get_tensor(k)
|
||||
else:
|
||||
base: Path = comp[1]
|
||||
shard_map: dict[str, list[str]] = comp[2]
|
||||
for shard, keys in shard_map.items():
|
||||
sp = base / shard
|
||||
with safe_open(str(sp), framework="pt", device="cpu") as f:
|
||||
for k in keys:
|
||||
state[f"{out_prefix}.{k}"] = f.get_tensor(k)
|
||||
|
||||
if not state:
|
||||
raise FileNotFoundError("No tensors found in llm/, sound_tower/, or sound_mm_projector/.")
|
||||
|
||||
tok = processor.tokenizer
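# Text-backbone hyperparameters are hard-coded to match the LLM shipped with the original
# AudioFlamingo-3 checkpoint; only the token ids and vocabulary size come from the tokenizer.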
|
||||
|
||||
text_config = Qwen2Config(
|
||||
bos_token_id=tok.bos_token_id,
|
||||
eos_token_id=tok.eos_token_id,
|
||||
pad_token_id=tok.pad_token_id,
|
||||
vocab_size=len(tok),
|
||||
hidden_size=3584,
|
||||
intermediate_size=18944,
|
||||
model_max_length=8192,
|
||||
num_attention_heads=28,
|
||||
num_hidden_layers=28,
|
||||
num_key_value_heads=4,
|
||||
rope_theta=1000000.0,
|
||||
use_cache=False,
|
||||
)
|
||||
config = AudioFlamingo3Config(text_config=text_config, audio_token_id=tok.get_vocab()["<sound>"])
|
||||
model = AudioFlamingo3ForConditionalGeneration(config).to(dtype=torch.bfloat16)
|
||||
|
||||
# Update state dict to new key names if necessary
|
||||
projector_key_mapping = {
|
||||
"multi_modal_projector.layers.0.weight": "multi_modal_projector.linear_1.weight",
|
||||
"multi_modal_projector.layers.0.bias": "multi_modal_projector.linear_1.bias",
|
||||
"multi_modal_projector.layers.2.weight": "multi_modal_projector.linear_2.weight",
|
||||
"multi_modal_projector.layers.2.bias": "multi_modal_projector.linear_2.bias",
|
||||
}
|
||||
for old_key, new_key in projector_key_mapping.items():
|
||||
if old_key in state:
|
||||
state[new_key] = state.pop(old_key)
|
||||
|
||||
# Load weights into the instantiated model so we can push via `push_to_hub` later.
|
||||
load_res = model.load_state_dict(state, strict=True)
|
||||
# Enforce a clean load
|
||||
if getattr(load_res, "missing_keys", None) and load_res.missing_keys:
|
||||
mk = load_res.missing_keys
|
||||
raise ValueError(f"Missing keys when loading: {mk[:10]}{' ...' if len(mk) > 10 else ''}")
|
||||
if getattr(load_res, "unexpected_keys", None) and load_res.unexpected_keys:
|
||||
uk = load_res.unexpected_keys
|
||||
raise ValueError(f"Unexpected keys when loading: {uk[:10]}{' ...' if len(uk) > 10 else ''}")
|
||||
|
||||
generation_config = GenerationConfig(
|
||||
bos_token_id=tok.bos_token_id,
|
||||
eos_token_id=tok.eos_token_id,
|
||||
pad_token_id=tok.pad_token_id,
|
||||
max_new_tokens=2048,
|
||||
)
|
||||
model.generation_config = generation_config
|
||||
|
||||
model.save_pretrained(save_directory=str(dst_root))
|
||||
logger.info("model.safetensors index and shards")
|
||||
return model
|
||||
|
||||
|
||||
"""
|
||||
Reproducible Usage
|
||||
==================
|
||||
|
||||
1) Download the original AudioFlamingo-3 weights from NVIDIA (requires Git LFS):
|
||||
|
||||
```
|
||||
git lfs install
|
||||
git clone https://huggingface.co/nvidia/audio-flamingo-3
|
||||
```
|
||||
|
||||
This will create a folder `audio-flamingo-3/` containing the original components:
|
||||
`llm/`, `sound_tower/`, and `sound_mm_projector/`.
|
||||
|
||||
2) Convert to the Hugging Face Transformers format (locally):
|
||||
|
||||
```
|
||||
python src/transformers/models/audioflamingo3/convert_audioflamingo3_to_hf.py \
|
||||
--src_dir audio-flamingo-3 \
|
||||
--dst_dir audio-flamingo-3-hf
|
||||
```
|
||||
|
||||
3) Convert and push directly to the Hub (requires `huggingface-cli login` or `HF_TOKEN`):
|
||||
|
||||
```
|
||||
python src/transformers/models/audioflamingo3/convert_audioflamingo3_to_hf.py \
|
||||
--src_dir audio-flamingo-3 \
|
||||
--dst_dir audio-flamingo-3-hf \
|
||||
--push_to_hub <username-or-org>/audio-flamingo-3
|
||||
```
|
||||
|
||||
This command uploads both the processor (tokenizer + feature extractor) and the converted
|
||||
model (sharded safetensors + configs) to the specified Hub repository.
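
4) (Optional) Sanity-check the converted checkpoint. The snippet below is an illustrative sketch
(not part of the conversion script) and assumes the local output directory from step 2:

```
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

processor = AutoProcessor.from_pretrained("audio-flamingo-3-hf")
model = AudioFlamingo3ForConditionalGeneration.from_pretrained("audio-flamingo-3-hf")
print(type(model.audio_tower).__name__, model.config.audio_token_id)
```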
|
||||
"""
|
||||
|
||||
|
||||
def main() -> None:
|
||||
ap = argparse.ArgumentParser(description="Convert AudioFlamingo3 to Hugging Face format.")
|
||||
ap.add_argument("--src_dir", required=True, help="Source model root directory")
|
||||
ap.add_argument("--dst_dir", required=True, help="Destination directory for converted model")
|
||||
ap.add_argument(
|
||||
"--push_to_hub",
|
||||
default=None,
|
||||
type=str,
|
||||
help=(
|
||||
"Optional repository ID to push the converted assets to the Hugging Face Hub, "
|
||||
"e.g. 'username/audio-flamingo-3'."
|
||||
),
|
||||
)
|
||||
args = ap.parse_args()
|
||||
|
||||
src_root = Path(args.src_dir).resolve()
|
||||
if not src_root.is_dir():
|
||||
raise FileNotFoundError(f"Source directory not found: {src_root}")
|
||||
|
||||
dst_root = Path(args.dst_dir).resolve()
|
||||
if dst_root.exists():
|
||||
raise FileExistsError(f"Destination already exists: {dst_root}")
|
||||
|
||||
processor = write_processor(src_root, dst_root)
|
||||
model = merge_and_shard_weights(src_root, dst_root, processor)
|
||||
|
||||
# Optionally push converted assets using native push_to_hub only
|
||||
if args.push_to_hub:
|
||||
logger.info("Pushing processor to the Hub ...")
|
||||
processor.push_to_hub(args.push_to_hub)
|
||||
logger.info("Pushing model to the Hub ...")
|
||||
model.push_to_hub(args.push_to_hub)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
628
src/transformers/models/audioflamingo3/modeling_audioflamingo3.py
Normal file
@@ -0,0 +1,628 @@
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# This file was automatically generated from src/transformers/models/audioflamingo3/modular_audioflamingo3.py.
|
||||
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_audioflamingo3.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# coding=utf-8
|
||||
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
|
||||
# reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from collections.abc import Callable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache, EncoderDecoderCache
|
||||
from ...generation import GenerationMixin
|
||||
from ...masking_utils import eager_mask, padding_mask_function
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
|
||||
from ..auto import AutoModel, AutoModelForCausalLM
|
||||
from .configuration_audioflamingo3 import AudioFlamingo3Config, AudioFlamingo3EncoderConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
def eager_attention_forward(
|
||||
module: nn.Module,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor],
|
||||
scaling: Optional[float] = None,
|
||||
dropout: float = 0.0,
|
||||
**kwargs,
|
||||
):
|
||||
if scaling is None:
|
||||
scaling = query.size(-1) ** -0.5
|
||||
|
||||
attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
|
||||
if attention_mask is not None and attention_mask.ndim == 4:
|
||||
attn_weights = attn_weights + attention_mask[:, :, :, : key.shape[-2]]
|
||||
|
||||
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
|
||||
|
||||
attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
|
||||
attn_output = torch.matmul(attn_weights, value)
|
||||
attn_output = attn_output.transpose(1, 2).contiguous()
|
||||
|
||||
return attn_output, attn_weights
|
||||
|
||||
|
||||
class AudioFlamingo3Attention(nn.Module):
|
||||
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
embed_dim: int,
|
||||
num_heads: int,
|
||||
dropout: float = 0.0,
|
||||
is_decoder: bool = False,
|
||||
bias: bool = True,
|
||||
is_causal: bool = False,
|
||||
layer_idx: Optional[int] = None,
|
||||
config: Optional[AudioFlamingo3Config] = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.embed_dim = embed_dim
|
||||
self.num_heads = num_heads
|
||||
self.dropout = dropout
|
||||
self.head_dim = embed_dim // num_heads
|
||||
self.config = config
|
||||
|
||||
if (self.head_dim * num_heads) != self.embed_dim:
|
||||
raise ValueError(
|
||||
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
|
||||
f" and `num_heads`: {num_heads})."
|
||||
)
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.is_decoder = is_decoder
|
||||
self.is_causal = is_causal
|
||||
|
||||
if layer_idx is None and is_decoder:
|
||||
logger.warning_once(
|
||||
f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
|
||||
"will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
|
||||
"when creating this class."
|
||||
)
|
||||
self.layer_idx = layer_idx
|
||||
|
||||
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
|
||||
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
||||
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
||||
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
key_value_states: Optional[torch.Tensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
output_attentions: bool = False,
|
||||
cache_position: Optional[torch.Tensor] = None,
|
||||
# TODO: we need a refactor so that the different attention modules can get their specific kwargs
|
||||
# At the moment, encoder, decoder, and encoder-decoder attention are mixed together here
|
||||
**kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
|
||||
"""Input shape: Batch x Time x Channel"""
|
||||
|
||||
# if key_value_states are provided this layer is used as a cross-attention layer
|
||||
# for the decoder
|
||||
is_cross_attention = key_value_states is not None
|
||||
|
||||
# determine input shapes
|
||||
bsz, tgt_len = hidden_states.shape[:-1]
|
||||
q_input_shape = (bsz, tgt_len, -1, self.head_dim)
|
||||
|
||||
# Scaling is susceptible to floating point arithmetic imprecision,
# which can lead to different results (this varies from model to model,
# e.g. audioflamingo3 is one such case). We therefore keep the
# original order of scaling to follow the original implementation
# and enforce no scaling (1.0) in the attention call below.
|
||||
query_states = self.q_proj(hidden_states) * self.scaling
|
||||
query_states = query_states.view(*q_input_shape)
|
||||
query_states = query_states.transpose(1, 2).contiguous()
|
||||
|
||||
# Check if an encoder-decoder model is being used. Otherwise we'll get `DynamicCache`
|
||||
if past_key_values is not None and isinstance(past_key_values, EncoderDecoderCache):
|
||||
is_updated = past_key_values.is_updated.get(self.layer_idx)
|
||||
if is_cross_attention:
|
||||
# after the first generated id, we can subsequently re-use all key/value_states from cache
|
||||
past_key_values.is_updated[self.layer_idx] = True
|
||||
past_key_values = past_key_values.cross_attention_cache
|
||||
else:
|
||||
past_key_values = past_key_values.self_attention_cache
|
||||
|
||||
# use key_value_states if cross attention
|
||||
current_states = key_value_states if key_value_states is not None else hidden_states
|
||||
if is_cross_attention and past_key_values and is_updated:
|
||||
# reuse k,v, cross_attentions
|
||||
key_states = past_key_values.layers[self.layer_idx].keys
|
||||
value_states = past_key_values.layers[self.layer_idx].values
|
||||
else:
|
||||
key_states = self.k_proj(current_states).view(bsz, -1, self.num_heads, self.head_dim)
|
||||
value_states = self.v_proj(current_states).view(bsz, -1, self.num_heads, self.head_dim)
|
||||
key_states = key_states.transpose(1, 2).contiguous()
|
||||
value_states = value_states.transpose(1, 2).contiguous()
|
||||
if past_key_values is not None:
|
||||
# save all key/value_states to cache to be re-used for fast auto-regressive generation
|
||||
cache_position = cache_position if not is_cross_attention else None
|
||||
key_states, value_states = past_key_values.update(
|
||||
key_states, value_states, self.layer_idx, {"cache_position": cache_position}
|
||||
)
|
||||
|
||||
attention_interface: Callable = eager_attention_forward
|
||||
if self.config._attn_implementation != "eager":
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
|
||||
attn_output, attn_weights = attention_interface(
|
||||
self,
|
||||
query_states,
|
||||
key_states,
|
||||
value_states,
|
||||
attention_mask,
|
||||
dropout=0.0 if not self.training else self.dropout,
|
||||
scaling=1.0,
|
||||
output_attentions=output_attentions,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
|
||||
attn_output = self.out_proj(attn_output)
|
||||
|
||||
return attn_output, attn_weights
|
||||
|
||||
|
||||
class AudioFlamingo3EncoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: AudioFlamingo3Config):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
|
||||
self.self_attn = AudioFlamingo3Attention(
|
||||
embed_dim=self.embed_dim,
|
||||
num_heads=config.encoder_attention_heads,
|
||||
dropout=config.attention_dropout,
|
||||
config=config,
|
||||
)
|
||||
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
||||
self.dropout = config.dropout
|
||||
self.activation_fn = ACT2FN[config.activation_function]
|
||||
self.activation_dropout = config.activation_dropout
|
||||
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
|
||||
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
|
||||
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: torch.Tensor,
|
||||
output_attentions: bool = False,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Args:
|
||||
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
|
||||
attention_mask (`torch.FloatTensor`): attention mask of size
|
||||
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
||||
returned tensors for more detail.
|
||||
"""
|
||||
residual = hidden_states
|
||||
hidden_states = self.self_attn_layer_norm(hidden_states)
|
||||
hidden_states, attn_weights = self.self_attn(
|
||||
hidden_states=hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
residual = hidden_states
|
||||
hidden_states = self.final_layer_norm(hidden_states)
|
||||
hidden_states = self.activation_fn(self.fc1(hidden_states))
|
||||
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
|
||||
hidden_states = self.fc2(hidden_states)
|
||||
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
if hidden_states.dtype == torch.float16:
|
||||
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
|
||||
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
|
||||
|
||||
return hidden_states, attn_weights
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class AudioFlamingo3PreTrainedModel(PreTrainedModel):
|
||||
config: AudioFlamingo3Config
|
||||
base_model_prefix = "model"
|
||||
input_modalities = ["audio", "text"]
|
||||
supports_gradient_checkpointing = True
|
||||
_no_split_modules = ["AudioFlamingo3Attention"]
|
||||
_skip_keys_device_placement = "past_key_values"
|
||||
_supports_flash_attn = True
|
||||
_supports_sdpa = True
|
||||
|
||||
def _init_weights(self, module):
|
||||
# important: this ported version of AudioFlamingo3 isn't meant for training from scratch - only
|
||||
# inference and fine-tuning - so the proper init weights code has been removed
|
||||
std = (
|
||||
self.config.initializer_range
|
||||
if hasattr(self.config, "initializer_range")
|
||||
else self.config.audio_config.initializer_range
|
||||
)
|
||||
|
||||
if isinstance(module, (nn.Linear, nn.Conv1d)):
|
||||
module.weight.data.normal_(mean=0.0, std=std)
|
||||
if module.bias is not None:
|
||||
module.bias.data.zero_()
|
||||
elif isinstance(module, nn.LayerNorm):
|
||||
module.weight.data.fill_(1.0)
|
||||
module.bias.data.zero_()
|
||||
elif isinstance(module, nn.Embedding):
|
||||
module.weight.data.normal_(mean=0.0, std=std)
|
||||
if module.padding_idx is not None:
|
||||
module.weight.data[module.padding_idx].zero_()
|
||||
|
||||
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
The audio model from AudioFlamingo3 without any head or projection on top.
|
||||
"""
|
||||
)
|
||||
class AudioFlamingo3Encoder(AudioFlamingo3PreTrainedModel):
|
||||
"""
|
||||
AudioFlamingo3 encoder: Whisper encoder, average pool (time/2), then LayerNorm.
|
||||
"""
|
||||
|
||||
# Ignore copy
|
||||
config: AudioFlamingo3EncoderConfig
|
||||
main_input_name = "input_features"
|
||||
input_modalities = "audio"
|
||||
_no_split_modules = ["AudioFlamingo3EncoderLayer"]
|
||||
|
||||
def __init__(self, config: AudioFlamingo3EncoderConfig):
|
||||
super().__init__(config)
|
||||
self.dropout = config.dropout
|
||||
self.layerdrop = config.encoder_layerdrop
|
||||
|
||||
embed_dim = config.d_model
|
||||
self.num_mel_bins = config.num_mel_bins
|
||||
self.padding_idx = config.pad_token_id
|
||||
self.max_source_positions = config.max_source_positions
|
||||
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
|
||||
|
||||
self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
|
||||
self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
|
||||
|
||||
self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
|
||||
self.embed_positions.requires_grad_(False)
|
||||
|
||||
self.layers = nn.ModuleList([AudioFlamingo3EncoderLayer(config) for _ in range(config.encoder_layers)])
|
||||
self.layer_norm = nn.LayerNorm(config.d_model)
|
||||
# Ignore copy
|
||||
self.avg_pooler = nn.AvgPool1d(2, stride=2)
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def _freeze_parameters(self):
|
||||
for param in self.parameters():
|
||||
param.requires_grad = False
|
||||
self._requires_grad = False
|
||||
|
||||
def get_input_embeddings(self) -> nn.Module:
|
||||
return self.conv1
|
||||
|
||||
def set_input_embeddings(self, value: nn.Module):
|
||||
self.conv1 = value
|
||||
|
||||
@can_return_tuple
|
||||
def forward(
|
||||
self,
|
||||
input_features: torch.Tensor,
|
||||
input_features_mask: Optional[torch.Tensor] = None,
|
||||
):
|
||||
r"""
|
||||
Args:
|
||||
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
|
||||
Log-Mel features extracted from raw audio. Use the processor/feature extractor to compute and pad
|
||||
these features from waveform input.
|
||||
input_features_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
"""
|
||||
|
||||
# Prepare attention mask for transformer layers
|
||||
batch_size = input_features.shape[0]
|
||||
seq_len = (input_features.shape[-1] - 1) // 2 + 1 # After conv2 downsampling
|
||||
|
||||
input_features_lengths = input_features_mask.sum(-1)
|
||||
input_features_lengths = (input_features_lengths - 1) // 2 + 1 # conv2 downsampling
|
||||
input_features_mask = torch.arange(seq_len, device=input_features.device) < input_features_lengths[:, None]
|
||||
attention_mask = eager_mask(
|
||||
batch_size=batch_size,
|
||||
cache_position=torch.arange(seq_len, device=input_features.device),
|
||||
kv_length=seq_len,
|
||||
mask_function=padding_mask_function(input_features_mask),
|
||||
dtype=self.conv1.weight.dtype,
|
||||
)
|
||||
|
||||
# Conv front-end
|
||||
inputs_embeds = nn.functional.gelu(self.conv1(input_features))
|
||||
inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
|
||||
inputs_embeds = inputs_embeds.permute(0, 2, 1)
|
||||
|
||||
# Add positions, dropout
|
||||
hidden_states = inputs_embeds + self.embed_positions.weight
|
||||
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
|
||||
# Transformer stack
|
||||
for layer in self.layers:
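# LayerDrop: during training, randomly skip a layer with probability `self.layerdrop`
# (a no-op with the default layerdrop of 0.0).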
|
||||
drop = self.training and torch.rand([]) < self.layerdrop
|
||||
if not drop:
|
||||
hidden_states = layer(hidden_states, attention_mask)[0]
|
||||
|
||||
# AvgPool (time/2) + LayerNorm
|
||||
hidden_states = hidden_states.permute(0, 2, 1)
|
||||
hidden_states = self.avg_pooler(hidden_states).permute(0, 2, 1)
|
||||
hidden_states = self.layer_norm(hidden_states)
|
||||
|
||||
return BaseModelOutput(
|
||||
last_hidden_state=hidden_states,
|
||||
)
|
||||
|
||||
# Ignore copy
|
||||
def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
|
||||
"""
|
||||
Computes the output length of the convolutional layers and the output length of the audio encoder
|
||||
"""
|
||||
input_lengths = (input_lengths - 1) // 2 + 1
|
||||
output_lengths = (input_lengths - 2) // 2 + 1
|
||||
return input_lengths, output_lengths
|
||||
|
||||
|
||||
class AudioFlamingo3MultiModalProjector(nn.Module):
|
||||
"""
|
||||
Audio adaptor (small MLP) that projects AudioFlamingo3Encoder features
|
||||
to the LLM embedding space so they can replace `<sound>` tokens.
|
||||
"""
|
||||
|
||||
def __init__(self, config: AudioFlamingo3Config):
|
||||
super().__init__()
|
||||
self.linear_1 = nn.Linear(
|
||||
config.audio_config.hidden_size, config.text_config.hidden_size, bias=config.projector_bias
|
||||
)
|
||||
self.act = ACT2FN[config.projector_hidden_act]
|
||||
self.linear_2 = nn.Linear(
|
||||
config.text_config.hidden_size, config.text_config.hidden_size, bias=config.projector_bias
|
||||
)
|
||||
|
||||
def forward(self, audio_features):
|
||||
hidden_states = self.linear_1(audio_features)
|
||||
hidden_states = self.act(hidden_states)
|
||||
hidden_states = self.linear_2(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
The AudioFlamingo3 model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Qwen2 language model.
|
||||
"""
|
||||
)
|
||||
class AudioFlamingo3ForConditionalGeneration(AudioFlamingo3PreTrainedModel, GenerationMixin):
|
||||
_tied_weights_keys = None
|
||||
_tp_plan = None
|
||||
_pp_plan = None
|
||||
_keep_in_fp32_modules_strict = None
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.vocab_size = config.text_config.vocab_size
|
||||
self.audio_tower = AutoModel.from_config(config.audio_config)
|
||||
self.language_model = AutoModelForCausalLM.from_config(config.text_config)
|
||||
self.multi_modal_projector = AudioFlamingo3MultiModalProjector(config)
|
||||
# Similar to Qwen2Audio
|
||||
if self.language_model._tied_weights_keys is not None:
|
||||
self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.language_model.get_input_embeddings()
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.language_model.get_output_embeddings()
|
||||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.language_model.set_output_embeddings(new_embeddings)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model.get_decoder()
|
||||
|
||||
def get_audio_features(
|
||||
self, input_features: torch.FloatTensor, input_features_mask: torch.Tensor
|
||||
) -> torch.FloatTensor:
|
||||
"""
|
||||
This method is used to get the audio embeddings from input features (a log-mel spectrogram), i.e. it runs the audio encoder and the multi-modal projector.
|
||||
Args:
|
||||
input_features (`torch.FloatTensor`):
|
||||
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
|
||||
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
|
||||
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
|
||||
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
|
||||
and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
|
||||
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
|
||||
Mask to avoid performing attention on padded feature indices.
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`:
|
||||
The audio embeddings.
|
||||
"""
|
||||
|
||||
# Encode audio
|
||||
encoder_output = self.audio_tower(input_features, input_features_mask=input_features_mask)
|
||||
audio_embeds = self.multi_modal_projector(encoder_output.last_hidden_state)
|
||||
|
||||
# Mask according to avg pooling (which is after attention blocks)
|
||||
post_lengths = (input_features_mask.sum(-1) - 2) // 2 + 1
|
||||
valid_mask = torch.arange(audio_embeds.shape[1], device=post_lengths.device)[None, :] < post_lengths[:, None]
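# Boolean indexing below flattens the batch: the result has shape (total_valid_frames, hidden_size),
# i.e. one embedding per `<sound>` placeholder token produced by the processor.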
|
||||
audio_embeds = audio_embeds[valid_mask.to(audio_embeds.device)]
|
||||
return audio_embeds
|
||||
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
input_features: Optional[torch.FloatTensor] = None,
|
||||
input_features_mask: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
logits_to_keep: Union[int, torch.Tensor] = 0,
|
||||
**kwargs: Unpack[TransformersKwargs],
|
||||
) -> CausalLMOutputWithPast:
|
||||
r"""
|
||||
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
|
||||
Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
||||
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
|
||||
|
||||
>>> model_id = "nvidia/audio-flamingo-3-hf"
|
||||
>>> processor = AutoProcessor.from_pretrained(model_id)
|
||||
>>> model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
|
||||
|
||||
>>> conversations = [
|
||||
>>> [
|
||||
>>> {
|
||||
>>> "role": "user",
|
||||
>>> "content": [
|
||||
>>> {"type": "text", "text": "Transcribe the input speech."},
|
||||
>>> {
|
||||
>>> "type": "audio",
|
||||
>>> "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
|
||||
>>> },
|
||||
>>> ],
|
||||
>>> }
|
||||
>>> ],
|
||||
>>> [
|
||||
>>> {
|
||||
>>> "role": "user",
|
||||
>>> "content": [
|
||||
>>> {
|
||||
>>> "type": "text",
|
||||
>>> "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
|
||||
>>> },
|
||||
>>> {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
|
||||
>>> ],
|
||||
>>> }
|
||||
>>> ],
|
||||
>>> ]
|
||||
|
||||
>>> inputs = processor.apply_chat_template(
|
||||
>>> conversations,
|
||||
>>> tokenize=True,
|
||||
>>> add_generation_prompt=True,
|
||||
>>> return_dict=True,
|
||||
>>> ).to(model.device)
|
||||
|
||||
>>> outputs = model.generate(**inputs, max_new_tokens=500)
|
||||
|
||||
>>> decoded_outputs = processor.batch_decode(
|
||||
>>> outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
|
||||
>>> )
|
||||
>>> print(decoded_outputs)
|
||||
["The spoken content of the audio is...", "The track's calming and meditative feel can be attributed to..."]
|
||||
```"""
|
||||
|
||||
if inputs_embeds is None:
|
||||
inputs_embeds = self.get_input_embeddings()(input_ids)
|
||||
|
||||
if input_features is not None and input_ids is not None:
|
||||
audio_embeds = self.get_audio_features(input_features, input_features_mask)
|
||||
|
||||
# replace text-audio token placeholders with audio embeddings
|
||||
audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
|
||||
inputs_embeds = inputs_embeds.masked_scatter(
|
||||
audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
|
||||
)
|
||||
|
||||
outputs: CausalLMOutputWithPast = self.language_model(
|
||||
inputs_embeds=inputs_embeds,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
labels=labels,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
logits_to_keep=logits_to_keep,
|
||||
**kwargs,
|
||||
)
|
||||
return outputs
|
||||
|
||||
def prepare_inputs_for_generation(self, *args, **kwargs):
|
||||
# Overwritten -- we should not pass input_features when we are in cached decoding stage
|
||||
|
||||
input_features = kwargs.pop("input_features", None)
|
||||
input_features_mask = kwargs.pop("input_features_mask", None)
|
||||
cache_position = kwargs.get("cache_position")
|
||||
|
||||
model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)
|
||||
|
||||
if cache_position is not None and cache_position[0] == 0:
|
||||
# input_features should only be passed when we are not in cached decoding stage
|
||||
if input_features is not None:
|
||||
model_inputs["input_features"] = input_features
|
||||
if input_features_mask is not None:
|
||||
model_inputs["input_features_mask"] = input_features_mask
|
||||
|
||||
return model_inputs
|
||||
|
||||
|
||||
__all__ = ["AudioFlamingo3ForConditionalGeneration", "AudioFlamingo3PreTrainedModel", "AudioFlamingo3Encoder"]
|
||||
307
src/transformers/models/audioflamingo3/modular_audioflamingo3.py
Normal file
@@ -0,0 +1,307 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
|
||||
# reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache
|
||||
from ...masking_utils import eager_mask, padding_mask_function
|
||||
from ...modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
|
||||
from ..qwen2_audio.modeling_qwen2_audio import (
|
||||
Qwen2AudioEncoder,
|
||||
Qwen2AudioPreTrainedModel,
|
||||
)
|
||||
from ..voxtral.modeling_voxtral import VoxtralForConditionalGeneration, VoxtralMultiModalProjector
|
||||
from ..whisper.modeling_whisper import WhisperEncoderLayer
|
||||
from .configuration_audioflamingo3 import AudioFlamingo3Config
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class AudioFlamingo3EncoderLayer(WhisperEncoderLayer):
|
||||
pass
|
||||
|
||||
|
||||
class AudioFlamingo3PreTrainedModel(Qwen2AudioPreTrainedModel):
|
||||
pass
|
||||
|
||||
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
The audio model from AudioFlamingo3 without any head or projection on top.
|
||||
"""
|
||||
)
|
||||
class AudioFlamingo3Encoder(Qwen2AudioEncoder):
|
||||
"""
|
||||
AudioFlamingo3 encoder: Whisper encoder, average pool (time/2), then LayerNorm.
|
||||
"""
|
||||
|
||||
@can_return_tuple
|
||||
def forward(
|
||||
self,
|
||||
input_features: torch.Tensor,
|
||||
input_features_mask: Optional[torch.Tensor] = None,
|
||||
):
|
||||
r"""
|
||||
Args:
|
||||
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
|
||||
Log-Mel features extracted from raw audio. Use the processor/feature extractor to compute and pad
|
||||
these features from waveform input.
|
||||
input_features_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
"""
|
||||
|
||||
# Prepare attention mask for transformer layers
|
||||
batch_size = input_features.shape[0]
|
||||
seq_len = (input_features.shape[-1] - 1) // 2 + 1 # After conv2 downsampling
|
||||
|
||||
input_features_lengths = input_features_mask.sum(-1)
|
||||
input_features_lengths = (input_features_lengths - 1) // 2 + 1 # conv2 downsampling
|
||||
input_features_mask = torch.arange(seq_len, device=input_features.device) < input_features_lengths[:, None]
|
||||
attention_mask = eager_mask(
|
||||
batch_size=batch_size,
|
||||
cache_position=torch.arange(seq_len, device=input_features.device),
|
||||
kv_length=seq_len,
|
||||
mask_function=padding_mask_function(input_features_mask),
|
||||
dtype=self.conv1.weight.dtype,
|
||||
)
|
||||
|
||||
# Conv front-end
|
||||
inputs_embeds = nn.functional.gelu(self.conv1(input_features))
|
||||
inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
|
||||
inputs_embeds = inputs_embeds.permute(0, 2, 1)
|
||||
|
||||
# Add positions, dropout
|
||||
hidden_states = inputs_embeds + self.embed_positions.weight
|
||||
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
|
||||
# Transformer stack
|
||||
for layer in self.layers:
|
||||
drop = self.training and torch.rand([]) < self.layerdrop
|
||||
if not drop:
|
||||
hidden_states = layer(hidden_states, attention_mask)[0]
|
||||
|
||||
# AvgPool (time/2) + LayerNorm
|
||||
hidden_states = hidden_states.permute(0, 2, 1)
|
||||
hidden_states = self.avg_pooler(hidden_states).permute(0, 2, 1)
|
||||
hidden_states = self.layer_norm(hidden_states)
|
||||
|
||||
return BaseModelOutput(
|
||||
last_hidden_state=hidden_states,
|
||||
)
|
||||
|
||||
|
||||
class AudioFlamingo3MultiModalProjector(VoxtralMultiModalProjector):
|
||||
"""
|
||||
Audio adaptor (small MLP) that projects AudioFlamingo3Encoder features
|
||||
to the LLM embedding space so they can replace `<sound>` tokens.
|
||||
"""
|
||||
|
||||
def __init__(self, config: AudioFlamingo3Config):
|
||||
super().__init__()
|
||||
self.linear_1 = nn.Linear(
|
||||
config.audio_config.hidden_size, config.text_config.hidden_size, bias=config.projector_bias
|
||||
)
|
||||
self.act = ACT2FN[config.projector_hidden_act]
|
||||
self.linear_2 = nn.Linear(
|
||||
config.text_config.hidden_size, config.text_config.hidden_size, bias=config.projector_bias
|
||||
)
|
||||
|
||||
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
The AudioFlamingo3 model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Qwen2 language model.
|
||||
"""
|
||||
)
|
||||
class AudioFlamingo3ForConditionalGeneration(VoxtralForConditionalGeneration):
|
||||
_tied_weights_keys = None
|
||||
_tp_plan = None
|
||||
_pp_plan = None
|
||||
_keep_in_fp32_modules_strict = None
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
# Similar to Qwen2Audio
|
||||
if self.language_model._tied_weights_keys is not None:
|
||||
self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]
|
||||
|
||||
def get_audio_features(
|
||||
self, input_features: torch.FloatTensor, input_features_mask: torch.Tensor
|
||||
) -> torch.FloatTensor:
|
||||
"""
|
||||
This method is used to get the audio embeddings from input features (a log-mel spectrogram), i.e. it runs the audio encoder and the multi-modal projector.
|
||||
Args:
|
||||
input_features (`torch.FloatTensor`):
|
||||
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
|
||||
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
|
||||
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
|
||||
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
|
||||
and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
|
||||
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
|
||||
Mask to avoid performing attention on padded feature indices.
|
||||
|
||||
Returns:
|
||||
`torch.FloatTensor`:
|
||||
The audio embeddings.
|
||||
"""
|
||||
|
||||
# Encode audio
|
||||
encoder_output = self.audio_tower(input_features, input_features_mask=input_features_mask)
|
||||
audio_embeds = self.multi_modal_projector(encoder_output.last_hidden_state)
|
||||
|
||||
# Mask according to avg pooling (which is after attention blocks)
|
||||
post_lengths = (input_features_mask.sum(-1) - 2) // 2 + 1
|
||||
valid_mask = torch.arange(audio_embeds.shape[1], device=post_lengths.device)[None, :] < post_lengths[:, None]
|
||||
audio_embeds = audio_embeds[valid_mask.to(audio_embeds.device)]
|
||||
return audio_embeds
|
||||
|
||||
def get_audio_embeds(self):
|
||||
raise NotImplementedError("This method is not supported for AudioFlamingo3ForConditionalGeneration.")
|
||||
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
input_features: Optional[torch.FloatTensor] = None,
|
||||
input_features_mask: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
logits_to_keep: Union[int, torch.Tensor] = 0,
|
||||
**kwargs: Unpack[TransformersKwargs],
|
||||
) -> CausalLMOutputWithPast:
|
||||
r"""
|
||||
input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
|
||||
Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
||||
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
|
||||
|
||||
>>> model_id = "nvidia/audio-flamingo-3-hf"
|
||||
>>> processor = AutoProcessor.from_pretrained(model_id)
|
||||
>>> model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
|
||||
|
||||
>>> conversations = [
|
||||
>>> [
|
||||
>>> {
|
||||
>>> "role": "user",
|
||||
>>> "content": [
|
||||
>>> {"type": "text", "text": "Transcribe the input speech."},
|
||||
>>> {
|
||||
>>> "type": "audio",
|
||||
>>> "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
|
||||
>>> },
|
||||
>>> ],
|
||||
>>> }
|
||||
>>> ],
|
||||
>>> [
|
||||
>>> {
|
||||
>>> "role": "user",
|
||||
>>> "content": [
|
||||
>>> {
|
||||
>>> "type": "text",
|
||||
>>> "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
|
||||
>>> },
|
||||
>>> {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
|
||||
>>> ],
|
||||
>>> }
|
||||
>>> ],
|
||||
>>> ]
|
||||
|
||||
>>> inputs = processor.apply_chat_template(
|
||||
>>> conversations,
|
||||
>>> tokenize=True,
|
||||
>>> add_generation_prompt=True,
|
||||
>>> return_dict=True,
|
||||
>>> ).to(model.device)
|
||||
|
||||
>>> outputs = model.generate(**inputs, max_new_tokens=500)
|
||||
|
||||
>>> decoded_outputs = processor.batch_decode(
|
||||
>>> outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
|
||||
>>> )
|
||||
>>> print(decoded_outputs)
|
||||
["The spoken content of the audio is...", "The track's calming and meditative feel can be attributed to..."]
|
||||
```"""
|
||||
|
||||
if inputs_embeds is None:
|
||||
inputs_embeds = self.get_input_embeddings()(input_ids)
|
||||
|
||||
if input_features is not None and input_ids is not None:
|
||||
audio_embeds = self.get_audio_features(input_features, input_features_mask)
|
||||
|
||||
# replace text-audio token placeholders with audio embeddings
|
||||
audio_token_mask = (input_ids == self.config.audio_token_id).unsqueeze(-1)
|
||||
inputs_embeds = inputs_embeds.masked_scatter(
|
||||
audio_token_mask.to(inputs_embeds.device), audio_embeds.to(inputs_embeds.device)
|
||||
)
|
||||
|
||||
outputs: CausalLMOutputWithPast = self.language_model(
|
||||
inputs_embeds=inputs_embeds,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
labels=labels,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
logits_to_keep=logits_to_keep,
|
||||
**kwargs,
|
||||
)
|
||||
return outputs
|
||||
|
||||
def prepare_inputs_for_generation(self, *args, **kwargs):
|
||||
# Overwritten -- we should not pass input_features when we are in cached decoding stage
|
||||
|
||||
input_features = kwargs.pop("input_features", None)
|
||||
input_features_mask = kwargs.pop("input_features_mask", None)
|
||||
cache_position = kwargs.get("cache_position")
|
||||
|
||||
model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)
|
||||
|
||||
if cache_position is not None and cache_position[0] == 0:
|
||||
# input_features should only be passed when we are not in cached decoding stage
|
||||
if input_features is not None:
|
||||
model_inputs["input_features"] = input_features
|
||||
if input_features_mask is not None:
|
||||
model_inputs["input_features_mask"] = input_features_mask
|
||||
|
||||
return model_inputs
|
||||
|
||||
|
||||
__all__ = ["AudioFlamingo3ForConditionalGeneration", "AudioFlamingo3PreTrainedModel", "AudioFlamingo3Encoder"]
|
||||
318
src/transformers/models/audioflamingo3/processing_audioflamingo3.py
Normal file
@@ -0,0 +1,318 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
|
||||
# reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import re
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...audio_utils import AudioInput, make_list_of_audio
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from ...tokenization_utils_base import TextInput
|
||||
from ...utils import is_torch_available, logging
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
MAX_AUDIO_LEN = 10 * 60 # 10 minutes
|
||||
DEFAULT_TRANSCRIPTION_PROMPT = "Transcribe the input speech."
|
||||
|
||||
|
||||
class AudioFlamingo3ProcessorKwargs(ProcessingKwargs, total=False):
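# Defaults: audio is padded to fixed 30-second windows at 16 kHz (with an attention mask), and
# text is left-padded so batched prompts line up for generation.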
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": True,
|
||||
},
|
||||
"audio_kwargs": {
|
||||
"sampling_rate": 16000,
|
||||
"chunk_length": 30.0,
|
||||
"return_attention_mask": True,
|
||||
"padding": "max_length",
|
||||
},
|
||||
"common_kwargs": {
|
||||
"return_tensors": "pt",
|
||||
"padding_side": "left",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class AudioFlamingo3Processor(ProcessorMixin):
|
||||
r"""
|
||||
Constructs an AudioFlamingo3 processor which wraps an AudioFlamingo3 feature extractor and an AudioFlamingo3
|
||||
tokenizer into a single processor.
|
||||
|
||||
[`AudioFlamingo3Processor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
|
||||
[`Qwen2TokenizerFast`]. See the [`~AudioFlamingo3Processor.__call__`] for more information.
|
||||
|
||||
Args:
|
||||
feature_extractor ([`WhisperFeatureExtractor`]):
|
||||
The feature extractor is a required input.
|
||||
tokenizer ([`Qwen2TokenizerFast`]):
|
||||
The tokenizer is a required input.
|
||||
chat_template (`Optional[str]`, *optional*):
|
||||
The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
|
||||
template will be used.
|
||||
audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
|
||||
Special token used to represent audio inputs in the chat template.
|
||||
"""
|
||||
|
||||
attributes = ["feature_extractor", "tokenizer"]
|
||||
feature_extractor_class = "WhisperFeatureExtractor"
|
||||
tokenizer_class = "Qwen2TokenizerFast"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
feature_extractor,
|
||||
tokenizer,
|
||||
chat_template=None,
|
||||
audio_token="<sound>",
|
||||
):
|
||||
self.audio_token = audio_token
|
||||
self.audio_token_id = tokenizer.convert_tokens_to_ids(audio_token)
|
||||
super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
|
||||
|
||||
    def __call__(
        self,
        text: Union[TextInput, list[TextInput]],
        audio: Optional[AudioInput] = None,
        output_labels: Optional[bool] = False,
        **kwargs: Unpack[AudioFlamingo3ProcessorKwargs],
    ) -> BatchFeature:
        r"""
        Main method to prepare one or several text sequence(s) and audio waveform(s) for the model. This
        method expands `<sound>` placeholders in the text based on the post-pool frame counts of the
        audio windows, then tokenizes the expanded strings and extracts log-mel features
        with [`WhisperFeatureExtractor`]. If `audio` is `None`, no audio processing is performed and
        the text is tokenized as-is (LM-only behavior).

        Args:
            text (`str` or `list[str]`):
                Input sequence or batch of sequences.
            audio (`np.ndarray` or `list[np.ndarray]`):
                Input audio or batch of audios as NumPy arrays. If provided, there must be as many `text` inputs as
                `audio` inputs.
            output_labels (`bool`, *optional*, defaults to `False`):
                Whether to return labels for training.

        Returns:
            [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
            audio features (`input_features`, `input_features_mask`).
        """

        # Merge defaults with user kwargs
        call_kwargs = self._merge_kwargs(
            AudioFlamingo3ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        text_kwargs = call_kwargs["text_kwargs"]
        audio_kwargs = call_kwargs["audio_kwargs"]
        return_tensors = text_kwargs.get("return_tensors")
        if return_tensors != "pt":
            raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")

        if isinstance(text, str):
            text = [text]
        elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")

        audio_inputs = {}
        if audio is not None:
            audio = make_list_of_audio(audio)
            if len(text) != len(audio):
                raise ValueError(f"Got {len(text)} text but {len(audio)} audios; they must match 1:1.")

            # Determine number of chunks per sample, and flatten
            window_size = int(audio_kwargs["sampling_rate"] * audio_kwargs["chunk_length"])
            max_windows = int(MAX_AUDIO_LEN // audio_kwargs["chunk_length"])

            per_sample_windows: list[int] = []
            flat_chunks: list[np.ndarray] = []

            for audio_el in audio:
                n_samples = int(audio_el.shape[0])
                n_win = max(1, (n_samples + window_size - 1) // window_size)
                if n_win > max_windows:
                    logger.warning(
                        f"Audio duration ({n_samples / audio_kwargs['sampling_rate']:.1f}s) exceeds {MAX_AUDIO_LEN}s; truncating to first {MAX_AUDIO_LEN}s."
                    )
                    n_win = max_windows
                per_sample_windows.append(n_win)

                time_cap = min(n_samples, n_win * window_size)
                for i in range(n_win):
                    start = i * window_size
                    end = min((i + 1) * window_size, time_cap)
                    flat_chunks.append(audio_el[start:end])

            # Feature extraction
            audio_inputs = self.feature_extractor(flat_chunks, **audio_kwargs)
            padding_mask = audio_inputs.pop("attention_mask")
            audio_inputs["input_features_mask"] = padding_mask

            # Compute the number of audio placeholder tokens per sample
            audio_lengths = torch.stack([s.sum() for s in torch.split(padding_mask.sum(-1), per_sample_windows)])
            conv_output_lengths = (audio_lengths - 1) // 2 + 1  # After conv2 downsampling
            audio_tokens_lengths = (conv_output_lengths - 2) // 2 + 1  # After avg pooling

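            # Illustrative sizing (assuming standard Whisper-style features): a full 30 s window at
            # 16 kHz spans 3000 mel frames, so conv_output_lengths = (3000 - 1) // 2 + 1 = 1500 and
            # audio_tokens_lengths = (1500 - 2) // 2 + 1 = 750 placeholder tokens for that window.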
            # expand audio tokens in text
            for i, audio_length in enumerate(audio_tokens_lengths):
                expanded = re.sub(re.escape(self.audio_token), self.audio_token * audio_length, text[i])
                text[i] = expanded

        # Tokenize
        text_inputs = self.tokenizer(text, **text_kwargs)

        data = {**text_inputs, **audio_inputs}
        if output_labels:
            labels = data["input_ids"].clone()
            labels[labels == self.audio_token_id] = -100
            labels[labels == self.tokenizer.pad_token_id] = -100
            data["labels"] = labels

        return BatchFeature(data=data, tensor_type=return_tensors)

    @property
    def model_input_names(self) -> list[str]:
        tok_names = self.tokenizer.model_input_names
        fea_names = self.feature_extractor.model_input_names
        return list(dict.fromkeys(tok_names + fea_names + ["input_features_mask"]))

    def apply_transcription_request(
        self,
        audio: Union[str, list[str], AudioInput],
        prompt: Optional[Union[str, list[str]]] = None,
        **kwargs: Unpack[AudioFlamingo3ProcessorKwargs],
    ) -> BatchFeature:
        """
        Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

        Args:
            audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
                the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
            prompt (`str` or `list[str]`, *optional*):
                Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
                each sample uses `"Transcribe the input speech."`.
            **kwargs:
                Additional keyword arguments forwarded to [`~AudioFlamingo3Processor.apply_chat_template`] (for example
                `text_kwargs`, `audio_kwargs`, ...).

        Returns:
            [`BatchFeature`]: Processor outputs ready to be passed to [`AudioFlamingo3ForConditionalGeneration.generate`].
        """

        if isinstance(audio, str):
            audio_items: list[Union[str, np.ndarray]] = [audio]
        elif isinstance(audio, (list, tuple)) and audio and all(isinstance(el, str) for el in audio):
            audio_items = list(audio)
        else:
            audio_items = list(make_list_of_audio(audio))
            if is_torch_available():
                audio_items = [el.detach().cpu().numpy() if isinstance(el, torch.Tensor) else el for el in audio_items]

        batch_size = len(audio_items)
        if batch_size == 0:
            raise ValueError("`audio` must contain at least one sample.")

        if prompt is None:
            prompts = [DEFAULT_TRANSCRIPTION_PROMPT] * batch_size
        elif isinstance(prompt, str):
            prompts = [prompt] * batch_size
        elif isinstance(prompt, (list, tuple)):
            if len(prompt) != batch_size:
                raise ValueError(
                    f"Received {len(prompt)} prompt(s) for {batch_size} audio sample(s); counts must match."
                )
            prompts = []
            for item in prompt:
                if item is None:
                    prompts.append(DEFAULT_TRANSCRIPTION_PROMPT)
                elif isinstance(item, str):
                    prompts.append(item)
                else:
                    raise TypeError("Each prompt must be a string or `None`.")
        else:
            raise TypeError("`prompt` must be a string, a sequence of strings, or `None`.")

        conversations = [
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt_text},
                        {"type": "audio", "path": audio_item}
                        if isinstance(audio_item, str)
                        else {"type": "audio", "audio": audio_item},
                    ],
                }
            ]
            for prompt_text, audio_item in zip(prompts, audio_items)
        ]

        return self.apply_chat_template(
            conversations,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            **kwargs,
        )

    def batch_decode(self, *args, strip_prefix=False, **kwargs):
        """
        Forward arguments to [`~PreTrainedTokenizer.batch_decode`] and optionally remove the assistant framing the
        model was trained to produce.

        AF3 transcription requests respond with sentences such as `"The spoken content of the audio is \"...\"."`.
        Setting `strip_prefix=True` trims that fixed framing, leaving just the transcription text.
        """
        decoded = self.tokenizer.batch_decode(*args, **kwargs)
        if strip_prefix:
            decoded = [self._strip_assistant_prefix_and_quotes(text) for text in decoded]
        return decoded

    def _strip_assistant_prefix_and_quotes(self, text: str) -> str:
        """
        Remove the assistant prefix and surrounding quotes from a decoded transcription string.
        """

        stripped = text.strip()

        for prefix in (
            "The spoken content of the audio is",
            "The transcription of the audio is",
        ):
            if stripped.startswith(prefix):
                stripped = stripped[len(prefix) :].strip()
                break

        if stripped.endswith("."):
            stripped = stripped[:-1].strip()

        if len(stripped) >= 2 and stripped[0] == stripped[-1] and stripped[0] in {"'", '"'}:
            stripped = stripped[1:-1].strip()

        return stripped


__all__ = ["AudioFlamingo3Processor"]
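A minimal end-to-end transcription sketch for the processor above (the checkpoint id is a placeholder for the released AF3 repository, and "sample.wav" is an assumed 16 kHz mono file):

from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

model_id = "nvidia/audio-flamingo-3-hf"  # placeholder id, substitute the actual Hub repository
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

# Builds the default "Transcribe the input speech." user turn and loads the audio file.
inputs = processor.apply_transcription_request(audio="sample.wav").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)

# strip_prefix=True removes the fixed assistant framing around the transcription.
prompt_len = inputs["input_ids"].shape[1]
print(processor.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True, strip_prefix=True)[0])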
@ -45,6 +45,8 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("aria", "AriaConfig"),
|
||||
("aria_text", "AriaTextConfig"),
|
||||
("audio-spectrogram-transformer", "ASTConfig"),
|
||||
("audioflamingo3", "AudioFlamingo3Config"),
|
||||
("audioflamingo3_encoder", "AudioFlamingo3EncoderConfig"),
|
||||
("autoformer", "AutoformerConfig"),
|
||||
("aya_vision", "AyaVisionConfig"),
|
||||
("bamba", "BambaConfig"),
|
||||
@ -477,6 +479,8 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("aria", "Aria"),
|
||||
("aria_text", "AriaText"),
|
||||
("audio-spectrogram-transformer", "Audio Spectrogram Transformer"),
|
||||
("audioflamingo3", "AudioFlamingo3"),
|
||||
("audioflamingo3_encoder", "AudioFlamingo3Encoder"),
|
||||
("autoformer", "Autoformer"),
|
||||
("aya_vision", "AyaVision"),
|
||||
("bamba", "Bamba"),
|
||||
@ -960,6 +964,7 @@ DEPRECATED_MODELS = [
|
||||
|
||||
SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict[str, str](
|
||||
[
|
||||
("audioflamingo3_encoder", "audioflamingo3"),
|
||||
("openai-gpt", "openai"),
|
||||
("data2vec-audio", "data2vec"),
|
||||
("data2vec-text", "data2vec"),
|
||||
|
||||
@ -15,7 +15,6 @@
|
||||
"""AutoFeatureExtractor class."""
|
||||
|
||||
import importlib
|
||||
import json
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import Optional, Union
|
||||
@ -24,7 +23,7 @@ from typing import Optional, Union
|
||||
from ...configuration_utils import PreTrainedConfig
|
||||
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
|
||||
from ...feature_extraction_utils import FeatureExtractionMixin
|
||||
from ...utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, cached_file, logging
|
||||
from ...utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, cached_file, logging, safe_load_json_file
|
||||
from .auto_factory import _LazyAutoMapping
|
||||
from .configuration_auto import (
|
||||
CONFIG_MAPPING_NAMES,
|
||||
@ -175,9 +174,10 @@ def get_feature_extractor_config(
|
||||
feature_extractor.save_pretrained("feature-extractor-test")
|
||||
feature_extractor_config = get_feature_extractor_config("feature-extractor-test")
|
||||
```"""
|
||||
resolved_config_file = cached_file(
|
||||
# Load with a priority given to the nested processor config, if available in repo
|
||||
resolved_processor_file = cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
FEATURE_EXTRACTOR_NAME,
|
||||
filename=PROCESSOR_NAME,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
@ -186,16 +186,37 @@ def get_feature_extractor_config(
|
||||
local_files_only=local_files_only,
|
||||
_raise_exceptions_for_gated_repo=False,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
_raise_exceptions_for_connection_errors=False,
|
||||
)
|
||||
if resolved_config_file is None:
|
||||
logger.info(
|
||||
"Could not locate the feature extractor configuration file, will try to use the model config instead."
|
||||
)
|
||||
resolved_feature_extractor_file = cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
filename=FEATURE_EXTRACTOR_NAME,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
token=token,
|
||||
revision=revision,
|
||||
local_files_only=local_files_only,
|
||||
_raise_exceptions_for_gated_repo=False,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
)
|
||||
|
||||
# Return an empty dict if none of the possible files is found in the repo
|
||||
if not resolved_feature_extractor_file and not resolved_processor_file:
|
||||
logger.info("Could not locate the feature extractor configuration file.")
|
||||
return {}
|
||||
|
||||
with open(resolved_config_file, encoding="utf-8") as reader:
|
||||
return json.load(reader)
|
||||
# Load feature_extractor dict. Priority goes as (nested config if found -> feature extractor config)
|
||||
# We are downloading both configs because almost all models have a `processor_config.json` but
|
||||
# not all of these are nested. We need to check if it was saved recently as nested or if it is legacy style
|
||||
feature_extractor_dict = {}
|
||||
if resolved_processor_file is not None:
|
||||
processor_dict = safe_load_json_file(resolved_processor_file)
|
||||
if "feature_extractor" in processor_dict:
|
||||
feature_extractor_dict = processor_dict["feature_extractor"]
|
||||
|
||||
if resolved_feature_extractor_file is not None and feature_extractor_dict is None:
|
||||
feature_extractor_dict = safe_load_json_file(resolved_feature_extractor_file)
|
||||
return feature_extractor_dict
|
||||
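For reference, a minimal sketch of the nested layout this lookup assumes (key names taken from the code above; the exact contents saved by a given processor will differ):

import json

# Illustrative processor_config.json saved in the new nested style.
nested = {
    "processor_class": "AudioFlamingo3Processor",
    "feature_extractor": {"feature_extractor_type": "WhisperFeatureExtractor", "feature_size": 128},
}

# get_feature_extractor_config prefers the nested "feature_extractor" block when present,
# and only falls back to the legacy preprocessor_config.json otherwise.
feature_extractor_dict = nested.get("feature_extractor", {})
print(json.dumps(feature_extractor_dict, indent=2))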
|
||||
|
||||
class AutoFeatureExtractor:
|
||||
|
||||
@ -15,7 +15,6 @@
|
||||
"""AutoImageProcessor class."""
|
||||
|
||||
import importlib
|
||||
import json
|
||||
import os
|
||||
import warnings
|
||||
from collections import OrderedDict
|
||||
@ -29,12 +28,14 @@ from ...image_processing_utils_fast import BaseImageProcessorFast
|
||||
from ...utils import (
|
||||
CONFIG_NAME,
|
||||
IMAGE_PROCESSOR_NAME,
|
||||
PROCESSOR_NAME,
|
||||
cached_file,
|
||||
is_timm_config_dict,
|
||||
is_timm_local_checkpoint,
|
||||
is_torchvision_available,
|
||||
is_vision_available,
|
||||
logging,
|
||||
safe_load_json_file,
|
||||
)
|
||||
from ...utils.import_utils import requires
|
||||
from .auto_factory import _LazyAutoMapping
|
||||
@ -319,9 +320,10 @@ def get_image_processor_config(
|
||||
image_processor.save_pretrained("image-processor-test")
|
||||
image_processor_config = get_image_processor_config("image-processor-test")
|
||||
```"""
|
||||
resolved_config_file = cached_file(
|
||||
# Load with a priority given to the nested processor config, if available in repo
|
||||
resolved_processor_file = cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
IMAGE_PROCESSOR_NAME,
|
||||
filename=PROCESSOR_NAME,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
@ -330,16 +332,38 @@ def get_image_processor_config(
|
||||
local_files_only=local_files_only,
|
||||
_raise_exceptions_for_gated_repo=False,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
_raise_exceptions_for_connection_errors=False,
|
||||
)
|
||||
if resolved_config_file is None:
|
||||
logger.info(
|
||||
"Could not locate the image processor configuration file, will try to use the model config instead."
|
||||
)
|
||||
resolved_image_processor_file = cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
filename=IMAGE_PROCESSOR_NAME,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
token=token,
|
||||
revision=revision,
|
||||
local_files_only=local_files_only,
|
||||
_raise_exceptions_for_gated_repo=False,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
)
|
||||
|
||||
# Return an empty dict if none of the possible files is found in the repo
|
||||
if not resolved_image_processor_file and not resolved_processor_file:
|
||||
logger.info("Could not locate the image processor configuration file.")
|
||||
return {}
|
||||
|
||||
with open(resolved_config_file, encoding="utf-8") as reader:
|
||||
return json.load(reader)
|
||||
# Load image_processor dict. Priority goes as (nested config if found -> image processor config)
|
||||
# We are downloading both configs because almost all models have a `processor_config.json` but
|
||||
# not all of these are nested. We need to check if it was saved recently as nested or if it is legacy style
|
||||
image_processor_dict = {}
|
||||
if resolved_processor_file is not None:
|
||||
processor_dict = safe_load_json_file(resolved_processor_file)
|
||||
if "image_processor" in processor_dict:
|
||||
image_processor_dict = processor_dict["image_processor"]
|
||||
|
||||
if resolved_image_processor_file is not None and image_processor_dict is None:
|
||||
image_processor_dict = safe_load_json_file(resolved_image_processor_file)
|
||||
|
||||
return image_processor_dict
|
||||
|
||||
|
||||
def _warning_fast_image_processor_available(fast_class):
|
||||
|
||||
@ -53,6 +53,8 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("aria", "AriaModel"),
|
||||
("aria_text", "AriaTextModel"),
|
||||
("audio-spectrogram-transformer", "ASTModel"),
|
||||
("audioflamingo3", "AudioFlamingo3ForConditionalGeneration"),
|
||||
("audioflamingo3_encoder", "AudioFlamingo3Encoder"),
|
||||
("autoformer", "AutoformerModel"),
|
||||
("aya_vision", "AyaVisionModel"),
|
||||
("bamba", "BambaModel"),
|
||||
@ -445,6 +447,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
# Model for pre-training mapping
|
||||
("albert", "AlbertForPreTraining"),
|
||||
("audioflamingo3", "AudioFlamingo3ForConditionalGeneration"),
|
||||
("bart", "BartForConditionalGeneration"),
|
||||
("bert", "BertForPreTraining"),
|
||||
("big_bird", "BigBirdForPreTraining"),
|
||||
@ -1159,6 +1162,7 @@ MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = OrderedDict(
|
||||
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
# Model for Seq2Seq Causal LM mapping
|
||||
("audioflamingo3", "AudioFlamingo3ForConditionalGeneration"),
|
||||
("bart", "BartForConditionalGeneration"),
|
||||
("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"),
|
||||
("blenderbot", "BlenderbotForConditionalGeneration"),
|
||||
@ -1700,6 +1704,7 @@ MODEL_FOR_BACKBONE_MAPPING_NAMES = OrderedDict(
|
||||
("dinov2", "Dinov2Backbone"),
|
||||
("dinov2_with_registers", "Dinov2WithRegistersBackbone"),
|
||||
("dinov3_convnext", "DINOv3ConvNextBackbone"),
|
||||
("dinov3_vit", "DINOv3ViTBackbone"),
|
||||
("focalnet", "FocalNetBackbone"),
|
||||
("hgnet_v2", "HGNetV2Backbone"),
|
||||
("hiera", "HieraBackbone"),
|
||||
|
||||
@ -48,6 +48,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
|
||||
("align", "AlignProcessor"),
|
||||
("altclip", "AltCLIPProcessor"),
|
||||
("aria", "AriaProcessor"),
|
||||
("audioflamingo3", "AudioFlamingo3Processor"),
|
||||
("aya_vision", "AyaVisionProcessor"),
|
||||
("bark", "BarkProcessor"),
|
||||
("blip", "BlipProcessor"),
|
||||
|
||||
@ -15,7 +15,6 @@
|
||||
"""AutoVideoProcessor class."""
|
||||
|
||||
import importlib
|
||||
import json
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
@ -23,7 +22,16 @@ from typing import TYPE_CHECKING, Optional, Union
|
||||
# Build the list of all video processors
|
||||
from ...configuration_utils import PreTrainedConfig
|
||||
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
|
||||
from ...utils import CONFIG_NAME, VIDEO_PROCESSOR_NAME, cached_file, is_torchvision_available, logging
|
||||
from ...utils import (
|
||||
CONFIG_NAME,
|
||||
IMAGE_PROCESSOR_NAME,
|
||||
PROCESSOR_NAME,
|
||||
VIDEO_PROCESSOR_NAME,
|
||||
cached_file,
|
||||
is_torchvision_available,
|
||||
logging,
|
||||
safe_load_json_file,
|
||||
)
|
||||
from ...utils.import_utils import requires
|
||||
from ...video_processing_utils import BaseVideoProcessor
|
||||
from .auto_factory import _LazyAutoMapping
|
||||
@ -168,24 +176,59 @@ def get_video_processor_config(
|
||||
video_processor.save_pretrained("video-processor-test")
|
||||
video_processor = get_video_processor_config("video-processor-test")
|
||||
```"""
|
||||
resolved_config_file = cached_file(
|
||||
# Load with a priority given to the nested processor config, if available in repo
|
||||
resolved_processor_file = cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
VIDEO_PROCESSOR_NAME,
|
||||
filename=PROCESSOR_NAME,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
token=token,
|
||||
revision=revision,
|
||||
local_files_only=local_files_only,
|
||||
_raise_exceptions_for_gated_repo=False,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
)
|
||||
if resolved_config_file is None:
|
||||
logger.info(
|
||||
"Could not locate the video processor configuration file, will try to use the model config instead."
|
||||
resolved_video_processor_files = [
|
||||
resolved_file
|
||||
for filename in [VIDEO_PROCESSOR_NAME, IMAGE_PROCESSOR_NAME]
|
||||
if (
|
||||
resolved_file := cached_file(
|
||||
pretrained_model_name_or_path,
|
||||
filename=filename,
|
||||
cache_dir=cache_dir,
|
||||
force_download=force_download,
|
||||
proxies=proxies,
|
||||
token=token,
|
||||
revision=revision,
|
||||
local_files_only=local_files_only,
|
||||
_raise_exceptions_for_gated_repo=False,
|
||||
_raise_exceptions_for_missing_entries=False,
|
||||
_raise_exceptions_for_connection_errors=False,
|
||||
)
|
||||
)
|
||||
is not None
|
||||
]
|
||||
resolved_video_processor_file = resolved_video_processor_files[0] if resolved_video_processor_files else None
|
||||
|
||||
# Return an empty dict if none of the possible files is found in the repo
|
||||
if not resolved_video_processor_file and not resolved_processor_file:
|
||||
logger.info("Could not locate the video processor configuration file.")
|
||||
return {}
|
||||
|
||||
with open(resolved_config_file, encoding="utf-8") as reader:
|
||||
return json.load(reader)
|
||||
# Load video_processor dict. Priority goes as (nested config if found -> video processor config -> image processor config)
|
||||
# We are downloading both configs because almost all models have a `processor_config.json` but
|
||||
# not all of these are nested. We need to check if it was saved recently as nested or if it is legacy style
|
||||
video_processor_dict = {}
|
||||
if resolved_processor_file is not None:
|
||||
processor_dict = safe_load_json_file(resolved_processor_file)
|
||||
if "video_processor" in processor_dict:
|
||||
video_processor_dict = processor_dict["video_processor"]
|
||||
|
||||
if resolved_video_processor_file is not None and video_processor_dict is None:
|
||||
video_processor_dict = safe_load_json_file(resolved_video_processor_file)
|
||||
|
||||
return video_processor_dict
|
||||
|
||||
|
||||
@requires(backends=("vision", "torchvision"))
|
||||
|
||||
@ -22,7 +22,6 @@ if TYPE_CHECKING:
|
||||
from .modeling_bert import *
|
||||
from .tokenization_bert import *
|
||||
from .tokenization_bert_fast import *
|
||||
from .tokenization_bert_tf import *
|
||||
else:
|
||||
import sys
|
||||
|
||||
|
||||
@ -138,7 +138,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model and processor to the 🤗 hub.",
|
||||
help="Whether or not to push the converted model and processor to the Hugging Face hub.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -257,7 +257,9 @@ if __name__ == "__main__":
|
||||
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -274,7 +274,7 @@ if __name__ == "__main__":
|
||||
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
|
||||
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
|
||||
)
|
||||
parser.add_argument("--sample_rate", default=None, type=str, help="Sample rate used by DacFeatureExtractor")
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -258,7 +258,7 @@ def convert_wav2vec2_checkpoint(
|
||||
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
|
||||
print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7
|
||||
success = torch.allclose(our_output, their_output, atol=1e-3)
|
||||
print("Do both models output the same tensors?", "🔥" if success else "💩")
|
||||
print("Do both models output the same tensors?", "[PASS]" if success else "[FAIL]")
|
||||
if not success:
|
||||
raise Exception("Something went wRoNg")
|
||||
|
||||
|
||||
@ -180,7 +180,7 @@ def convert_data2vec_checkpoint_to_pytorch(
|
||||
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
|
||||
print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7
|
||||
success = torch.allclose(our_output, their_output, atol=1e-3)
|
||||
print("Do both models output the same tensors?", "🔥" if success else "💩")
|
||||
print("Do both models output the same tensors?", "[PASS]" if success else "[FAIL]")
|
||||
if not success:
|
||||
raise Exception("Something went wRoNg")
|
||||
|
||||
|
||||
@ -341,7 +341,7 @@ def main():
|
||||
|
||||
print(f"max_absolute_diff = {max_absolute_diff}")
|
||||
success = torch.allclose(hf_output, orig_model_output, atol=1e-3)
|
||||
print("Do both models output the same tensors?", "🔥" if success else "💩")
|
||||
print("Do both models output the same tensors?", "[PASS]" if success else "[FAIL]")
|
||||
if not success:
|
||||
raise Exception("Something went wRoNg")
|
||||
|
||||
|
||||
@ -222,7 +222,9 @@ if __name__ == "__main__":
|
||||
help="Path to the folder to output PyTorch model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
convert_deformable_detr_checkpoint(
|
||||
|
||||
@ -299,9 +299,9 @@ def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_f
|
||||
success = np.allclose(gluon_layer, hf_layer, atol=1e-3)
|
||||
|
||||
if success:
|
||||
print("✔️ Both model do output the same tensors")
|
||||
print("[SUCCESS] Both models do output the same tensors")
|
||||
else:
|
||||
print("❌ Both model do **NOT** output the same tensors")
|
||||
print("[FAIL] Both models do **NOT** output the same tensors")
|
||||
print("Absolute difference is:", max_absolute_diff)
|
||||
|
||||
|
||||
|
||||
@ -313,7 +313,9 @@ if __name__ == "__main__":
|
||||
help="Path to the folder to output PyTorch model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
|
||||
|
||||
@ -320,7 +320,9 @@ if __name__ == "__main__":
|
||||
help="Path to the folder to output PyTorch model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
|
||||
|
||||
@ -22,7 +22,7 @@ from ....activations import ACT2FN
|
||||
from ....cache_utils import Cache
|
||||
from ....modeling_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPastAndCrossAttentions
|
||||
from ....modeling_utils import PreTrainedModel
|
||||
from ....utils import DUMMY_INPUTS, DUMMY_MASK, auto_docstring
|
||||
from ....utils import DUMMY_INPUTS, DUMMY_MASK
|
||||
from .configuration_gptsan_japanese import GPTSanJapaneseConfig
|
||||
|
||||
|
||||
@ -635,7 +635,6 @@ class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel):
|
||||
def set_input_embeddings(self, new_embeddings):
|
||||
self.embed_tokens = new_embeddings
|
||||
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
|
||||
@ -278,7 +278,9 @@ if __name__ == "__main__":
|
||||
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -284,7 +284,9 @@ if __name__ == "__main__":
|
||||
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -18,12 +18,13 @@ from typing import Optional
|
||||
|
||||
from ...configuration_utils import PreTrainedConfig
|
||||
from ...utils import logging
|
||||
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class DINOv3ViTConfig(PreTrainedConfig):
|
||||
class DINOv3ViTConfig(BackboneConfigMixin, PreTrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`DINOv3Model`]. It is used to instantiate an
|
||||
DINOv3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
||||
@ -86,6 +87,16 @@ class DINOv3ViTConfig(PreTrainedConfig):
|
||||
pos_embed_rescale (`float`, *optional*, defaults to 2.0):
|
||||
Amount to randomly rescale position embedding coordinates in log-uniform value in [1/rescale, rescale],
|
||||
applied only in training mode if not `None`.
|
||||
out_features (`list[str]`, *optional*):
|
||||
If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
|
||||
(depending on how many stages the model has). Will default to the last stage if unset.
|
||||
out_indices (`list[int]`, *optional*):
|
||||
If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc.
|
||||
(depending on how many stages the model has). Will default to the last stage if unset.
|
||||
apply_layernorm (`bool`, *optional*, defaults to `True`):
|
||||
Whether to apply layer normalization to the feature maps when used as backbone.
|
||||
reshape_hidden_states (`bool`, *optional*, defaults to `True`):
|
||||
Whether to reshape the hidden states to spatial dimensions when used as backbone.
|
||||
|
||||
Example:
|
||||
|
||||
@ -131,6 +142,10 @@ class DINOv3ViTConfig(PreTrainedConfig):
|
||||
pos_embed_shift: Optional[float] = None,
|
||||
pos_embed_jitter: Optional[float] = None,
|
||||
pos_embed_rescale: Optional[float] = 2.0,
|
||||
out_features: Optional[list[str]] = None,
|
||||
out_indices: Optional[list[int]] = None,
|
||||
apply_layernorm: bool = True,
|
||||
reshape_hidden_states: bool = True,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
@ -161,6 +176,18 @@ class DINOv3ViTConfig(PreTrainedConfig):
|
||||
self.pos_embed_shift = pos_embed_shift
|
||||
self.pos_embed_jitter = pos_embed_jitter
|
||||
self.pos_embed_rescale = pos_embed_rescale
|
||||
# Initialize backbone-specific configuration
|
||||
self.apply_layernorm = apply_layernorm
|
||||
self.reshape_hidden_states = reshape_hidden_states
|
||||
|
||||
# Initialize backbone stage names
|
||||
stage_names = ["stem"] + [f"stage{i}" for i in range(1, num_hidden_layers + 1)]
|
||||
self.stage_names = stage_names
|
||||
|
||||
# Initialize backbone features/indices
|
||||
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
|
||||
out_features=out_features, out_indices=out_indices, stage_names=stage_names
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["DINOv3ViTConfig"]
|
||||
|
||||
@ -29,11 +29,12 @@ from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPooling
|
||||
from ...modeling_outputs import BackboneOutput, BaseModelOutputWithPooling
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...pytorch_utils import compile_compatible_method_lru_cache
|
||||
from ...utils import TransformersKwargs, auto_docstring
|
||||
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
|
||||
from ...utils.backbone_utils import BackboneMixin
|
||||
from ...utils.generic import check_model_inputs
|
||||
from .configuration_dinov3_vit import DINOv3ViTConfig
|
||||
|
||||
@ -522,10 +523,79 @@ class DINOv3ViTModel(DINOv3ViTPreTrainedModel):
|
||||
sequence_output = self.norm(hidden_states)
|
||||
pooled_output = sequence_output[:, 0, :]
|
||||
|
||||
return BaseModelOutputWithPooling(
|
||||
last_hidden_state=sequence_output,
|
||||
pooler_output=pooled_output,
|
||||
)
|
||||
return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output)
|
||||
|
||||
|
||||
__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel"]
|
||||
@auto_docstring
|
||||
class DINOv3ViTBackbone(DINOv3ViTPreTrainedModel, BackboneMixin):
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
super()._init_backbone(config)
|
||||
|
||||
self.embeddings = DINOv3ViTEmbeddings(config)
|
||||
self.rope_embeddings = DINOv3ViTRopePositionEmbedding(config)
|
||||
self.layer = nn.ModuleList([DINOv3ViTLayer(config) for _ in range(config.num_hidden_layers)])
|
||||
self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.embeddings.patch_embeddings
|
||||
|
||||
@check_model_inputs()
|
||||
@can_return_tuple
|
||||
def forward(
|
||||
self,
|
||||
pixel_values: torch.Tensor,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
**kwargs: Unpack[TransformersKwargs],
|
||||
) -> BackboneOutput:
|
||||
pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype)
|
||||
hidden_states = self.embeddings(pixel_values)
|
||||
position_embeddings = self.rope_embeddings(pixel_values)
|
||||
|
||||
stage_hidden_states: list[torch.Tensor] = [hidden_states]
|
||||
|
||||
for layer_module in self.layer:
|
||||
hidden_states = layer_module(hidden_states, position_embeddings=position_embeddings)
|
||||
stage_hidden_states.append(hidden_states)
|
||||
|
||||
batch_size, _, image_height, image_width = pixel_values.shape
|
||||
patch_size = self.config.patch_size
|
||||
num_patches_height = image_height // patch_size
|
||||
num_patches_width = image_width // patch_size
|
||||
|
||||
num_prefix = 1 + getattr(self.config, "num_register_tokens", 0)
|
||||
|
||||
feature_maps = []
|
||||
sequence_output = None
|
||||
last_stage_idx = len(self.stage_names) - 1
|
||||
for idx, (stage_name, hidden_state) in enumerate(zip(self.stage_names, stage_hidden_states)):
|
||||
if idx == last_stage_idx:
|
||||
hidden_state = self.norm(hidden_state)
|
||||
sequence_output = hidden_state
|
||||
elif self.config.apply_layernorm:
|
||||
hidden_state = self.norm(hidden_state)
|
||||
|
||||
if stage_name in self.out_features:
|
||||
patch_tokens = hidden_state[:, num_prefix:, :]
|
||||
if self.config.reshape_hidden_states:
|
||||
fmap = (
|
||||
patch_tokens.reshape(batch_size, num_patches_height, num_patches_width, patch_tokens.shape[-1])
|
||||
.permute(0, 3, 1, 2)
|
||||
.contiguous()
|
||||
)
|
||||
else:
|
||||
fmap = patch_tokens
|
||||
|
||||
feature_maps.append(fmap)
|
||||
|
||||
output = BackboneOutput(feature_maps=tuple(feature_maps))
|
||||
output.last_hidden_state = sequence_output
|
||||
|
||||
return output
|
||||
|
||||
|
||||
__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel", "DINOv3ViTBackbone"]
|
||||
|
||||
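A short usage sketch of the new backbone (the checkpoint id is a placeholder and `image` is an assumed PIL image; `out_features` follows the stage names defined in the config above):

import torch
from transformers import AutoImageProcessor, DINOv3ViTBackbone

model_id = "facebook/dinov3-vitb16"  # placeholder id, substitute a real DINOv3 ViT checkpoint
processor = AutoImageProcessor.from_pretrained(model_id)
backbone = DINOv3ViTBackbone.from_pretrained(model_id, out_features=["stage6", "stage12"])

pixel_values = processor(images=image, return_tensors="pt").pixel_values
with torch.no_grad():
    outputs = backbone(pixel_values)

# One (batch, hidden_size, height // patch_size, width // patch_size) map per requested stage
# when reshape_hidden_states=True (the default).
for feature_map in outputs.feature_maps:
    print(feature_map.shape)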
@ -33,11 +33,12 @@ from transformers.models.llama.modeling_llama import LlamaMLP
|
||||
from transformers.models.pixtral.modeling_pixtral import PixtralAttention, rotate_half
|
||||
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPooling
|
||||
from ...modeling_outputs import BackboneOutput, BaseModelOutputWithPooling
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
from ...processing_utils import Unpack
|
||||
from ...pytorch_utils import compile_compatible_method_lru_cache
|
||||
from ...utils import TransformersKwargs, auto_docstring, logging
|
||||
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
|
||||
from ...utils.backbone_utils import BackboneMixin
|
||||
from ...utils.generic import check_model_inputs
|
||||
from .configuration_dinov3_vit import DINOv3ViTConfig
|
||||
|
||||
@ -417,10 +418,79 @@ class DINOv3ViTModel(DINOv3ViTPreTrainedModel):
|
||||
sequence_output = self.norm(hidden_states)
|
||||
pooled_output = sequence_output[:, 0, :]
|
||||
|
||||
return BaseModelOutputWithPooling(
|
||||
last_hidden_state=sequence_output,
|
||||
pooler_output=pooled_output,
|
||||
)
|
||||
return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output)
|
||||
|
||||
|
||||
__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel"]
|
||||
@auto_docstring
|
||||
class DINOv3ViTBackbone(DINOv3ViTPreTrainedModel, BackboneMixin):
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
super()._init_backbone(config)
|
||||
|
||||
self.embeddings = DINOv3ViTEmbeddings(config)
|
||||
self.rope_embeddings = DINOv3ViTRopePositionEmbedding(config)
|
||||
self.layer = nn.ModuleList([DINOv3ViTLayer(config) for _ in range(config.num_hidden_layers)])
|
||||
self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.embeddings.patch_embeddings
|
||||
|
||||
@check_model_inputs()
|
||||
@can_return_tuple
|
||||
def forward(
|
||||
self,
|
||||
pixel_values: torch.Tensor,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
**kwargs: Unpack[TransformersKwargs],
|
||||
) -> BackboneOutput:
|
||||
pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype)
|
||||
hidden_states = self.embeddings(pixel_values)
|
||||
position_embeddings = self.rope_embeddings(pixel_values)
|
||||
|
||||
stage_hidden_states: list[torch.Tensor] = [hidden_states]
|
||||
|
||||
for layer_module in self.layer:
|
||||
hidden_states = layer_module(hidden_states, position_embeddings=position_embeddings)
|
||||
stage_hidden_states.append(hidden_states)
|
||||
|
||||
batch_size, _, image_height, image_width = pixel_values.shape
|
||||
patch_size = self.config.patch_size
|
||||
num_patches_height = image_height // patch_size
|
||||
num_patches_width = image_width // patch_size
|
||||
|
||||
num_prefix = 1 + getattr(self.config, "num_register_tokens", 0)
|
||||
|
||||
feature_maps = []
|
||||
sequence_output = None
|
||||
last_stage_idx = len(self.stage_names) - 1
|
||||
for idx, (stage_name, hidden_state) in enumerate(zip(self.stage_names, stage_hidden_states)):
|
||||
if idx == last_stage_idx:
|
||||
hidden_state = self.norm(hidden_state)
|
||||
sequence_output = hidden_state
|
||||
elif self.config.apply_layernorm:
|
||||
hidden_state = self.norm(hidden_state)
|
||||
|
||||
if stage_name in self.out_features:
|
||||
patch_tokens = hidden_state[:, num_prefix:, :]
|
||||
if self.config.reshape_hidden_states:
|
||||
fmap = (
|
||||
patch_tokens.reshape(batch_size, num_patches_height, num_patches_width, patch_tokens.shape[-1])
|
||||
.permute(0, 3, 1, 2)
|
||||
.contiguous()
|
||||
)
|
||||
else:
|
||||
fmap = patch_tokens
|
||||
|
||||
feature_maps.append(fmap)
|
||||
|
||||
output = BackboneOutput(feature_maps=tuple(feature_maps))
|
||||
output.last_hidden_state = sequence_output
|
||||
|
||||
return output
|
||||
|
||||
|
||||
__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel", "DINOv3ViTBackbone"]
|
||||
|
||||
@ -227,7 +227,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model and processor to the 🤗 hub.",
|
||||
help="Whether or not to push the converted model and processor to the Hugging Face hub.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -600,7 +600,7 @@ class EdgeTamPromptEncoder(nn.Module):
|
||||
|
||||
def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
|
||||
"""Embeds box prompts."""
|
||||
boxes += 0.5 # Shift to center of pixel
|
||||
boxes = boxes + 0.5 # Shift to center of pixel
|
||||
coords = boxes.view(*boxes.shape[:2], 2, 2)
|
||||
# add padding point for consistency with the original implementation
|
||||
coords = torch.nn.functional.pad(coords, (0, 0, 0, 1), mode="constant", value=0)
|
||||
|
||||
@ -1644,7 +1644,7 @@ class EdgeTamVideoPromptEncoder(nn.Module):
|
||||
|
||||
def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
|
||||
"""Embeds box prompts."""
|
||||
boxes += 0.5 # Shift to center of pixel
|
||||
boxes = boxes + 0.5 # Shift to center of pixel
|
||||
coords = boxes.view(*boxes.shape[:2], 2, 2)
|
||||
# add padding point for consistency with the original implementation
|
||||
coords = torch.nn.functional.pad(coords, (0, 0, 0, 1), mode="constant", value=0)
|
||||
|
||||
@ -352,7 +352,7 @@ if __name__ == "__main__":
|
||||
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
|
||||
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -316,7 +316,7 @@ def convert_esm_checkpoint_to_pytorch(
|
||||
hf_tokens = hf_tokenizer([row[1] for row in sample_data], return_tensors="pt", padding=True)
|
||||
success = torch.all(hf_tokens["input_ids"] == batch_tokens)
|
||||
|
||||
print("Do both models tokenizers output the same tokens?", "🔥" if success else "💩")
|
||||
print("Do both models tokenizers output the same tokens?", "[PASS]" if success else "[FAIL]")
|
||||
if not success:
|
||||
raise Exception("Tokenization does not match!")
|
||||
|
||||
@ -348,7 +348,7 @@ def convert_esm_checkpoint_to_pytorch(
|
||||
success = torch.allclose(our_output, their_output, atol=1e-5)
|
||||
|
||||
print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-5
|
||||
print("Do both models output the same tensors?", "🔥" if success else "💩")
|
||||
print("Do both models output the same tensors?", "[PASS]" if success else "[FAIL]")
|
||||
|
||||
if not success:
|
||||
raise Exception("Something went wRoNg")
|
||||
@ -362,7 +362,7 @@ def convert_esm_checkpoint_to_pytorch(
|
||||
|
||||
print("Contact prediction testing:")
|
||||
print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-5
|
||||
print("Do both models output the same tensors?", "🔥" if success else "💩")
|
||||
print("Do both models output the same tensors?", "[PASS]" if success else "[FAIL]")
|
||||
|
||||
if not success:
|
||||
raise Exception("Something went wRoNg")
|
||||
|
||||
@ -198,7 +198,7 @@ if __name__ == "__main__":
|
||||
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
|
||||
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -122,7 +122,7 @@ if __name__ == "__main__":
|
||||
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
|
||||
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -89,7 +89,7 @@ if __name__ == "__main__":
|
||||
help="Path to the output `FastSpeech2ConformerModel` PyTorch model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
|
||||
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -250,7 +250,9 @@ def main():
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
write_tokenizer(
|
||||
|
||||
@ -26,6 +26,7 @@ from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
|
||||
from ...generation import GenerationMixin
|
||||
from ...masking_utils import create_causal_mask
|
||||
from ...modeling_flash_attention_utils import is_flash_attn_available
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
CausalLMOutputWithCrossAttentions,
|
||||
@ -266,7 +267,7 @@ class GPTBigCodeMLP(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class GPTBigCodeBlock(nn.Module):
|
||||
class GPTBigCodeBlock(GradientCheckpointingLayer):
|
||||
def __init__(self, config, layer_idx=None):
|
||||
super().__init__()
|
||||
hidden_size = config.hidden_size
|
||||
@ -291,9 +292,9 @@ class GPTBigCodeBlock(nn.Module):
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: Optional[tuple[torch.Tensor]],
|
||||
encoder_hidden_states: Optional[torch.Tensor] = None,
|
||||
layer_past: Optional[Cache] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
encoder_hidden_states: Optional[torch.Tensor] = None,
|
||||
encoder_attention_mask: Optional[torch.Tensor] = None,
|
||||
use_cache: Optional[bool] = False,
|
||||
output_attentions: Optional[bool] = False,
|
||||
@ -536,10 +537,10 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
outputs = block(
|
||||
hidden_states,
|
||||
past_key_values,
|
||||
causal_mask,
|
||||
hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
layer_past=past_key_values, # as keyword argument so it can be removed by GradientCheckpointingLayer
|
||||
attention_mask=causal_mask,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
|
||||
@ -481,7 +481,9 @@ if __name__ == "__main__":
|
||||
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verify_logits", action="store_false", help="Whether or not to verify logits after conversion."
|
||||
|
||||
@ -210,7 +210,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model and processor to the 🤗 hub using the provided `model_name`.",
|
||||
help="Whether or not to push the converted model and processor to the Hugging Face hub using the provided `model_name`.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@ -353,7 +353,9 @@ if __name__ == "__main__":
|
||||
help="Whether or not to verify the logits against the original implementation.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push-to-hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push-to-hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-model",
|
||||
|
||||
@ -24,7 +24,7 @@ from torch import nn
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache, DynamicCache
|
||||
from ...generation import GenerationMixin
|
||||
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
|
||||
from ...masking_utils import create_bidirectional_mask
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutput, ModelOutput
|
||||
@ -506,13 +506,12 @@ class Idefics3VisionTransformer(Idefics3PreTrainedModel):
|
||||
hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
|
||||
|
||||
patch_attention_mask = patch_attention_mask.view(batch_size, -1)
|
||||
# The call to `_upad_input` in `_flash_attention_forward` is expensive
|
||||
# So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
|
||||
# avoiding passing the attention_mask, which is equivalent to attending to the full sequence
|
||||
if self.config._attn_implementation != "flash_attention_2":
|
||||
patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
|
||||
elif not torch.any(~patch_attention_mask):
|
||||
patch_attention_mask = None
|
||||
# Create the correct attention mask based on the attention implementation
|
||||
patch_attention_mask = create_bidirectional_mask(
|
||||
config=self.config,
|
||||
input_embeds=hidden_states,
|
||||
attention_mask=patch_attention_mask,
|
||||
)
|
||||
|
||||
encoder_outputs: BaseModelOutput = self.encoder(
|
||||
inputs_embeds=hidden_states,
|
||||
|
||||
@ -253,7 +253,7 @@ def main():
|
||||
parser.add_argument(
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the model to the 🤗 Hub.",
|
||||
help="Whether or not to push the model to the Hugging Face Hub.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verify_logits", action="store_false", help="Whether or not to verify logits after conversion."
|
||||
|
||||
@ -433,7 +433,9 @@ def main():
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
write_tokenizer(
|
||||
|
||||
@ -387,7 +387,9 @@ if __name__ == "__main__":
|
||||
"--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model directory."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@ -267,7 +267,9 @@ if __name__ == "__main__":
|
||||
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@ -379,7 +379,9 @@ if __name__ == "__main__":
|
||||
"--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model directory."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
|
||||
"--push_to_hub",
|
||||
action="store_true",
|
||||
help="Whether or not to push the converted model to the Hugging Face hub.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||

@ -826,7 +826,7 @@ class OriginalMask2FormerCheckpointToOursConverter:
checkpoints: list[Path] = checkpoints_dir.glob("**/*.pkl")

for checkpoint in checkpoints:
logger.info(f"💪 Converting {checkpoint.stem}")
logger.info(f"Converting {checkpoint.stem}")
# find associated config file

# dataset_name e.g 'coco'

@ -902,7 +902,7 @@ def test(
"The predicted masks are not the same."
)

logger.info("✅ Test passed!")
logger.info("Test passed!")


def get_model_name(checkpoint_file: Path):

@ -1012,9 +1012,9 @@ if __name__ == "__main__":
if model_name in high_tolerance_models:
tolerance = 3e-1

logger.info(f"🪄 Testing {model_name}...")
logger.info(f"Testing {model_name}...")
test(original_model, mask2former_for_segmentation, image_processor, tolerance)
logger.info(f"🪄 Pushing {model_name} to hub...")
logger.info(f"Pushing {model_name} to hub...")

image_processor.push_to_hub(model_name)
mask2former_for_segmentation.push_to_hub(model_name)

@ -548,7 +548,7 @@ class OriginalMaskFormerCheckpointToOursConverter:
checkpoints: list[Path] = checkpoints_dir.glob("**/*.pkl")

for checkpoint in checkpoints:
logger.info(f"💪 Converting {checkpoint.stem}")
logger.info(f"Converting {checkpoint.stem}")
# find associated config file
config: Path = config_dir / checkpoint.parents[0].stem / "swin" / f"{checkpoint.stem}.yaml"

@ -607,7 +607,7 @@ def test(original_model, our_model: MaskFormerForInstanceSegmentation, image_pro
"The segmentation image is not the same."
)

logger.info("✅ Test passed!")
logger.info("Test passed!")


def get_name(checkpoint_file: Path):

@ -715,7 +715,7 @@ if __name__ == "__main__":
test(original_model, mask_former_for_instance_segmentation, image_processor)

model_name = get_name(checkpoint_file)
logger.info(f"🪄 Saving {model_name}")
logger.info(f"Saving {model_name}")

image_processor.save_pretrained(save_directory / model_name)
mask_former_for_instance_segmentation.save_pretrained(save_directory / model_name)

@ -391,7 +391,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -334,7 +334,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -349,10 +349,10 @@ def verify_conversion(

# Check if they're close
if orig_logits.shape == hf_logits.shape and torch.allclose(orig_logits, hf_logits, atol=1e-4):
print("✅ Conversion verified! Outputs match.")
print("[SUCCESS] Conversion verified! Outputs match.")
return True
else:
print("❌ Conversion failed! Outputs don't match.")
print("[FAIL] Conversion failed! Outputs don't match.")
if orig_logits.numel() > 0 and hf_logits.numel() > 0:
print(f"Max difference: {(orig_logits - hf_logits).abs().max()}")
return False

@ -365,9 +365,9 @@ def push_to_hub(hf_model: MetaClip2Model, processor: CLIPProcessor, repo_name: s
try:
hf_model.push_to_hub(repo_name)
processor.push_to_hub(repo_name)
print(f"✅ Successfully pushed to {repo_name}")
print(f"[SUCCESS] Successfully pushed to {repo_name}")
except Exception as e:
print(f"❌ Failed to push to hub: {e}")
print(f"[FAIL] Failed to push to hub: {e}")


def main():
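
The MetaCLIP 2 hunks above also swap the emoji status markers for plain `[SUCCESS]`/`[FAIL]` tags; the verification logic itself is unchanged. As a hedged, standalone sketch of that check (the helper name is illustrative, not from the script):

```python
import torch


def logits_match(orig_logits: torch.Tensor, hf_logits: torch.Tensor, atol: float = 1e-4) -> bool:
    """Compare original and converted model outputs, mirroring the verify_conversion hunk above."""
    if orig_logits.shape == hf_logits.shape and torch.allclose(orig_logits, hf_logits, atol=atol):
        print("[SUCCESS] Conversion verified! Outputs match.")
        return True
    print("[FAIL] Conversion failed! Outputs don't match.")
    if orig_logits.numel() > 0 and hf_logits.numel() > 0:
        print(f"Max difference: {(orig_logits - hf_logits).abs().max()}")
    return False
```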

@ -186,7 +186,7 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
)

args = parser.parse_args()

@ -327,7 +327,9 @@ if __name__ == "__main__":
help="Whether to verify hidden_state against the original implementation.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -234,7 +234,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -336,7 +336,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -302,7 +302,9 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model directory."
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model to the Hugging Face hub.",
)

args = parser.parse_args()

@ -268,7 +268,7 @@ if __name__ == "__main__":
"--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
)

args = parser.parse_args()

@ -25,7 +25,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...masking_utils import create_bidirectional_mask, create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
BaseModelOutput,

@ -41,18 +41,11 @@ from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,
auto_docstring,
is_torch_flex_attn_available,
is_torchdynamo_compiling,
logging,
)
from .configuration_mt5 import MT5Config


if is_torch_flex_attn_available():
from torch.nn.attention.flex_attention import BlockMask

from ...integrations.flex_attention import make_flex_block_causal_mask

logger = logging.get_logger(__name__)

@ -735,40 +728,31 @@ class MT5Stack(MT5PreTrainedModel):
past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
)

if attention_mask is None and not is_torchdynamo_compiling():
# required mask seq length can be calculated via length of past cache
mask_seq_length = past_key_values_length + seq_length
attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

if self.config.is_decoder:
causal_mask = self._update_causal_mask(
attention_mask,
inputs_embeds,
cache_position,
past_key_values.self_attention_cache
attention_mask = create_causal_mask(
config=self.config,
input_embeds=inputs_embeds,
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values.self_attention_cache
if isinstance(past_key_values, EncoderDecoderCache)
else past_key_values,
output_attentions,
)
elif attention_mask is not None:
causal_mask = attention_mask[:, None, None, :]
causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min
else:
causal_mask = None
attention_mask = create_bidirectional_mask(
config=self.config,
input_embeds=inputs_embeds,
attention_mask=attention_mask,
)

# If a 2D or 3D attention mask is provided for the cross-attention
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
encoder_extended_attention_mask = None
if self.is_decoder and encoder_hidden_states is not None:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
if encoder_attention_mask is None:
encoder_attention_mask = torch.ones(
encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long
)
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
else:
encoder_extended_attention_mask = None
encoder_extended_attention_mask = create_bidirectional_mask(
config=self.config,
input_embeds=inputs_embeds,
attention_mask=encoder_attention_mask,
encoder_hidden_states=encoder_hidden_states,
)

all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None

@ -778,13 +762,13 @@ class MT5Stack(MT5PreTrainedModel):

hidden_states = self.dropout(inputs_embeds)

for i, layer_module in enumerate(self.block):
for layer_module in self.block:
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)

layer_outputs = layer_module(
hidden_states,
causal_mask,
attention_mask,
position_bias,
encoder_hidden_states,
encoder_extended_attention_mask,

@ -837,131 +821,6 @@ class MT5Stack(MT5PreTrainedModel):
cross_attentions=all_cross_attentions,
)

# Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask
def _update_causal_mask(
self,
attention_mask: Union[torch.Tensor, "BlockMask"],
input_tensor: torch.Tensor,
cache_position: torch.Tensor,
past_key_values: Cache,
output_attentions: bool = False,
):
if self.config._attn_implementation == "flash_attention_2":
if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
if self.config._attn_implementation == "flex_attention":
if isinstance(attention_mask, torch.Tensor):
attention_mask = make_flex_block_causal_mask(attention_mask)
return attention_mask

# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
# to infer the attention mask.
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

# When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
if AttentionMaskConverter._ignore_causal_mask_sdpa(
attention_mask,
inputs_embeds=input_tensor,
past_key_values_length=past_seen_tokens,
is_training=self.training,
):
return None

dtype = input_tensor.dtype
sequence_length = input_tensor.shape[1]
if using_compilable_cache:
target_length = past_key_values.get_max_cache_shape()
else:
target_length = (
attention_mask.shape[-1]
if isinstance(attention_mask, torch.Tensor)
else past_seen_tokens + sequence_length + 1
)

# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
attention_mask,
sequence_length=sequence_length,
target_length=target_length,
dtype=dtype,
cache_position=cache_position,
batch_size=input_tensor.shape[0],
)

if (
self.config._attn_implementation == "sdpa"
and attention_mask is not None
and attention_mask.device.type in ["cuda", "xpu", "npu"]
and not output_attentions
):
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
# Details: https://github.com/pytorch/pytorch/issues/110213
min_dtype = torch.finfo(dtype).min
causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

return causal_mask

@staticmethod
# Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
def _prepare_4d_causal_attention_mask_with_cache_position(
attention_mask: torch.Tensor,
sequence_length: int,
target_length: int,
dtype: torch.dtype,
cache_position: torch.Tensor,
batch_size: int,
**kwargs,
):
"""
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
attention_mask (`torch.Tensor`):
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
`(batch_size, 1, query_length, key_value_length)`.
sequence_length (`int`):
The sequence length being processed.
target_length (`int`):
The target length: when generating with static cache, the mask should be as long as the static cache,
to account for the 0 padding, the part of the cache that is not filled yet.
dtype (`torch.dtype`):
The dtype to use for the 4D attention mask.
cache_position (`torch.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`torch.Tensor`):
Batch size.
"""
if attention_mask is not None and attention_mask.dim() == 4:
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
causal_mask = attention_mask
else:
min_dtype = torch.finfo(dtype).min
causal_mask = torch.full(
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
mask_length = attention_mask.shape[-1]
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
causal_mask.device
)
padding_mask = padding_mask == 0
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
padding_mask, min_dtype
)

return causal_mask


@auto_docstring
class MT5Model(MT5PreTrainedModel):
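
The mT5 hunks above drop the module-local mask plumbing (the GPT-J-copied `_update_causal_mask` and `_prepare_4d_causal_attention_mask_with_cache_position`) in favor of the shared helpers imported from `masking_utils`. A rough, illustrative sketch of how those helpers are invoked, assuming the `create_causal_mask` / `create_bidirectional_mask` utilities shown in the import hunk (the `build_masks` wrapper is hypothetical; the real code inlines these calls in `MT5Stack.forward` and unwraps `EncoderDecoderCache` before passing `past_key_values`):

```python
from transformers.masking_utils import create_bidirectional_mask, create_causal_mask


def build_masks(config, inputs_embeds, attention_mask, cache_position, past_key_values,
                encoder_hidden_states=None, encoder_attention_mask=None, is_decoder=True):
    # Illustrative wrapper around the calls shown in the MT5Stack hunk above.
    if is_decoder:
        # Decoder self-attention: a causal mask built by the shared masking utility.
        attention_mask = create_causal_mask(
            config=config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
        )
    else:
        # Encoder self-attention: a plain bidirectional (padding-only) mask.
        attention_mask = create_bidirectional_mask(
            config=config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
        )

    # Cross-attention mask over the encoder states, replacing invert_attention_mask().
    encoder_extended_attention_mask = None
    if is_decoder and encoder_hidden_states is not None:
        encoder_extended_attention_mask = create_bidirectional_mask(
            config=config,
            input_embeds=inputs_embeds,
            attention_mask=encoder_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
        )
    return attention_mask, encoder_extended_attention_mask
```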

@ -221,7 +221,7 @@ if __name__ == "__main__":
help="Path to the output PyTorch model directory.",
)
parser.add_argument(
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
"--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub."
)
parser.add_argument(
"--device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda."

@ -254,7 +254,7 @@ if __name__ == "__main__":
"--push_to_hub",
default="musicgen-melody",
type=str,
help="Where to upload the converted model on the 🤗 hub.",
help="Where to upload the converted model on the Hugging Face hub.",
)
parser.add_argument(
"--device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda."

@ -275,7 +275,7 @@ if __name__ == "__main__":
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether or not to push the converted model and processor to the 🤗 hub.",
help="Whether or not to push the converted model and processor to the Hugging Face hub.",
)

args = parser.parse_args()

@ -23,9 +23,7 @@ from typing import Optional, Union

import numpy as np

from transformers.tokenization_utils_base import INIT_TOKENIZER_DOCSTRING
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers.utils import add_end_docstrings

from ...utils import is_levenshtein_available, is_nltk_available, logging, requires_backends

@ -40,16 +38,6 @@ if is_nltk_available():
logger = logging.get_logger(__name__)


INIT_TOKENIZER_DOCSTRING += """
tokenizer_object ([`tokenizers.Tokenizer`]):
A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
tokenizers](../fast_tokenizers) for more information.
tokenizer_file ([`str`]):
A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
tokenizers.
"""


VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}

@ -358,7 +346,6 @@ def remove_slice_from_lines(lines, clean_text, slice) -> str:
return to_delete.strip()


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class NougatTokenizerFast(PreTrainedTokenizerFast):
"""
Fast tokenizer for Nougat (backed by HuggingFace tokenizers library).

Some files were not shown because too many files have changed in this diff.