Compare commits

..

4 Commits

Author SHA1 Message Date
2be1dd7af2 no changes 2025-10-24 10:47:55 +02:00
a20a326408 Merge branch 'main' into test-bart-dummy 2025-10-24 10:31:20 +02:00
12b253e921 test 2025-10-23 20:23:42 +02:00
cecef75790 init 2025-10-23 20:17:07 +02:00
42 changed files with 486 additions and 642 deletions

View File

@ -28,9 +28,6 @@ on:
report_repo_id:
required: false
type: string
pytest_marker:
required: false
type: string
env:
HF_HOME: /mnt/cache
@ -140,7 +137,7 @@ jobs:
- name: Run all tests on GPU
working-directory: /transformers
run: |
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v -m '${{ inputs.pytest_marker }}' --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
ls -la
# Extract the exit code from the output file
EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)

View File

@ -1,60 +0,0 @@
name: Nvidia CI - Flash Attn
on:
repository_dispatch:
schedule:
- cron: "17 2 * * *"
push:
branches:
- run_nvidia_ci_flash_attn*
workflow_dispatch:
inputs:
prev_workflow_run_id:
description: 'previous workflow run id to compare'
type: string
required: false
default: ""
other_workflow_run_id:
description: 'other workflow run id to compare'
type: string
required: false
default: ""
# Used for `push` to easily modify the target workflow runs to compare against
env:
prev_workflow_run_id: ""
other_workflow_run_id: ""
jobs:
setup:
name: Setup
runs-on: ubuntu-22.04
steps:
- name: Setup
run: |
mkdir "setup_values"
echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: setup_values
path: setup_values
model-ci:
name: Model CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-flash-attn"
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
runner_type: "a10"
report_repo_id: hf-internal-testing/transformers_flash_attn_ci
commit_sha: ${{ github.sha }}
pytest_marker: "flash_attn_test or flash_attn_3_test"
secrets: inherit

View File

@ -38,10 +38,6 @@ on:
default: ""
required: false
type: string
pytest_marker:
required: false
type: string
env:
HF_HOME: /mnt/cache
@ -131,7 +127,6 @@ jobs:
commit_sha: ${{ inputs.commit_sha || github.sha }}
runner_type: ${{ inputs.runner_type }}
report_repo_id: ${{ inputs.report_repo_id }}
pytest_marker: ${{ inputs.pytest_marker }}
secrets: inherit
run_trainer_and_fsdp_gpu:

View File

@ -14,7 +14,7 @@ This AGENTS.md file provides guidance for code agents working with this codebase
- PRs should be as brief as possible. Bugfix PRs in particular can often be only one or two lines long, and do not need large comments, docstrings or new functions in this case. Aim to minimize the size of the diff.
- When writing tests, they should be added to an existing file. The only exception is for PRs to add a new model, when a new test directory should be created for that model.
- Code style is enforced in the CI. You can install the style tools with `pip install -e ".[quality]"`. You can then run `make fixup` to apply style and consistency fixes to your code.
- Code style is enforced in the CI. You can install the style tools with `pip install -e .[quality]`. You can then run `make fixup` to apply style and consistency fixes to your code.
## Copying and inheritance
@ -36,4 +36,4 @@ After making changes, you should usually run `make fixup` to ensure any copies a
the model you made the changes in and any other models that were updated by `make fixup`. Tests can be run with `pytest tests/models/[name]/test_modeling_[name].py`
If your changes affect code in other classes like tokenizers or processors, you should run those tests instead, like `test_processing_[name].py` or `test_tokenization_[name].py`.
In order to run tests, you may need to install dependencies. You can do this with `pip install -e ".[testing]"`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.
In order to run tests, you may need to install dependencies. You can do this with `pip install -e .[testing]`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.
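Taken together, the guidance above boils down to a short local loop; here is a condensed sketch of it (the `[name]` part is a placeholder for the model directory you touched):

```sh
# install the style and testing extras referenced above
pip install -e ".[quality]"
pip install -e ".[testing]"
pip install torch accelerate
# apply style and consistency fixes, then run the relevant model tests
make fixup
pytest tests/models/[name]/test_modeling_[name].py
```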

View File

@ -9,12 +9,6 @@ In this list, we showcase incredibly impactful and novel projects that have push
adding other projects to the list. If you believe a project should be here and it's not, then please, open a PR
to add it.
## [◉ Universal Intelligence](https://github.com/blueraai/universal-intelligence)
[Universal Intelligence](https://github.com/blueraai/universal-intelligence) aims to standardize models, tools, and agents —transforming them into simple, composable, portable, interoperable, framework-agnostic, hardware-agnostic interfaces (through auto-negotiation and resource sharing); for fast and accessible development of AI applications.
Keywords: Protocol, Open-source, LLMs, Large Language Models, Agents, Low-code
## [gpt4all](https://github.com/nomic-ai/gpt4all)
[gpt4all](https://github.com/nomic-ai/gpt4all) is an ecosystem of open-source chatbots trained on massive collections of clean assistant data including code, stories and dialogue. It offers open-source, large language models such as LLaMA and GPT-J trained in an assistant-style.

View File

@ -1,235 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
---
# Brainstorm
## Persona
A model developer who wants to evaluate their model implementation on a dataset, or a model "trainer" who wants to run inference for their GRPO policy.
Prerequisites to understand the docs:
- knows what a KV cache is
- familiarity with transformers and inference
## What we want to include in the doc
- [ ] CB usage examples
- [ ] CB API reference
- [x] light refresher on what CB is + links to blog post
- [x] installation / setup instructions
- [x] open telemetry support
- [ ] subsection in Transformers > Inference
- [x] supported & unsupported features
- [ ] performance considerations
- [ ] note on benchmarks (CI + space)
- [ ] cuda graphs
- [ ] compile
- [ ] attn impl
- [x] explicit intended use cases, the why of CB in transformers
- [x] integration with serving
---
# Continuous Batching
Continuous Batching (CB) is an advanced technique to optimize the inference of transformer models by dynamically grouping multiple requests into batches. This approach maximizes GPU utilization and throughput, particularly for workloads with many variable-length inputs.
We are particularly interested in having Continuous Batching in transformers for the following use cases:
- Evaluation of models on large datasets with variable-length inputs
- Generating outputs for multiple sequences for GRPO policies
CB is what makes inference engines like vLLM or SGLang efficient. That being said, transformers does not aim to be a production-ready inference engine, but a complete framework for model development. For this reason, CB is available in `transformers serve`.
If you are not familiar with some of the core concepts CB is built upon, we invite you to read the associated blog post: [Continuous Batching: Efficient Inference for Large Language Models](https://huggingface.co/blog/continuous-batching). _broken link for now_
## Installation
Nothing to do, it comes built-in with `transformers`! :nice:
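Concretely, a regular install (or upgrade) of `transformers` is all that is required; no additional packages are needed for CB:

```sh
pip install -U transformers
```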
## API Reference
## Usage Examples
The main way to use CB in transformers is via the `generate_batch` method.
Unlike `generate`, CB takes already tokenized inputs, known as input IDs. Each sequence of input IDs is represented as a list of integers, in Python: `list[int]`.
For a more detailed example, please refer to: [examples/continuous_batching](./path/to/example)
### `generate_batch` example
We have created a `ContinuousMixin` that is inherited by `GenerationMixin` so that all autoregressive text models support CB.
This adds the `generate_batch` method to all models that inherit from `GenerationMixin`.
You can use it as follows:
```py
import datasets
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    attn_implementation="sdpa_paged",
    device_map="cuda",  # if you need cuda
    dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")

# prepare a batch of inputs
dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
dataset = dataset.select(range(32))  # number of samples to run; tune as needed
tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]

generation_config = GenerationConfig(
    max_new_tokens=32,
    use_cuda_graph=False,  # not supported for the simple version
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=False,
    max_batch_tokens=512,  # max number of tokens in a batch; a default value you should tune based on your hardware
)

batch_outputs = model.generate_batch(
    inputs=simple_batch_inputs,
    generation_config=generation_config,
)
for request_id, output in batch_outputs.items():
    generated_text = tokenizer.decode(output.generated_tokens, skip_special_tokens=True)
    print(f"Request {request_id} output: {generated_text}")
```
### `ContinuousBatchingManager` example
If you want more control w.r.t. how you want to schedule requests using CB, you can use the `ContinuousBatchingManager` class directly.
This is what we use in `transformers serve` because requests arrive asynchronously and we can leverage the asynchronous nature of the CB process to make things more efficient.
Under the hood, the `ContinuousBatchingManager` creates a background thread that receives inputs from a python `queue.Queue` which it uses to get requests to batch in each forward pass.
Note that the manager is thread safe!
```py
import datasets
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from transformers.generation.continuous_batching import RequestStatus

MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    attn_implementation="sdpa_paged",
    device_map="cuda",  # if you need cuda
    dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")

# prepare a batch of inputs
dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
dataset = dataset.select(range(32))  # number of samples to run; tune as needed
tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]

# same configuration as in the `generate_batch` example
generation_config = GenerationConfig(
    max_new_tokens=32,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=False,
    max_batch_tokens=512,
)

# initialize the manager, a method available thanks to the `ContinuousMixin`
manager = model.init_continuous_batching(generation_config=generation_config)
# start the background thread
manager.start()

# this is for demonstration purposes only; in practice this is most useful to do concurrently
for i, input_ids in enumerate(simple_batch_inputs):
    request_id = manager.add_request(input_ids=input_ids, request_id=f"request_{i}")  # if you do not specify a request_id, one will be generated for you

# can be done in another thread
for id, request in manager.get_result():
    generated_text = tokenizer.decode(request.generated_tokens, skip_special_tokens=True)
    print(f"Request {id} output: {generated_text}")

# you can also get results for a specific request id
result = manager.get_result(request_id="request_5")  # this is blocking and will wait for the result to be ready

# or get results for a request that is streaming
manager.add_request(
    input_ids=input_ids,
    request_id="streaming_request",
    stream=True,
)
for chunk in manager.request_id_iter(request_id="streaming_request"):
    generated_text = tokenizer.decode(chunk.generated_tokens, skip_special_tokens=True)
    print(generated_text)
    # FIXME: stop iteration in `request_id_iter` when finished instead of doing it externally
    if chunk.status == RequestStatus.FINISHED:
        break

# stop the background thread before exiting the process
manager.stop()
```
## Supported & Unsupported Features
### Supported Features
- Dynamic scheduling of variable-length requests
- Chunked prefill
- Paged Attention Cache
- Sliding window attention
- Chat templates
### Unsupported Features
- Prefix caching
- Beam search
- Speculative decoding (including assisted decoding)
- Tool calling
- MTP (multi-token prediction)
- Medusa
- Anything related to `do_sample`
Note that these aren't supported at the moment. Some features, such as prefix caching, beam search, and tool calling, are on our roadmap; others will be best effort!
Please do let us know if you'd like to see support for any of these features!
## Performance Considerations
## Integration with Serving
You can use CB in `transformers serve` by passing the `--continuous-batching` flag when starting the server.
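For example, the server can be started with CB enabled like this (other serving options stay the same):

```sh
transformers serve --continuous-batching
```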
## Monitoring
We have added OpenTelemetry support to Continuous Batching to help you monitor its performance in production. To enable it, you need to install the `open-telemetry` extra when installing `transformers`:
```sh
# this installs `opentelemetry-api`, `opentelemetry-sdk` and `opentelemetry-exporter-otlp`
pip install transformers[open-telemetry]
```
This will enable traces and metrics collection in CB. You will then have to set up a backend to collect and visualize the traces and metrics.
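As a sketch of one possible setup (the collector image and ports below are the standard OTLP defaults, not something specific to CB), you can run any OTLP-compatible collector and point the OpenTelemetry SDK at it via the standard environment variable before starting your workload or `transformers serve`:

```sh
# run an OTLP-compatible collector, here the upstream OpenTelemetry Collector image
docker run --rm -p 4317:4317 -p 4318:4318 otel/opentelemetry-collector:latest
# standard endpoint variable picked up by the OpenTelemetry SDK exporters
export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4317"
```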

View File

@ -38,7 +38,7 @@ pip install transformers[dev]
or for an editable install:
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
inside the Transformers repo. Since the number of optional dependencies of Transformers has grown a lot, it's possible you don't manage to get all of them. If the dev install fails, make sure to install PyTorch then do
@ -50,7 +50,7 @@ pip install transformers[quality]
or for an editable install:
```bash
pip install -e ".[quality]"
pip install -e .[quality]
```
## Tests

View File

@ -37,7 +37,7 @@ pip install transformers[dev]
or an editable install:
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
inside the Transformers repo.

View File

@ -37,7 +37,7 @@ pip install transformers[dev]
or an editable install:
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
inside the Transformers repo.

View File

@ -40,7 +40,7 @@ pip install transformers[dev]
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
inside the Transformers repo. Since the number of optional dependencies of Transformers has grown a lot, it's possible you don't manage to get all of them. If the dev install fails, install the deep learning framework you are working with (PyTorch, TensorFlow and/or Flax) and run the following steps.
@ -53,7 +53,7 @@ pip install transformers[quality]
or for an editable install:
```bash
pip install -e ".[quality]"
pip install -e .[quality]
```
## Tests

View File

@ -37,7 +37,7 @@ pip install transformers[dev]
or an editable install inside the Transformers repo:
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
Since the number of optional dependencies of Transformers has grown a lot, the dev install may fail. If it does, install the deep learning framework you are working with (PyTorch, TensorFlow and/or Flax) and run the following command.
@ -49,7 +49,7 @@ pip install transformers[quality]
For an editable install, run the following command.
```bash
pip install -e ".[quality]"
pip install -e .[quality]
```

View File

@ -392,7 +392,6 @@ extras["torchhub"] = deps_list(
extras["benchmark"] = deps_list("optimum-benchmark")
# OpenTelemetry dependencies for metrics collection in continuous batching
# TODO: refactor this to split API and SDK; SDK and exporter should only be needed to run code that collects metrics whereas API is what people will need to instrument their code and handle exporter themselves
extras["open-telemetry"] = deps_list("opentelemetry-api") + ["opentelemetry-exporter-otlp", "opentelemetry-sdk"]
# when modifying the following list, make sure to update src/transformers/dependency_versions_check.py

View File

@ -919,7 +919,6 @@ class ContinuousBatchingManager:
if result is not None:
yield result
# FIXME: stop iteration when request status is finished?
def request_id_iter(self, request_id: str) -> Generator[GenerationOutput]:
"""Iterate over results matching a specific request id as they become available."""
request_cancelled = False

View File

@ -628,7 +628,7 @@ def maybe_load_adapters(
**adapter_kwargs,
):
if pretrained_model_name_or_path is None or not is_peft_available():
return None, pretrained_model_name_or_path, adapter_kwargs
return None, pretrained_model_name_or_path
token = download_kwargs.get("token")
@ -670,4 +670,4 @@ def maybe_load_adapters(
_adapter_model_path = pretrained_model_name_or_path
pretrained_model_name_or_path = json.load(f)["base_model_name_or_path"]
return _adapter_model_path, pretrained_model_name_or_path, adapter_kwargs
return _adapter_model_path, pretrained_model_name_or_path

View File

@ -4353,7 +4353,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
if adapter_kwargs is None:
adapter_kwargs = {}
_adapter_model_path, pretrained_model_name_or_path, adapter_kwargs = maybe_load_adapters(
_adapter_model_path, pretrained_model_name_or_path = maybe_load_adapters(
pretrained_model_name_or_path,
download_kwargs_with_commit,
**adapter_kwargs,

View File

@ -538,12 +538,12 @@ class BartEncoder(BartPreTrainedModel):
self.max_source_positions = config.max_position_embeddings
embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_tokens = BartScaledWordEmbedding(
config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
)
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = BartScaledWordEmbedding(
config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
)
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = BartLearnedPositionalEmbedding(
config.max_position_embeddings,
@ -682,12 +682,12 @@ class BartDecoder(BartPreTrainedModel):
self.max_target_positions = config.max_position_embeddings
embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = BartScaledWordEmbedding(
config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
)
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = BartScaledWordEmbedding(
config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
)
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = BartLearnedPositionalEmbedding(
config.max_position_embeddings,

View File

@ -22,7 +22,7 @@ import torch
from torch import nn
from ...activations import ACT2FN
from ...masking_utils import create_causal_mask
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
@ -310,6 +310,7 @@ class CLIPAttention(nn.Module):
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel"""
@ -323,6 +324,15 @@ class CLIPAttention(nn.Module):
queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
# CLIP text model uses both `causal_attention_mask` and `attention_mask`
# in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
if self.config._attn_implementation == "flash_attention_2":
self.is_causal = causal_attention_mask is not None
else:
if attention_mask is not None and causal_attention_mask is not None:
attention_mask = attention_mask + causal_attention_mask
elif causal_attention_mask is not None:
attention_mask = causal_attention_mask
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
@ -334,12 +344,13 @@ class CLIPAttention(nn.Module):
keys,
values,
attention_mask,
is_causal=self.is_causal,
scaling=self.scale,
dropout=0.0 if not self.training else self.dropout,
**kwargs,
)
attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
attn_output = self.out_proj(attn_output)
return attn_output, attn_weights
@ -373,14 +384,16 @@ class CLIPEncoderLayer(GradientCheckpointingLayer):
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
**kwargs: Unpack[TransformersKwargs],
) -> torch.FloatTensor:
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, _ = self.self_attn(
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
**kwargs,
)
hidden_states = residual + hidden_states
@ -484,6 +497,7 @@ class CLIPEncoder(nn.Module):
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutput:
r"""
@ -498,6 +512,13 @@ class CLIPEncoder(nn.Module):
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Causal mask for the text model. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
"""
hidden_states = inputs_embeds
@ -505,6 +526,7 @@ class CLIPEncoder(nn.Module):
hidden_states = encoder_layer(
hidden_states,
attention_mask,
causal_attention_mask,
**kwargs,
)
@ -541,19 +563,17 @@ class CLIPTextTransformer(nn.Module):
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
attention_mask = create_causal_mask(
config=self.config,
input_embeds=hidden_states,
attention_mask=attention_mask,
cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device),
past_key_values=None,
causal_attention_mask = _create_4d_causal_attention_mask(
input_shape, hidden_states.dtype, device=hidden_states.device
)
kwargs.pop("is_causal", None)
if attention_mask is not None and self.config._attn_implementation != "flash_attention_2":
attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
encoder_outputs: BaseModelOutput = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
is_causal=True,
causal_attention_mask=causal_attention_mask,
**kwargs,
)
@ -598,6 +618,7 @@ class CLIPTextModel(CLIPPreTrainedModel):
input_modalities = "text"
_no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]
_supports_flash_attn = False # mask creation only accounts for sdpa/eager
def __init__(self, config: CLIPTextConfig):
super().__init__(config)
@ -611,7 +632,8 @@ class CLIPTextModel(CLIPPreTrainedModel):
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@check_model_inputs(tie_last_hidden_states=False)
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,
@ -704,6 +726,7 @@ class CLIPVisionModel(CLIPPreTrainedModel):
return self.vision_model.embeddings.patch_embedding
@check_model_inputs(tie_last_hidden_states=False)
@can_return_tuple
@auto_docstring
def forward(
self,
@ -743,6 +766,7 @@ class CLIPVisionModel(CLIPPreTrainedModel):
class CLIPModel(CLIPPreTrainedModel):
config: CLIPConfig
_no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]
_supports_flash_attn = False # mask creation only accounts for sdpa/eager
def __init__(self, config: CLIPConfig):
super().__init__(config)
@ -942,6 +966,7 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
config: CLIPTextConfig
input_modalities = "text"
_supports_flash_attn = False
_no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]
def __init__(self, config: CLIPTextConfig):
@ -961,7 +986,8 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@check_model_inputs(tie_last_hidden_states=False)
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,
@ -1023,6 +1049,7 @@ class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
return self.vision_model.embeddings.patch_embedding
@check_model_inputs(tie_last_hidden_states=False)
@can_return_tuple
@auto_docstring
def forward(
self,
@ -1090,7 +1117,8 @@ class CLIPForImageClassification(CLIPPreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@check_model_inputs(tie_last_hidden_states=False)
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,

View File

@ -1392,7 +1392,7 @@ class Emu3Model(Emu3PreTrainedModel):
image_features = torch.split(image_features, split_sizes)
return image_features
@torch.no_grad()
@torch.no_grad
def decode_image_tokens(self, image_tokens: torch.LongTensor, height: int, width: int):
"""
Decodes generated image tokens from language model to continuous pixel values

View File

@ -946,7 +946,7 @@ class Emu3Model(Emu3PreTrainedModel):
image_features = torch.split(image_features, split_sizes)
return image_features
@torch.no_grad()
@torch.no_grad
def decode_image_tokens(self, image_tokens: torch.LongTensor, height: int, width: int):
"""
Decodes generated image tokens from language model to continuous pixel values

View File

@ -1283,7 +1283,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
decoded_image = decoded_image.permute(0, 2, 3, 1)
return decoded_image
@torch.no_grad()
@torch.no_grad
def generate(
self,
inputs: Optional[torch.Tensor] = None,

View File

@ -1099,7 +1099,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin):
decoded_image = decoded_image.permute(0, 2, 3, 1)
return decoded_image
@torch.no_grad()
@torch.no_grad
def generate(
self,
inputs: Optional[torch.Tensor] = None,

View File

@ -12,7 +12,7 @@ import torch
from torch import nn
from ...activations import ACT2FN
from ...masking_utils import create_causal_mask
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
@ -200,6 +200,7 @@ class MetaClip2Attention(nn.Module):
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel"""
@ -213,6 +214,15 @@ class MetaClip2Attention(nn.Module):
queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
# METACLIP_2 text model uses both `causal_attention_mask` and `attention_mask`
# in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
if self.config._attn_implementation == "flash_attention_2":
self.is_causal = causal_attention_mask is not None
else:
if attention_mask is not None and causal_attention_mask is not None:
attention_mask = attention_mask + causal_attention_mask
elif causal_attention_mask is not None:
attention_mask = causal_attention_mask
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
@ -224,12 +234,13 @@ class MetaClip2Attention(nn.Module):
keys,
values,
attention_mask,
is_causal=self.is_causal,
scaling=self.scale,
dropout=0.0 if not self.training else self.dropout,
**kwargs,
)
attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
attn_output = self.out_proj(attn_output)
return attn_output, attn_weights
@ -263,14 +274,16 @@ class MetaClip2EncoderLayer(GradientCheckpointingLayer):
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
**kwargs: Unpack[TransformersKwargs],
) -> torch.FloatTensor:
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, _ = self.self_attn(
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
**kwargs,
)
hidden_states = residual + hidden_states
@ -374,6 +387,7 @@ class MetaClip2Encoder(nn.Module):
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutput:
r"""
@ -388,6 +402,13 @@ class MetaClip2Encoder(nn.Module):
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Causal mask for the text model. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
"""
hidden_states = inputs_embeds
@ -395,6 +416,7 @@ class MetaClip2Encoder(nn.Module):
hidden_states = encoder_layer(
hidden_states,
attention_mask,
causal_attention_mask,
**kwargs,
)
@ -415,12 +437,14 @@ class MetaClip2TextTransformer(nn.Module):
# For `pooled_output` computation
self.eos_token_id = config.eos_token_id
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
input_ids,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutputWithPooling:
input_shape = input_ids.size()
@ -428,19 +452,21 @@ class MetaClip2TextTransformer(nn.Module):
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
attention_mask = create_causal_mask(
config=self.config,
input_embeds=hidden_states,
attention_mask=attention_mask,
cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device),
past_key_values=None,
# CLIP's text model uses causal mask, prepare it here.
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
causal_attention_mask = _create_4d_causal_attention_mask(
input_shape, hidden_states.dtype, device=hidden_states.device
)
kwargs.pop("is_causal", None)
# expand attention_mask
if attention_mask is not None and self.config._attn_implementation != "flash_attention_2":
# [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
encoder_outputs: BaseModelOutput = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
is_causal=True,
causal_attention_mask=causal_attention_mask,
**kwargs,
)
@ -501,6 +527,7 @@ class MetaClip2TextModel(MetaClip2PreTrainedModel):
input_modalities = "text"
_no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"]
_supports_flash_attn = False # mask creation only accounts for sdpa/eager
def __init__(self, config: MetaClip2TextConfig):
super().__init__(config)
@ -514,13 +541,16 @@ class MetaClip2TextModel(MetaClip2PreTrainedModel):
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@check_model_inputs(tie_last_hidden_states=False)
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutputWithPooling:
r"""
@ -600,6 +630,7 @@ class MetaClip2TextModelWithProjection(MetaClip2PreTrainedModel):
config: MetaClip2TextConfig
input_modalities = "text"
_supports_flash_attn = False
_no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"]
def __init__(self, config: MetaClip2TextConfig):
@ -619,13 +650,16 @@ class MetaClip2TextModelWithProjection(MetaClip2PreTrainedModel):
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@check_model_inputs(tie_last_hidden_states=False)
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
**kwargs: Unpack[TransformersKwargs],
) -> MetaClip2TextModelOutput:
r"""
@ -758,6 +792,7 @@ class MetaClip2Model(MetaClip2PreTrainedModel):
config: MetaClip2Config
_no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer", "MetaClip2VisionEmbeddings"]
_supports_flash_attn = False # mask creation only accounts for sdpa/eager
def __init__(self, config: MetaClip2Config):
super().__init__(config)
@ -1043,7 +1078,7 @@ class MetaClip2VisionModel(MetaClip2PreTrainedModel):
return self.vision_model.embeddings.patch_embedding
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
@can_return_tuple
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
@ -1152,6 +1187,7 @@ class MetaClip2VisionModelWithProjection(MetaClip2PreTrainedModel):
return self.vision_model.embeddings.patch_embedding
@check_model_inputs(tie_last_hidden_states=False)
@can_return_tuple
@auto_docstring
def forward(
self,
@ -1218,7 +1254,8 @@ class MetaClip2ForImageClassification(MetaClip2PreTrainedModel):
# Initialize weights and apply final processing
self.post_init()
@check_model_inputs(tie_last_hidden_states=False)
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,

View File

@ -3,8 +3,9 @@ from typing import Optional
import torch
from torch import nn
from ...masking_utils import create_causal_mask
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.generic import check_model_inputs
@ -12,9 +13,9 @@ from ..clip.configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConf
from ..clip.modeling_clip import (
CLIPMLP,
CLIPAttention,
CLIPEncoderLayer,
CLIPForImageClassification,
CLIPModel,
CLIPPreTrainedModel,
CLIPTextEmbeddings,
CLIPTextModel,
CLIPTextModelWithProjection,
@ -213,9 +214,24 @@ class MetaClip2MLP(CLIPMLP):
pass
class MetaClip2EncoderLayer(CLIPEncoderLayer):
pass
@auto_docstring
class MetaClip2PreTrainedModel(CLIPPreTrainedModel):
class MetaClip2PreTrainedModel(PreTrainedModel):
config: MetaClip2Config
base_model_prefix = "metaclip_2"
input_modalities = ["image", "text"]
supports_gradient_checkpointing = True
_supports_sdpa = True
_supports_flash_attn = True
_supports_flex_attn = True
_supports_attention_backend = True
_can_record_outputs = {
"hidden_states": MetaClip2EncoderLayer,
"attentions": MetaClip2Attention,
}
def _init_weights(self, module):
"""Initialize the weights"""
@ -275,12 +291,14 @@ class MetaClip2PreTrainedModel(CLIPPreTrainedModel):
class MetaClip2TextTransformer(CLIPTextTransformer):
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
input_ids,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
use_cache: Optional[bool] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutputWithPooling:
input_shape = input_ids.size()
@ -288,19 +306,21 @@ class MetaClip2TextTransformer(CLIPTextTransformer):
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
attention_mask = create_causal_mask(
config=self.config,
input_embeds=hidden_states,
attention_mask=attention_mask,
cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device),
past_key_values=None,
# CLIP's text model uses causal mask, prepare it here.
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
causal_attention_mask = _create_4d_causal_attention_mask(
input_shape, hidden_states.dtype, device=hidden_states.device
)
kwargs.pop("is_causal", None)
# expand attention_mask
if attention_mask is not None and self.config._attn_implementation != "flash_attention_2":
# [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
encoder_outputs: BaseModelOutput = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
is_causal=True,
causal_attention_mask=causal_attention_mask,
**kwargs,
)
@ -352,13 +372,22 @@ class MetaClip2TextModel(CLIPTextModel):
>>> pooled_output = outputs.pooler_output # pooled (EOS token) states
```"""
@check_model_inputs(tie_last_hidden_states=False)
def __init__(self, config: MetaClip2TextConfig):
super().__init__(config)
self.text_model = MetaClip2TextTransformer(config)
# Initialize weights and apply final processing
self.post_init()
@check_model_inputs()
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
**kwargs: Unpack[TransformersKwargs],
):
r"""
@ -380,6 +409,8 @@ class MetaClip2TextModel(CLIPTextModel):
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
**kwargs,
)
@ -415,13 +446,24 @@ class MetaClip2TextModelWithProjection(CLIPTextModelWithProjection):
>>> text_embeds = outputs.text_embeds
```"""
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def __init__(self, config: MetaClip2TextConfig):
super().__init__(config)
text_model = MetaClip2TextModel._from_config(config)
self.text_model = text_model.text_model
self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
**kwargs: Unpack[TransformersKwargs],
):
r"""
@ -442,6 +484,8 @@ class MetaClip2TextModelWithProjection(CLIPTextModelWithProjection):
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
**kwargs,
)
@ -506,8 +550,6 @@ class MetaClip2Model(CLIPModel):
# Initialize weights and apply final processing
self.post_init()
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@ -652,7 +694,7 @@ class MetaClip2VisionModel(CLIPVisionModel):
```"""
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
@can_return_tuple
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
@ -722,8 +764,6 @@ class MetaClip2VisionModelWithProjection(CLIPVisionModelWithProjection):
>>> image_embeds = outputs.image_embeds
```"""
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,

View File

@ -25,12 +25,12 @@ import torch
import torch.nn as nn
from ...activations import ACT2FN
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, torch_int
from ...utils.generic import check_model_inputs
from .configuration_mlcd import MLCDVisionConfig
@ -259,7 +259,7 @@ class MLCDAttention(nn.Module):
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Input shape: Batch x Time x Channel"""
batch_size, seq_length = hidden_states.shape[:-1]
@ -316,7 +316,7 @@ class MLCDEncoderLayer(GradientCheckpointingLayer):
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
output_attentions: Optional[bool] = False,
) -> tuple[torch.FloatTensor]:
"""
Args:
@ -328,15 +328,18 @@ class MLCDEncoderLayer(GradientCheckpointingLayer):
Represents absolute positional embeddings for the query and key in the attention mechanism.
attention_mask (`torch.FloatTensor`):
Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, _ = self.self_attn(
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
**kwargs,
output_attentions=output_attentions,
)
hidden_states = residual + hidden_states
@ -345,7 +348,12 @@ class MLCDEncoderLayer(GradientCheckpointingLayer):
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class MLCDEncoder(nn.Module):
@ -369,7 +377,9 @@ class MLCDEncoder(nn.Module):
inputs_embeds: torch.FloatTensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutput]:
r"""
Args:
@ -385,18 +395,114 @@ class MLCDEncoder(nn.Module):
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_states = inputs_embeds
for encoder_layer in self.layers:
hidden_states = encoder_layer(
hidden_states,
position_embeddings,
attention_mask,
**kwargs,
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
layer_outputs = encoder_layer(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_states,
attentions=all_attentions,
)
class MLCDVisionTransformer(nn.Module):
def __init__(self, config: MLCDVisionConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = MLCDVisionEmbeddings(config)
self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.encoder = MLCDEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutputWithPooling]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
num_patches_height = pixel_values.shape[-2] // self.config.patch_size
num_patches_width = pixel_values.shape[-1] // self.config.patch_size
rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
position_embeddings = (emb.cos(), emb.sin())
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
position_embeddings=position_embeddings,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@ -405,15 +511,8 @@ class MLCDPreTrainedModel(PreTrainedModel):
config: MLCDVisionConfig
base_model_prefix = "mlcd"
supports_gradient_checkpointing = True
accepts_loss_kwargs = False
_supports_flash_attn = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_attention_backend = True
_can_record_outputs = {
"hidden_states": MLCDEncoderLayer,
"attentions": MLCDAttention,
}
def _init_weights(self, module):
"""Initialize the weights"""
@ -447,55 +546,6 @@ class MLCDPreTrainedModel(PreTrainedModel):
module.bias.data.zero_()
class MLCDVisionTransformer(nn.Module):
def __init__(self, config: MLCDVisionConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = MLCDVisionEmbeddings(config)
self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.encoder = MLCDEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> Union[tuple, BaseModelOutputWithPooling]:
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
num_patches_height = pixel_values.shape[-2] // self.config.patch_size
num_patches_width = pixel_values.shape[-1] // self.config.patch_size
rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
position_embeddings = (emb.cos(), emb.sin())
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
position_embeddings=position_embeddings,
**kwargs,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
)
@auto_docstring(
custom_intro="""
The vision model from M_L_C_D without any head or projection on top.
@ -516,12 +566,13 @@ class MLCDVisionModel(MLCDPreTrainedModel):
def get_input_embeddings(self) -> nn.Module:
return self.vision_model.embeddings.patch_embedding
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
**kwargs: Unpack[TransformersKwargs],
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutputWithPooling]:
r"""
Example:
@ -545,9 +596,17 @@ class MLCDVisionModel(MLCDPreTrainedModel):
>>> print(f"Number of attention layers: {len(outputs.attentions)}")
>>> print(f"Attention shape: {outputs.attentions[0].shape}")
```"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
return self.vision_model(
pixel_values=pixel_values,
**kwargs,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)

View File

@ -19,11 +19,11 @@ import torch
import torch.nn as nn
from ...configuration_utils import PreTrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, logging
from ...utils.generic import check_model_inputs
from ...utils import auto_docstring, logging
from ..clip.modeling_clip import (
CLIPMLP,
CLIPAttention,
@ -206,7 +206,7 @@ class MLCDAttention(CLIPAttention):
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
**kwargs: Unpack[FlashAttentionKwargs],
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
batch_size, seq_length = hidden_states.shape[:-1]
@ -258,7 +258,7 @@ class MLCDEncoderLayer(CLIPEncoderLayer):
hidden_states: torch.Tensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
output_attentions: Optional[bool] = False,
) -> tuple[torch.FloatTensor]:
"""
Args:
@ -270,15 +270,18 @@ class MLCDEncoderLayer(CLIPEncoderLayer):
Represents absolute positional embeddings for the query and key in the attention mechanism.
attention_mask (`torch.FloatTensor`):
Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, _ = self.self_attn(
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
**kwargs,
output_attentions=output_attentions,
)
hidden_states = residual + hidden_states
@ -287,7 +290,12 @@ class MLCDEncoderLayer(CLIPEncoderLayer):
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class MLCDEncoder(CLIPEncoder):
@ -308,7 +316,9 @@ class MLCDEncoder(CLIPEncoder):
inputs_embeds: torch.FloatTensor,
position_embeddings: tuple[torch.Tensor, torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutput]:
r"""
Args:
@ -324,18 +334,107 @@ class MLCDEncoder(CLIPEncoder):
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_states = inputs_embeds
for encoder_layer in self.layers:
hidden_states = encoder_layer(
hidden_states,
position_embeddings,
attention_mask,
**kwargs,
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
layer_outputs = encoder_layer(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
attention_mask=attention_mask,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_states,
attentions=all_attentions,
)
class MLCDVisionTransformer(CLIPVisionTransformer):
def __init__(self, config: MLCDVisionConfig):
super().__init__(config)
self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutputWithPooling]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
num_patches_height = pixel_values.shape[-2] // self.config.patch_size
num_patches_width = pixel_values.shape[-1] // self.config.patch_size
rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
position_embeddings = (emb.cos(), emb.sin())
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
position_embeddings=position_embeddings,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@ -344,15 +443,8 @@ class MLCDPreTrainedModel(PreTrainedModel):
config: MLCDVisionConfig
base_model_prefix = "mlcd"
supports_gradient_checkpointing = True
accepts_loss_kwargs = False
_supports_flash_attn = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_attention_backend = True
_can_record_outputs = {
"hidden_states": MLCDEncoderLayer,
"attentions": MLCDAttention,
}
def _init_weights(self, module):
"""Initialize the weights"""
@ -386,55 +478,14 @@ class MLCDPreTrainedModel(PreTrainedModel):
module.bias.data.zero_()
class MLCDVisionTransformer(CLIPVisionTransformer):
def __init__(self, config: MLCDVisionConfig):
super().__init__(config)
self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> Union[tuple, BaseModelOutputWithPooling]:
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
num_patches_height = pixel_values.shape[-2] // self.config.patch_size
num_patches_width = pixel_values.shape[-1] // self.config.patch_size
rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
position_embeddings = (emb.cos(), emb.sin())
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
position_embeddings=position_embeddings,
**kwargs,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
)
class MLCDVisionModel(CLIPVisionModel):
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
**kwargs: Unpack[TransformersKwargs],
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[tuple, BaseModelOutputWithPooling]:
r"""
Example:
@ -458,9 +509,17 @@ class MLCDVisionModel(CLIPVisionModel):
>>> print(f"Number of attention layers: {len(outputs.attentions)}")
>>> print(f"Attention shape: {outputs.attentions[0].shape}")
```"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
return self.vision_model(
pixel_values=pixel_values,
**kwargs,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
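
One side of the MLCD hunks above passes `output_attentions`, `output_hidden_states` and `return_dict` explicitly instead of routing everything through `**kwargs`. A minimal usage sketch assuming that explicit signature; the checkpoint name is the one used by the integration test further down, and the shapes in the comments are indicative only:

```python
# Minimal usage sketch for the explicit MLCD signature shown above.
import requests
import torch
from PIL import Image

from transformers import AutoProcessor, MLCDVisionModel

model_name = "DeepGlint-AI/mlcd-vit-bigG-patch14-448"
model = MLCDVisionModel.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    # With the explicit signature these flags are plain keyword arguments.
    outputs = model(**inputs, output_attentions=True, output_hidden_states=True)

print(outputs.last_hidden_state.shape)            # (batch, 1 + num_patches, hidden_size)
print(len(outputs.hidden_states), len(outputs.attentions))
```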

View File

@ -343,12 +343,12 @@ class PLBartEncoder(PLBartPreTrainedModel):
self.max_source_positions = config.max_position_embeddings
embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_tokens = PLBartScaledWordEmbedding(
config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
)
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = PLBartScaledWordEmbedding(
config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
)
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = PLBartLearnedPositionalEmbedding(
config.max_position_embeddings,
@ -595,12 +595,12 @@ class PLBartDecoder(PLBartPreTrainedModel):
self.max_target_positions = config.max_position_embeddings
embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = PLBartScaledWordEmbedding(
config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
)
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = PLBartScaledWordEmbedding(
config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
)
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = PLBartLearnedPositionalEmbedding(
config.max_position_embeddings,
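
The PLBart hunks above switch between two ways of sharing the word embedding with the parent model: reusing the passed-in module outright versus building a fresh `PLBartScaledWordEmbedding` and tying its `.weight` afterwards. A toy sketch of the reuse variant (class names here are made up, not the actual PLBart modules):

```python
# Toy sketch of the shared-embedding idiom: reuse the module handed down by the parent
# model if given, otherwise build a private scaled embedding. The other variant instead
# ties weights afterwards via `self.embed_tokens.weight = embed_tokens.weight`.
import math
from typing import Optional

import torch.nn as nn


class ScaledWordEmbedding(nn.Embedding):
    """nn.Embedding whose output is multiplied by a fixed scale."""

    def __init__(self, num_embeddings, embedding_dim, padding_idx, embed_scale=1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids):
        return super().forward(input_ids) * self.embed_scale


class ToyEncoder(nn.Module):
    def __init__(self, vocab_size, d_model, padding_idx, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__()
        embed_scale = math.sqrt(d_model)
        if embed_tokens is not None:
            self.embed_tokens = embed_tokens  # reuse: weights stay shared by construction
        else:
            self.embed_tokens = ScaledWordEmbedding(vocab_size, d_model, padding_idx, embed_scale)


shared = ScaledWordEmbedding(50005, 768, padding_idx=1, embed_scale=math.sqrt(768))
encoder = ToyEncoder(50005, 768, padding_idx=1, embed_tokens=shared)
assert encoder.embed_tokens is shared
```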

View File

@ -25,7 +25,6 @@ from torch import nn
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...masking_utils import create_bidirectional_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, ModelOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
@ -775,19 +774,14 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi
lengths_expand = audio_feat_lengths.unsqueeze(1).expand(batch_size, max_seq_len)
# Create mask
padding_mask = seq_range >= lengths_expand
audio_attention_mask_2d = (~padding_mask).to(dtype=torch.long, device=audio_feat_lengths.device)
dummy_embeds = torch.zeros(
(batch_size, max_seq_len, 1),
dtype=inputs_embeds.dtype,
device=inputs_embeds.device,
audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
batch_size, 1, max_seq_len, max_seq_len
)
audio_attention_mask = create_bidirectional_mask(
config=self.audio_tower.config,
input_embeds=dummy_embeds,
attention_mask=audio_attention_mask_2d,
audio_attention_mask = audio_attention_mask_.to(
dtype=self.audio_tower.conv1.weight.dtype, device=self.audio_tower.conv1.weight.device
)
audio_attention_mask[audio_attention_mask_] = float("-inf")
audio_outputs = self.audio_tower(input_features, attention_mask=audio_attention_mask)
selected_audio_feature = audio_outputs.last_hidden_state
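
The Qwen2-Audio hunk above rebuilds the audio mask by hand: per-sample feature lengths become a boolean padding mask, which is expanded to `(batch, 1, seq_len, seq_len)` and turned into an additive mask with `-inf` at padded positions. A standalone sketch of that construction with made-up lengths:

```python
# Standalone sketch of the mask construction visible above; the lengths are made up.
import torch

audio_feat_lengths = torch.tensor([6, 3])
batch_size, max_seq_len = audio_feat_lengths.shape[0], int(audio_feat_lengths.max())

seq_range = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, max_seq_len)
lengths_expand = audio_feat_lengths.unsqueeze(1).expand(batch_size, max_seq_len)
padding_mask = seq_range >= lengths_expand          # True where the position is padding

audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
    batch_size, 1, max_seq_len, max_seq_len
)
# dtype change => fresh tensor, so the in-place fill below is safe
audio_attention_mask = audio_attention_mask_.to(torch.float32)
audio_attention_mask[audio_attention_mask_] = float("-inf")

print(audio_attention_mask.shape)    # torch.Size([2, 1, 6, 6])
print(audio_attention_mask[1, 0, 0]) # tensor([0., 0., 0., -inf, -inf, -inf])
```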

View File

@ -1316,6 +1316,7 @@ class Qwen3VLProcessor(Qwen2VLProcessor):
video_metadata = videos_inputs.pop("video_metadata")
else:
video_metadata = videos_inputs["video_metadata"]
video_grid_thw = videos_inputs["video_grid_thw"]
else:
videos_inputs = {}
video_grid_thw = None

View File

@ -157,6 +157,7 @@ class Qwen3VLProcessor(ProcessorMixin):
video_metadata = videos_inputs.pop("video_metadata")
else:
video_metadata = videos_inputs["video_metadata"]
video_grid_thw = videos_inputs["video_grid_thw"]
else:
videos_inputs = {}
video_grid_thw = None

View File

@ -383,10 +383,6 @@ class Mxfp4HfQuantizer(HfQuantizer):
state_dict = model.state_dict()
# Get num_local_experts from model config
num_local_experts = getattr(model.config, "num_local_experts", 32)
hidden_size = getattr(model.config, "hidden_size", 2880)
for name, module in model.named_modules():
if (
isinstance(module, Mxfp4GptOssExperts)
@ -396,7 +392,7 @@ class Mxfp4HfQuantizer(HfQuantizer):
state_dict[f"{name}.gate_up_proj_blocks"] = (
module.gate_up_proj.storage.layout.unswizzle_data(module.gate_up_proj.storage.data)
.transpose(-1, -2)
.reshape(num_local_experts, -1, 90, 16)
.reshape(32, -1, 90, 16)
)
state_dict[f"{name}.gate_up_proj_scales"] = (
module.gate_up_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
@ -406,7 +402,7 @@ class Mxfp4HfQuantizer(HfQuantizer):
state_dict[f"{name}.down_proj_blocks"] = (
module.down_proj.storage.layout.unswizzle_data(module.down_proj.storage.data)
.transpose(-1, -2)
.reshape(num_local_experts, hidden_size, 90, -1)
.reshape(32, 2880, 90, -1)
)
state_dict[f"{name}.down_proj_scales"] = (
module.down_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
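
The hard-coded `90 × 16` block layout in the reshapes above is consistent with a 2880-wide dimension if MXFP4 packs two 4-bit values per byte in blocks of 32 elements (an assumption about the packing, not something stated in the hunk):

```python
# Shape arithmetic behind the hard-coded reshapes above, assuming MXFP4 stores two
# 4-bit values per byte in blocks of 32 elements.
hidden_size = 2880                                   # value hard-coded in the reshape
values_per_byte = 2                                  # two FP4 values per uint8
block_elems = 32                                     # assumed MXFP4 block size
bytes_per_block = block_elems // values_per_byte     # 16
num_blocks = hidden_size // block_elems              # 90

assert (num_blocks, bytes_per_block) == (90, 16)
assert num_blocks * bytes_per_block * values_per_byte == hidden_size  # 90 * 16 * 2 == 2880
```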

View File

@ -173,12 +173,12 @@ def recursive_parse(
return parsed_schema
elif isinstance(node_content, dict):
for key, child_node in node_schema.get("properties", {}).items():
if "const" in child_node:
parsed_schema[key] = child_node["const"]
elif key in node_content:
if key in node_content:
parsed_schema[key] = recursive_parse(node_content[key], child_node)
elif "default" in child_node:
parsed_schema[key] = child_node["default"]
else:
pass
if "additionalProperties" in node_schema:
for key, value in node_content.items():
if key not in node_schema.get("properties", {}):
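
For the schema-parsing hunk above, a simplified stand-in (a hypothetical helper, not the library's `recursive_parse`) shows the difference between the two sides: with `const` handling the schema dictates the value, without it the value comes from the content and then from `default`. This is presumably also why `"role": "assistant"` entries appear in or disappear from the chat-schema test expectations further down.

```python
# Hypothetical, simplified stand-in for the property-filling logic above.
def fill_properties(node_content: dict, node_schema: dict, use_const: bool = False) -> dict:
    parsed = {}
    for key, child_schema in node_schema.get("properties", {}).items():
        if use_const and "const" in child_schema:
            parsed[key] = child_schema["const"]   # fixed value dictated by the schema
        elif key in node_content:
            parsed[key] = node_content[key]       # value actually present in the message
        elif "default" in child_schema:
            parsed[key] = child_schema["default"] # schema-provided fallback
    return parsed


schema = {"properties": {"role": {"const": "assistant"}, "content": {"type": "string"}}}
msg = {"role": "user", "content": "hi"}
print(fill_properties(msg, schema))                  # {'role': 'user', 'content': 'hi'}
print(fill_properties(msg, schema, use_const=True))  # {'role': 'assistant', 'content': 'hi'}
```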

View File

@ -47,7 +47,8 @@ PACKAGE_DISTRIBUTION_MAPPING = importlib.metadata.packages_distributions()
def _is_package_available(pkg_name: str, return_version: bool = False) -> tuple[bool, str] | bool:
"""Check if `pkg_name` exist, and optionally try to get its version"""
spec = importlib.util.find_spec(pkg_name)
package_exists = spec is not None
# the spec might be not None but not importable
package_exists = spec is not None and spec.loader is not None
package_version = "N/A"
if package_exists and return_version:
try:
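
The extra `spec.loader is not None` guard covers the case where `find_spec` succeeds without anything importable behind it, most commonly a bare directory on `sys.path` that CPython treats as a namespace package at `find_spec` time. A small demonstration; the directory name is arbitrary:

```python
# Demonstration of a spec that exists but has no loader: an empty directory on sys.path
# is picked up as a namespace-package portion by find_spec.
import importlib.util
import sys
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    (Path(tmp) / "not_really_a_package").mkdir()   # no __init__.py, no modules
    sys.path.insert(0, tmp)
    try:
        spec = importlib.util.find_spec("not_really_a_package")
        print(spec is not None)                     # True: a namespace-package spec is returned
        print(spec.loader is None)                  # typically True: nothing importable behind it
        print(spec is not None and spec.loader is not None)  # False -> treated as "not available"
    finally:
        sys.path.remove(tmp)
```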

View File

@ -16,7 +16,6 @@
import copy
import tempfile
import unittest
import unittest.mock
from functools import cached_property
import timeout_decorator # noqa
@ -478,23 +477,6 @@ class BartModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
with torch.no_grad():
model(**inputs)[0]
def test_input_embeddings_support_forward_hook(self):
# Make sure that registering hooks on the input embeddings are indeed called
# in forward. This is necessary for gradient checkpointing in PEFT, see also #41821.
config, inputs_dict = self.model_tester.prepare_config_and_inputs()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
hook = unittest.mock.MagicMock(return_value=None)
model.get_input_embeddings().register_forward_hook(hook)
inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
model(**inputs)
self.assertGreater(hook.call_count, 0)
@require_torch_fp16
def test_generate_fp16(self):
config, input_dict = self.model_tester.prepare_config_and_inputs()

View File

@ -28,6 +28,7 @@ from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_torch_fp16,
require_torch_gpu,
require_torch_multi_accelerator,
require_vision,
slow,
@ -1733,7 +1734,7 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
self.assertEqual(predictions[0].tolist(), expected_ids_and_text[0])
self.assertEqual(generated_text, expected_ids_and_text[1])
@require_torch_accelerator
@require_torch_gpu
def test_inference_itm(self):
model_name = "Salesforce/blip2-itm-vit-g"
processor = Blip2Processor.from_pretrained(model_name)

View File

@ -23,7 +23,7 @@ from transformers.testing_utils import (
Expectations,
get_device_properties,
require_torch,
require_torch_accelerator,
require_torch_gpu,
slow,
torch_device,
)
@ -400,7 +400,7 @@ class FalconH1ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
@slow
@require_torch
@require_torch_accelerator
@require_torch_gpu
class FalconH1ModelIntegrationTest(unittest.TestCase):
@slow
def test_falcon_h1_hard(self):
@ -448,36 +448,10 @@ class FalconH1ModelIntegrationTest(unittest.TestCase):
6.
"""
EXPECTED_TEXT_XPU = """
user
Tell me about the french revolution.
assistant
The French Revolution (1789–1799) was a period of radical social and political upheaval in France that fundamentally transformed the nation and had profound effects on the rest of Europe and the world. Here are the key aspects of the revolution:

### **Causes**
1. **Economic Crisis**: France was in severe financial trouble due to costly wars (particularly the American Revolution), extravagant spending by the monarchy, and inefficient taxation.
2. **Social Inequality**: The rigid class system (the Ancien Régime) favored the nobility and clergy while the majority of the population (the Third Estate) bore the brunt of taxation and had limited rights.
3. **Enlightenment Ideas**: Philosophers like Rousseau, Voltaire, and Montesquieu inspired ideas of liberty, equality, and popular sovereignty.
4. **Settlement of 1789**: The Estates-General convened to address the financial crisis, leading to debates that exposed the weaknesses of the monarchy and the grievances of the common people.
### **Key Events**
1. **Opening of the Revolution (1789)**:
- **Storming of the Bastille**: A symbol of royal tyranny, marking the start of the revolution.
- **Declaration of the Rights of Man and of the Citizen**: A foundational document proclaiming liberty, equality, and fraternity.
2. **Stages of the Revolution**:
- **Staffords' Reforms (1789–1791)**: Attempts to address grievances, including the abolition of feudal privileges and the introduction of the Civil Constitution of the Church.
- **Reign of Terror (1793–1794)**: Led by Maximilien Robespierre, characterized by mass executions of perceived enemies of the revolution, including King Louis XVI and Queen Marie Antoinette.
- **Thermidorian Reaction (1794)**: The fall of Robespierre and the end of the Reign of Terror.
3. **
"""
expected_texts = Expectations(
{
(None, None): EXPECTED_TEXT_DEFAULT,
("cuda", 8): EXPECTED_TEXT_A10,
("xpu", None): EXPECTED_TEXT_XPU,
}
)
EXPECTED_TEXT = expected_texts.get_expectation()
@ -492,9 +466,10 @@ class FalconH1ModelIntegrationTest(unittest.TestCase):
model_id = "tiiuae/Falcon-H1-1.5B-Deep-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = FalconH1ForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")
device = "cuda"
messages = [{"role": "user", "content": "Tell me about the french revolution."}]
input_text = tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(torch_device)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
with torch.no_grad():
outputs = model.generate(inputs, max_new_tokens=512, do_sample=False)

View File

@ -48,6 +48,7 @@ class HeliumModelTest(CausalLMModelTest, unittest.TestCase):
@slow
# @require_torch_gpu
class HeliumIntegrationTest(unittest.TestCase):
input_text = ["Hello, today is a great day to"]

View File

@ -146,7 +146,7 @@ class MLCDVisionModelIntegrationTest(unittest.TestCase):
@slow
def test_inference(self):
model_name = "DeepGlint-AI/mlcd-vit-bigG-patch14-448"
model = MLCDVisionModel.from_pretrained(model_name, attn_implementation="eager").to(torch_device)
model = MLCDVisionModel.from_pretrained(model_name).to(torch_device)
processor = AutoProcessor.from_pretrained(model_name)
# process single image

View File

@ -25,7 +25,7 @@ from transformers import (
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_torch_accelerator,
require_torch_gpu,
slow,
torch_device,
)
@ -39,7 +39,7 @@ if is_accelerate_available():
from accelerate import init_empty_weights
@require_torch_accelerator
@require_torch_gpu
class BitNetQuantConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
@ -53,7 +53,7 @@ class BitNetQuantConfigTest(unittest.TestCase):
@slow
@require_torch_accelerator
@require_torch_gpu
@require_accelerate
class BitNetTest(unittest.TestCase):
model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"
@ -197,7 +197,7 @@ class BitNetTest(unittest.TestCase):
@slow
@require_torch_accelerator
@require_torch_gpu
@require_accelerate
class BitNetSerializationTest(unittest.TestCase):
def test_model_serialization(self):

View File

@ -4142,7 +4142,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
# perfect world: fp32_init/2 == fp16_eval
self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000)
@require_torch_accelerator
@require_torch_gpu
@pytest.mark.torch_compile_test
def test_torch_compile_train(self):
with tempfile.TemporaryDirectory() as tmp_dir:
@ -4154,7 +4154,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
metrics = trainer.train()
self.assertAlmostEqual(metrics.training_loss, original_train_loss)
@require_torch_accelerator
@require_torch_gpu
@pytest.mark.torch_compile_test
def test_torch_compile_eval(self):
with tempfile.TemporaryDirectory() as tmp_dir:
@ -4165,7 +4165,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
trainer = get_regression_trainer(torch_compile=True, output_dir=tmp_dir)
metrics = trainer.evaluate()
self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss, delta=1e-6)
self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
@require_torch_accelerator
@require_torch_bf16
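
A side note on the eval-loss assertion above: without `delta`, `assertAlmostEqual` rounds the difference to 7 decimal places, while `delta=1e-6` bounds the absolute difference directly, so the two forms accept different tolerances. A tiny illustration with made-up numbers:

```python
# Illustration of the two tolerance modes of unittest's assertAlmostEqual;
# the numbers are made up and unrelated to the actual trainer losses.
import unittest


class ToleranceDemo(unittest.TestCase):
    def test_default_places(self):
        # Default mode: round(a - b, 7) == 0, so a difference of 4e-8 passes
        # while 9e-7 would fail.
        self.assertAlmostEqual(1.0, 1.0 + 4e-8)

    def test_explicit_delta(self):
        # delta mode: abs(a - b) <= delta, here a looser 1e-6 bound that 9e-7 satisfies.
        self.assertAlmostEqual(1.0, 1.0 + 9e-7, delta=1e-6)


if __name__ == "__main__":
    unittest.main()
```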

View File

@ -281,7 +281,6 @@ class ChatSchemaParserTest(unittest.TestCase):
self.assertEqual(
parsed_chat,
{
"role": "assistant",
"thinking": 'Okay, the user said, "Hello! How are you?" I need to respond appropriately. Since this is the first message, I should greet them back and ask how I can assist. I should keep it friendly and open-ended. Let me make sure the response is welcoming and encourages them to share what they need help with. I\'ll avoid any technical jargon and keep it simple. Let me check for any typos and ensure the tone is positive.',
"tool_calls": [
{
@ -303,10 +302,9 @@ class ChatSchemaParserTest(unittest.TestCase):
self.assertEqual(
parsed_chat,
{
"role": "assistant",
"tool_calls": [
{"type": "function", "function": {"name": "get_weather", "arguments": {"city": "Paris"}}}
],
]
},
)
@ -316,7 +314,6 @@ class ChatSchemaParserTest(unittest.TestCase):
self.assertEqual(
parsed_chat,
{
"role": "assistant",
"content": "Some content about gravity goes here but I'm cutting it off to make this shorter!",
"thinking": 'Okay, the user asked, "Hey! Can you tell me about gravity?" Let me start by breaking down what they might be looking for. They probably want a basic understanding of gravity, maybe for a school project or just personal curiosity. I should explain what gravity is, how it works, and maybe some examples.',
},
@ -328,7 +325,6 @@ class ChatSchemaParserTest(unittest.TestCase):
self.assertEqual(
parsed_chat,
{
"role": "assistant",
"tool_calls": [
{
"type": "function",
@ -340,6 +336,6 @@ class ChatSchemaParserTest(unittest.TestCase):
},
},
}
],
]
},
)

View File

@ -97,17 +97,7 @@ def is_bad_commit(target_test, commit):
# Restore to original commit
repo.git.checkout(original_head)
n_passed = 0
o = re.findall(r"====.* (\d+) passed", result.stdout)
if len(o) > 0:
n_passed = int(o[0])
n_failed = 0
o = re.findall(r"====.* (\d+) failed", result.stdout)
if len(o) > 0:
n_failed = int(o[0])
return result.returncode != 0, n_failed, n_passed
return result.returncode != 0
def find_bad_commit(target_test, start_commit, end_commit):
@ -123,8 +113,7 @@ def find_bad_commit(target_test, start_commit, end_commit):
"""
# check if `end_commit` fails the test
# (we only need one failure to conclude the test is flaky on the previous run with `end_commit`)
failed_before, _, _ = is_bad_commit(target_test, end_commit)
failed_before = is_bad_commit(target_test, end_commit)
if failed_before:
return (
None,
@ -141,9 +130,8 @@ def find_bad_commit(target_test, start_commit, end_commit):
# Now, we are (almost) sure `target_test` is not failing at `end_commit`
# check if `start_commit` fail the test
# **IMPORTANT** we only need one pass to conclude the test is flaky on the current run with `start_commit`!
_, n_failed, n_passed = is_bad_commit(target_test, start_commit)
if n_passed > 0:
failed_now = is_bad_commit(target_test, start_commit)
if not failed_now:
# failed on CI run, but not reproducible here --> don't report
return None, f"flaky: test fails on the current CI run (commit: {start_commit}) but passes during the check."
@ -206,13 +194,12 @@ def get_commit_info(commit):
if pr_for_commit["merged_by"] is not None:
merged_author = pr_for_commit["merged_by"]["login"]
url = f"https://api.github.com/repos/huggingface/transformers/commits/{commit}"
commit_info = requests.get(url).json()
parent = commit_info["parents"][0]["sha"]
if author is None:
url = f"https://api.github.com/repos/huggingface/transformers/commits/{commit}"
commit_info = requests.get(url).json()
author = commit_info["author"]["login"]
return {"commit": commit, "pr_number": pr_number, "author": author, "merged_by": merged_author, "parent": parent}
return {"commit": commit, "pr_number": pr_number, "author": author, "merged_by": merged_author}
if __name__ == "__main__":
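
The richer `is_bad_commit` variant above also scrapes pass/fail counts from the pytest summary line; a standalone sketch of that extraction on a made-up summary string:

```python
# Standalone sketch of the pass/fail extraction used by the richer is_bad_commit
# variant above; the summary line is made up for illustration.
import re

stdout = "==== 2 failed, 5 passed, 1 skipped in 12.34s ===="

n_passed = 0
o = re.findall(r"====.* (\d+) passed", stdout)
if len(o) > 0:
    n_passed = int(o[0])

n_failed = 0
o = re.findall(r"====.* (\d+) failed", stdout)
if len(o) > 0:
    n_failed = int(o[0])

print(n_failed, n_passed)  # 2 5
```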

View File

@ -1407,10 +1407,7 @@ if __name__ == "__main__":
if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")):
os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}"))
nvidia_daily_ci_workflow = (
"huggingface/transformers/.github/workflows/self-scheduled-caller.yml",
"huggingface/transformers/.github/workflows/self-scheduled-flash-attn-caller.yml",
)
nvidia_daily_ci_workflow = "huggingface/transformers/.github/workflows/self-scheduled-caller.yml"
amd_daily_ci_workflows = (
"huggingface/transformers/.github/workflows/self-scheduled-amd-mi325-caller.yml",
"huggingface/transformers/.github/workflows/self-scheduled-amd-mi355-caller.yml",