Compare commits

..

7 Commits

Author SHA1 Message Date
b9dae9d59d try 2025-06-05 16:35:42 +02:00
60d873fd2f trigger 2025-06-05 16:24:40 +02:00
3093bec4c2 build images 2025-06-05 16:15:04 +02:00
0a7de90c11 trigger CI 2025-06-05 16:13:08 +02:00
88b9a2a807 no need 3.2.1 2025-06-05 16:11:53 +02:00
ae1241c02d trigger CI 2025-06-05 16:11:53 +02:00
b1c4c17e0a try build with torch 2.7 2025-06-05 16:11:53 +02:00
74 changed files with 613 additions and 1543 deletions

View File

@ -39,7 +39,7 @@ jobs:
fetch_tests:
working_directory: ~/transformers
docker:
- image: huggingface/transformers-quality
- image: huggingface/transformers-quality:dev
parallelism: 1
steps:
- checkout
@ -91,7 +91,7 @@ jobs:
fetch_all_tests:
working_directory: ~/transformers
docker:
- image: huggingface/transformers-quality
- image: huggingface/transformers-quality:dev
parallelism: 1
steps:
- checkout
@ -140,7 +140,7 @@ jobs:
check_code_quality:
working_directory: ~/transformers
docker:
- image: huggingface/transformers-quality
- image: huggingface/transformers-quality:dev
resource_class: large
environment:
TRANSFORMERS_IS_CI: yes
@ -165,7 +165,7 @@ jobs:
check_repository_consistency:
working_directory: ~/transformers
docker:
- image: huggingface/transformers-consistency
- image: huggingface/transformers-consistency:dev
resource_class: large
environment:
TRANSFORMERS_IS_CI: yes

View File

@ -105,8 +105,7 @@ class CircleCIJob:
else:
# BIG HACK WILL REMOVE ONCE FETCHER IS UPDATED
print(os.environ.get("GIT_COMMIT_MESSAGE"))
if "[build-ci-image]" in os.environ.get("GIT_COMMIT_MESSAGE", "") or os.environ.get("GIT_COMMIT_MESSAGE", "") == "dev-ci":
self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
print(f"Using {self.docker_image} docker image")
if self.install_steps is None:
self.install_steps = ["uv venv && uv pip install ."]

View File

@ -3,7 +3,7 @@ name: Build pr ci-docker
on:
push:
branches:
- push-ci-image # for now let's only build on this branch
- try_torch_2.7_on_circleci_jobs_xxx
repository_dispatch:
workflow_call:
inputs:
@ -22,7 +22,6 @@ jobs:
build:
runs-on: ubuntu-22.04
if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }}
strategy:
matrix:
@ -33,13 +32,8 @@ jobs:
-
name: Set tag
run: |
if ${{contains(github.event.head_commit.message, '[build-ci-image]')}}; then
echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
echo "setting it to DEV!"
else
echo "TAG=huggingface/transformers-${{ matrix.file }}" >> "$GITHUB_ENV"
fi
echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
echo "setting it to DEV!"
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@ -60,18 +54,5 @@ jobs:
build-args: |
REF=${{ github.sha }}
file: "./docker/${{ matrix.file }}.dockerfile"
push: ${{ contains(github.event.head_commit.message, 'ci-image]') || github.event_name == 'schedule' }}
push: true
tags: ${{ env.TAG }}
notify:
runs-on: ubuntu-22.04
if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }}
steps:
- name: Post to Slack
if: ${{ contains(github.event.head_commit.message, '[push-ci-image]') && github.event_name != 'schedule' }}
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: "#transformers-ci-circleci-images"
title: 🤗 New docker images for CircleCI are pushed.
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

View File

@ -19,7 +19,7 @@ concurrency:
jobs:
latest-docker:
name: "Latest PyTorch [dev]"
name: "Latest PyTorch + TensorFlow [dev]"
runs-on:
group: aws-general-8-plus
steps:
@ -267,6 +267,44 @@ jobs:
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-tensorflow:
name: "Latest TensorFlow [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-tensorflow-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-tensorflow-gpu
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-pytorch-deepspeed-amd:
name: "PyTorch + DeepSpeed (AMD) [dev]"
runs-on:

View File

@ -69,6 +69,18 @@ jobs:
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
tf-pipeline:
name: TF pipeline CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_pipelines_tf_gpu
slack_report_channel: "#transformers-ci-daily-pipeline-tf"
runner: daily-ci
docker: huggingface/transformers-tensorflow-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
example-ci:
name: Example CI
uses: ./.github/workflows/self-scheduled.yml

View File

@ -209,6 +209,75 @@ jobs:
name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
run_pipelines_tf_gpu:
if: ${{ inputs.job == 'run_pipelines_tf_gpu' }}
name: TensorFlow pipelines
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-tensorflow-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
run: |
git fetch && git checkout ${{ github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Run all pipeline tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports tests/pipelines
- name: Failure short reports
if: ${{ always() }}
run: |
cat /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports
run_examples_gpu:
if: ${{ inputs.job == 'run_examples_gpu' }}
name: Examples directory
@ -502,6 +571,7 @@ jobs:
run_models_gpu,
run_trainer_and_fsdp_gpu,
run_pipelines_torch_gpu,
run_pipelines_tf_gpu,
run_examples_gpu,
run_torch_cuda_extensions_gpu,
run_quantization_torch_gpu,

View File

@ -28,7 +28,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability
RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 "tensorflow_text<2.16" "tensorflow_probability<0.22" && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
RUN python3 -m pip uninstall -y flax jax

View File

@ -19,9 +19,6 @@ Hyperparameter search discovers an optimal set of hyperparameters that produces
This guide will go over how to set up a hyperparameter search for each of the backends.
> [!WARNING]
> [SigOpt](https://github.com/sigopt/sigopt-server) is in public archive mode and is no longer actively maintained. Try using Optuna, Weights & Biases or Ray Tune instead.
```bash
pip install optuna/sigopt/wandb/ray[tune]
```
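For reference, a minimal sketch of the Optuna backend mentioned above, built on `Trainer.hyperparameter_search` (the DistilBERT checkpoint and the `train_dataset`/`eval_dataset` placeholders are illustrative assumptions, not part of this diff):
```python
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

def model_init():
    # A fresh model is instantiated for every trial.
    return AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

def hp_space(trial):
    # Search space handed to Optuna for each trial.
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
    }

trainer = Trainer(
    model=None,                   # the model is supplied by model_init
    args=TrainingArguments(output_dir="hp_search", eval_strategy="epoch"),
    model_init=model_init,
    train_dataset=train_dataset,  # placeholder: your tokenized training split
    eval_dataset=eval_dataset,    # placeholder: your tokenized evaluation split
)

best_trial = trainer.hyperparameter_search(
    backend="optuna",
    hp_space=hp_space,
    n_trials=20,
    direction="minimize",
)
print(best_trial)
```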

View File

@ -14,71 +14,60 @@ rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# Aria
[Aria](https://huggingface.co/papers/2410.05993) is a multimodal mixture-of-experts (MoE) model. The goal of this model is to open-source a training recipe for creating a multimodal native model from scratch. Aria has 3.9B and 3.5B activated parameters per visual and text token respectively. Text is handled by a MoE decoder and visual inputs are handled by a lightweight visual encoder. It is trained in 4 stages, language pretraining, multimodal pretraining, multimodal long-context pretraining, and multimodal post-training.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
You can find all the original Aria checkpoints under the [Aria](https://huggingface.co/rhymes-ai?search_models=aria) organization.
## Overview
> [!TIP]
> Click on the Aria models in the right sidebar for more examples of how to apply Aria to different multimodal tasks.
The Aria model was proposed in [Aria: An Open Multimodal Native Mixture-of-Experts Model](https://huggingface.co/papers/2410.05993) by Li et al. from the Rhymes.AI team.
The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
Aria is an open multimodal-native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. It has a Mixture-of-Experts architecture, with respectively 3.9B and 3.5B activated parameters per visual token and text token.
<hfoptions id="usage">
<hfoption id="Pipeline">
The abstract from the paper is the following:
*Information comes in diverse modalities. Multimodal native AI models are essential to integrate real-world information and deliver comprehensive understanding. While proprietary multimodal native models exist, their lack of openness imposes obstacles for adoptions, let alone adaptations. To fill this gap, we introduce Aria, an open multimodal native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. Aria is a mixture-of-expert model with 3.9B and 3.5B activated parameters per visual token and text token, respectively. It outperforms Pixtral-12B and Llama3.2-11B, and is competitive against the best proprietary models on various multimodal tasks. We pre-train Aria from scratch following a 4-stage pipeline, which progressively equips the model with strong capabilities in language understanding, multimodal understanding, long context window, and instruction following. We open-source the model weights along with a codebase that facilitates easy adoptions and adaptations of Aria in real-world applications.*
This model was contributed by [m-ric](https://huggingface.co/m-ric).
The original code can be found [here](https://github.com/rhymes-ai/Aria).
## Usage tips
Here's how to use the model for vision tasks:
```python
import requests
import torch
from transformers import pipeline
from PIL import Image
pipeline = pipeline(
"image-to-text",
model="rhymes-ai/Aria",
device=0,
torch_dtype=torch.bfloat16
)
pipeline(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
text="What is shown in this image?"
)
```
from transformers import AriaProcessor, AriaForConditionalGeneration
</hfoption>
<hfoption id="AutoModel">
model_id_or_path = "rhymes-ai/Aria"
```python
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
model = AutoModelForCausalLM.from_pretrained(
"rhymes-ai/Aria",
device_map="auto",
torch_dtype=torch.bfloat16,
attn_implementation="sdpa"
model = AriaForConditionalGeneration.from_pretrained(
model_id_or_path, device_map="auto"
)
processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
processor = AriaProcessor.from_pretrained(model_id_or_path)
image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
messages = [
{
"role": "user", "content": [
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
{"type": "text", "text": "What is shown in this image?"},
]
},
"role": "user",
"content": [
{"type": "image"},
{"text": "what is the image?", "type": "text"},
],
}
]
inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
inputs = inputs.to(model.device, torch.bfloat16)
text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=text, images=image, return_tensors="pt")
inputs.to(model.device)
output = model.generate(
**inputs,
@ -90,55 +79,6 @@ output = model.generate(
)
output_ids = output[0][inputs["input_ids"].shape[1]:]
response = processor.decode(output_ids, skip_special_tokens=True)
print(response)
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4 and the [rhymes-ai/Aria-sequential_mlp](https://huggingface.co/rhymes-ai/Aria-sequential_mlp) checkpoint. This checkpoint replaces grouped GEMM with `torch.nn.Linear` layers for easier quantization.
```py
# pip install torchao
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoProcessor
quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
model = AutoModelForCausalLM.from_pretrained(
"rhymes-ai/Aria-sequential_mlp",
torch_dtype=torch.bfloat16,
device_map="auto",
quantization_config=quantization_config
)
processor = AutoProcessor.from_pretrained(
"rhymes-ai/Aria-sequential_mlp",
)
messages = [
{
"role": "user", "content": [
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
{"type": "text", "text": "What is shown in this image?"},
]
},
]
inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
inputs = inputs.to(model.device, torch.bfloat16)
output = model.generate(
**inputs,
max_new_tokens=15,
stop_strings=["<|im_end|>"],
tokenizer=processor.tokenizer,
do_sample=True,
temperature=0.9,
)
output_ids = output[0][inputs["input_ids"].shape[1]:]
response = processor.decode(output_ids, skip_special_tokens=True)
print(response)
```

View File

@ -216,12 +216,12 @@ processor.batch_decode(generate_ids, skip_special_tokens=True)
## Note regarding reproducing original implementation
In order to match the logits of the [original implementation](https://github.com/haotian-liu/LLaVA/tree/main), one needs to additionally specify `do_pad=True` when instantiating `LlavaImageProcessor`:
In order to match the logits of the [original implementation](https://github.com/haotian-liu/LLaVA/tree/main), one needs to additionally specify `do_pad=True` when instantiating `LLavaImageProcessor`:
```python
from transformers import LlavaImageProcessor
from transformers import LLavaImageProcessor
image_processor = LlavaImageProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", do_pad=True)
image_processor = LLavaImageProcessor.from_pretrained("https://huggingface.co/llava-hf/llava-1.5-7b-hf", do_pad=True)
```
### Using Flash Attention 2

View File

@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
## Overview
The MiniMax-Text-01 model was proposed in [MiniMax-01: Scaling Foundation Models with Lightning Attention](https://arxiv.org/abs/2501.08313) by MiniMax, Aonian Li, Bangwei Gong, Bo Yang, Boji Shan, Chang Liu, Cheng Zhu, Chunhao Zhang, Congchao Guo, Da Chen, Dong Li, Enwei Jiao, Gengxin Li, Guojun Zhang, Haohai Sun, Houze Dong, Jiadai Zhu, Jiaqi Zhuang, Jiayuan Song, Jin Zhu, Jingtao Han, Jingyang Li, Junbin Xie, Junhao Xu, Junjie Yan, Kaishun Zhang, Kecheng Xiao, Kexi Kang, Le Han, Leyang Wang, Lianfei Yu, Liheng Feng, Lin Zheng, Linbo Chai, Long Xing, Meizhi Ju, Mingyuan Chi, Mozhi Zhang, Peikai Huang, Pengcheng Niu, Pengfei Li, Pengyu Zhao, Qi Yang, Qidi Xu, Qiexiang Wang, Qin Wang, Qiuhui Li, Ruitao Leng, Shengmin Shi, Shuqi Yu, Sichen Li, Songquan Zhu, Tao Huang, Tianrun Liang, Weigao Sun, Weixuan Sun, Weiyu Cheng, Wenkai Li, Xiangjun Song, Xiao Su, Xiaodong Han, Xinjie Zhang, Xinzhu Hou, Xu Min, Xun Zou, Xuyang Shen, Yan Gong, Yingjie Zhu, Yipeng Zhou, Yiran Zhong, Yongyi Hu, Yuanxiang Fan, Yue Yu, Yufeng Yang, Yuhao Li, Yunan Huang, Yunji Li, Yunpeng Huang, Yunzhi Xu, Yuxin Mao, Zehan Li, Zekang Li, Zewei Tao, Zewen Ying, Zhaoyang Cong, Zhen Qin, Zhenhua Fan, Zhihang Yu, Zhuo Jiang, Zijia Wu.
The DepthPro model was proposed in [MiniMax-01: Scaling Foundation Models with Lightning Attention](https://arxiv.org/abs/2501.08313) by MiniMax, Aonian Li, Bangwei Gong, Bo Yang, Boji Shan, Chang Liu, Cheng Zhu, Chunhao Zhang, Congchao Guo, Da Chen, Dong Li, Enwei Jiao, Gengxin Li, Guojun Zhang, Haohai Sun, Houze Dong, Jiadai Zhu, Jiaqi Zhuang, Jiayuan Song, Jin Zhu, Jingtao Han, Jingyang Li, Junbin Xie, Junhao Xu, Junjie Yan, Kaishun Zhang, Kecheng Xiao, Kexi Kang, Le Han, Leyang Wang, Lianfei Yu, Liheng Feng, Lin Zheng, Linbo Chai, Long Xing, Meizhi Ju, Mingyuan Chi, Mozhi Zhang, Peikai Huang, Pengcheng Niu, Pengfei Li, Pengyu Zhao, Qi Yang, Qidi Xu, Qiexiang Wang, Qin Wang, Qiuhui Li, Ruitao Leng, Shengmin Shi, Shuqi Yu, Sichen Li, Songquan Zhu, Tao Huang, Tianrun Liang, Weigao Sun, Weixuan Sun, Weiyu Cheng, Wenkai Li, Xiangjun Song, Xiao Su, Xiaodong Han, Xinjie Zhang, Xinzhu Hou, Xu Min, Xun Zou, Xuyang Shen, Yan Gong, Yingjie Zhu, Yipeng Zhou, Yiran Zhong, Yongyi Hu, Yuanxiang Fan, Yue Yu, Yufeng Yang, Yuhao Li, Yunan Huang, Yunji Li, Yunpeng Huang, Yunzhi Xu, Yuxin Mao, Zehan Li, Zekang Li, Zewei Tao, Zewen Ying, Zhaoyang Cong, Zhen Qin, Zhenhua Fan, Zhihang Yu, Zhuo Jiang, Zijia Wu.
The abstract from the paper is the following:
@ -148,8 +148,8 @@ Quantizing a model is as simple as passing a `quantization_config` to the model.
"The expected output"
```
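The body of the quantization snippet is elided from this hunk; as a hedged sketch of what "passing a `quantization_config`" typically looks like for this checkpoint (the 4-bit bitsandbytes settings below are an assumption, not necessarily what the original example uses):
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
model = AutoModelForCausalLM.from_pretrained(
    "MiniMaxAI/MiniMax-Text-01-hf",
    device_map="auto",
    quantization_config=quantization_config,  # the only change versus an unquantized load
)
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0], skip_special_tokens=True))
```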
This model was contributed by [geetu040](https://github.com/geetu040) and [Shakib-IO](https://github.com/Shakib-IO).
The original code can be found [here](https://huggingface.co/MiniMaxAI/MiniMax-Text-01/blob/main/modeling_minimax_text_01.py).
This model was contributed by [geetu040](https://github.com/geetu040).
The original code can be found [here](https://huggingface.co/MiniMaxAI/MiniMax-Text-01-hf/blob/main/modeling_minimax.py).
## Resources

View File

@ -1,42 +0,0 @@
import datasets
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
torch.set_float32_matmul_precision("high")
model_id = "meta-llama/Llama-3.2-3b-Instruct"
model = AutoModelForCausalLM.from_pretrained(
model_id, attn_implementation="sdpa_paged", torch_dtype=torch.bfloat16, device_map=0
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
generation_config = GenerationConfig(
max_new_tokens=512,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
use_cache=False,
num_blocks=2048,
block_size=128,
do_sample=True,
max_batch_tokens=1024, # Maximum number of tokens to process in a single batch
scheduler="prefill_first",
)
train_dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
def tokenize_function(examples):
return tokenizer(examples["question"])
tokenized_datasets = train_dataset.map(tokenize_function, batched=True)
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
batch_outputs = model.generate_batch(
inputs=simple_batch_inputs,
generation_config=generation_config,
progress_bar=False,
enable_visualizer=True,
tokenizer=tokenizer,
)

View File

@ -11,7 +11,7 @@ torch.set_float32_matmul_precision("high")
model_id = "meta-llama/Llama-3.2-3b-Instruct"
model = AutoModelForCausalLM.from_pretrained(
model_id, attn_implementation="sdpa_paged", torch_dtype=torch.bfloat16, device_map=0
model_id, attn_implementation="sdpa_paged", torch_dtype=torch.bfloat16, device_map="auto"
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

View File

@ -15,15 +15,10 @@ limitations under the License.
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -40,15 +40,10 @@ checkpoint: जाँच बिंदु
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -50,15 +50,10 @@ user: ユーザ
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -40,15 +40,10 @@ checkpoint: 检查点
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -52,15 +52,10 @@ user: 使用者
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -120,6 +120,7 @@ _deps = [
"huggingface-hub>=0.30.0,<1.0",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"isort>=5.5.4",
"jax>=0.4.1,<=0.4.13",
"jaxlib>=0.4.1,<=0.4.13",
"jieba",
@ -204,7 +205,6 @@ _deps = [
"opentelemetry-api",
"opentelemetry-exporter-otlp",
"opentelemetry-sdk",
"textual",
]
@ -368,7 +368,7 @@ extras["testing"] = (
extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"]
extras["ruff"] = deps_list("ruff")
extras["quality"] = deps_list("datasets", "ruff", "GitPython", "urllib3", "libcst", "rich", "pandas")
extras["quality"] = deps_list("datasets", "isort", "ruff", "GitPython", "urllib3", "libcst", "rich", "pandas")
extras["all"] = (
extras["tf"]
@ -442,8 +442,6 @@ extras["benchmark"] = deps_list("optimum-benchmark")
# OpenTelemetry dependencies for metrics collection in continuous batching
extras["open-telemetry"] = deps_list("opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk")
extras["continuous-batching-visualizer"] = deps_list("rich", "textual")
# when modifying the following list, make sure to update src/transformers/dependency_versions_check.py
install_requires = [
deps["filelock"], # filesystem locks, e.g., to prevent parallel downloads

View File

@ -27,6 +27,7 @@ deps = {
"huggingface-hub": "huggingface-hub>=0.30.0,<1.0",
"importlib_metadata": "importlib_metadata",
"ipadic": "ipadic>=1.0.0,<2.0",
"isort": "isort>=5.5.4",
"jax": "jax>=0.4.1,<=0.4.13",
"jaxlib": "jaxlib>=0.4.1,<=0.4.13",
"jieba": "jieba",

View File

@ -25,7 +25,6 @@ from enum import Enum
from functools import partial
from typing import Deque, Dict, List, Optional, Set, Tuple, Union
from tokenizers import Tokenizer
import torch
import torch.nn as nn
from torch.profiler import profile, schedule, tensorboard_trace_handler
@ -34,7 +33,6 @@ from tqdm import tqdm
from ..cache_utils import Cache
from ..configuration_utils import PretrainedConfig
from ..generation.configuration_utils import GenerationConfig
from ..utils.continuous_batching_visualizer import ContinuousBatchingVisualizer
from ..utils.metrics import ContinuousBatchProcessorMetrics, attach_tracer, traced
@ -1104,7 +1102,6 @@ class ContinuousBatchingManager:
self.profile = getattr(generation_config, "profile", False)
self.manual_eviction = manual_eviction
self.batch_processor: Optional[ContinuousBatchProcessor] = None
self.visualizer = None
@traced
def start(self):
@ -1154,12 +1151,6 @@ class ContinuousBatchingManager:
logger.info("Continuous Batching Manager stopped.")
self._generation_thread = None
def set_tokenizer(self, tokenizer: Tokenizer):
self.tokenizer = tokenizer
def set_visualizer(self, visualizer: ContinuousBatchingVisualizer):
self.visualizer = visualizer
def add_request(
self, input_ids: List[int], request_id: Optional[str] = None, max_new_tokens: Optional[int] = None
) -> str:
@ -1321,13 +1312,13 @@ class ContinuousBatchingManager:
record_shapes=False,
with_stack=True,
) as prof:
while not self.stop_event.is_set():
while not self.stop_event.is_set() or batch_processor.has_pending_requests():
self._inner_generation_loop(batch_processor, is_first)
if is_first:
is_first = False
prof.step()
else:
while not self.stop_event.is_set():
while not self.stop_event.is_set() or batch_processor.has_pending_requests():
self._inner_generation_loop(batch_processor, is_first)
if is_first:
is_first = False
@ -1343,10 +1334,6 @@ class ContinuousBatchingManager:
if torch.cuda.is_available():
torch.cuda.synchronize()
batch_processor.prepare_next_batch()
if self.visualizer is not None:
viz_data = self._collect_visualization_data(batch_processor)
self.visualizer.draw(viz_data)
self.visualizer.wait_for_input()
if torch.cuda.is_available() and self.use_cuda_graph:
if is_first:
self.warmup(batch_processor)
@ -1396,51 +1383,6 @@ class ContinuousBatchingManager:
if self.batch_processor is not None:
self.batch_processor.scheduler.finish_request(request_id)
def _collect_visualization_data(self, batch_processor: ContinuousBatchProcessor) -> Dict:
"""Collect data for visualization."""
data = {
"batch_contents": [],
"words": [],
"request_ids_per_token": [],
}
data["attention_mask"] = batch_processor.attention_mask.clone()
# Collect all tokens and map them to request IDs
all_tokens = []
all_request_ids = []
for req in batch_processor.requests_in_batch:
if self.tokenizer is not None:
decoded = self.tokenizer.decode(req.prompt_ids)
decoded_tokens_list = self.tokenizer.convert_ids_to_tokens(req.prompt_ids)
data["batch_contents"].append({"request_id": req.request_id, "decoded": decoded, "decoded_tokens": decoded_tokens_list})
all_tokens.extend(decoded_tokens_list)
else:
data["batch_contents"].append({"request_id": req.request_id, "tokens": req.prompt_ids})
# Convert token IDs to strings when no tokenizer is available
all_tokens.extend([str(token_id) for token_id in req.prompt_ids])
# Map each token to its request ID
all_request_ids.extend([req.request_id] * len(req.prompt_ids))
data["words"] = all_tokens
data["request_ids_per_token"] = all_request_ids
# Add cache statistics if available
if hasattr(batch_processor, 'cache'):
cache = batch_processor.cache
data["paged_attention_cache"] = {
"total_blocks": cache.num_blocks,
"used_blocks": cache.num_blocks - len(cache._free_blocks),
"free_blocks": len(cache._free_blocks),
"block_size": cache.block_size,
"num_heads": cache.num_key_value_heads,
"head_dim": cache.head_dim,
"utilization": (cache.num_blocks - len(cache._free_blocks)) / cache.num_blocks if cache.num_blocks > 0 else 0.0
}
return data
class ContinuousMixin:
"""Mixin class for models to add continuous batching capabilities."""
@ -1489,8 +1431,6 @@ class ContinuousMixin:
inputs: List[List[int]],
generation_config: Optional[GenerationConfig] = None,
progress_bar: bool = True,
enable_visualizer: bool = False,
tokenizer: Optional[Tokenizer] = None,
**kwargs,
) -> List[List[int]]:
"""Generate sequences for a batch of prompts using continuous batching.
@ -1498,8 +1438,6 @@ class ContinuousMixin:
Args:
inputs: List of input token sequences (prompts)
generation_config: Optional generation configuration
progress_bar: Whether to show a progress bar during generation
visualizer: Whether to visualize the continuous batching process
**kwargs: Additional generation parameters
Returns:
@ -1516,37 +1454,29 @@ class ContinuousMixin:
results = {}
num_requests = len(inputs)
try:
if enable_visualizer:
manager.add_requests(inputs, **kwargs)
visualizer = ContinuousBatchingVisualizer()
if tokenizer is not None:
manager.set_tokenizer(tokenizer)
manager.set_visualizer(visualizer)
visualizer.run()
else:
from tqdm.contrib.logging import logging_redirect_tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
with logging_redirect_tqdm([logger]):
with tqdm(
total=num_requests,
disable=(not progress_bar),
desc=f"Solving {num_requests} requests",
unit="request",
) as pbar:
manager.add_requests(inputs, **kwargs)
finished_count = 0
while finished_count < num_requests:
result = manager.get_result(timeout=1)
if result:
req_id = result.request_id
if result.status == RequestStatus.FINISHED:
results[req_id] = result
finished_count += 1
pbar.update(1)
else:
if not manager.is_running():
logger.error("Generation thread terminated unexpectedly.")
break
with logging_redirect_tqdm([logger]):
with tqdm(
total=num_requests,
disable=(not progress_bar),
desc=f"Solving {num_requests} requests",
unit="request",
) as pbar:
manager.add_requests(inputs, **kwargs)
finished_count = 0
while finished_count < num_requests:
result = manager.get_result(timeout=1)
if result:
req_id = result.request_id
if result.status == RequestStatus.FINISHED:
results[req_id] = result
finished_count += 1
pbar.update(1)
else:
if not manager.is_running():
logger.error("Generation thread terminated unexpectedly.")
break
except Exception as e:
logger.error(f"Error during batch generation: {e}", exc_info=True)

View File

@ -458,16 +458,11 @@ def deepspeed_init(trainer, num_training_steps, inference=False):
model_parameters = None
else:
trainer.optimizer = None # important for when deepspeed_init is used as re-init
deepspeed_tp_size = hf_deepspeed_config.config.get("tensor_parallel", {}).get("autotp_size", 1)
if deepspeed_tp_size > 1:
tp_size = hf_deepspeed_config.config.get("tensor_parallel", {}).get("autotp_size", 0)
if tp_size > 1:
import deepspeed
model = deepspeed.tp_model_init(
model=model,
tp_size=deepspeed_tp_size,
dtype=hf_deepspeed_config.dtype(),
config=hf_deepspeed_config.config,
)
model = deepspeed.tp_model_init(model=model, tp_size=tp_size, dtype=hf_deepspeed_config.dtype())
model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer, lr_scheduler = deepspeed_optim_sched(
trainer, hf_deepspeed_config, args, num_training_steps, model_parameters

View File

@ -1243,6 +1243,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixi
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[Tuple, CausalLMOutputWithPast]:
@ -1276,6 +1277,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixi
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
@ -1288,7 +1290,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixi
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)

View File

@ -28,11 +28,10 @@ class DecoderConfig(PretrainedConfig):
model_type = "fsmt_decoder"
def __init__(self, vocab_size=0, bos_token_id=0, is_encoder_decoder=True):
def __init__(self, vocab_size=0, bos_token_id=0):
super().__init__()
self.vocab_size = vocab_size
self.bos_token_id = bos_token_id
self.is_encoder_decoder = is_encoder_decoder
class FSMTConfig(PretrainedConfig):
@ -188,9 +187,7 @@ class FSMTConfig(PretrainedConfig):
self.init_std = init_std # Normal(0, this parameter)
self.activation_function = activation_function
self.decoder = DecoderConfig(
vocab_size=tgt_vocab_size, bos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder
)
self.decoder = DecoderConfig(vocab_size=tgt_vocab_size, bos_token_id=eos_token_id)
if "decoder" in common_kwargs:
del common_kwargs["decoder"]

View File

@ -1809,6 +1809,7 @@ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[Tuple, Kosmos2ForConditionalGenerationModelOutput]:
r"""
@ -1867,6 +1868,7 @@ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_model_output = None
projection_attentions = None
@ -1878,6 +1880,7 @@ class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])

View File

@ -604,6 +604,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs: Unpack[KwargsForCausalLM],
@ -644,6 +645,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_feature_layer = (
vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
)
@ -666,7 +668,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)

View File

@ -1525,6 +1525,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_values: Optional[torch.Tensor] = None,
pixel_values_videos: Optional[torch.FloatTensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
@ -1587,6 +1588,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.model(
input_ids=input_ids,
@ -1602,7 +1604,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)
@ -1614,6 +1616,10 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi
if labels is not None:
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return Qwen2_5_VLCausalLMOutputWithPast(
loss=loss,
logits=logits,

View File

@ -770,6 +770,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_values: Optional[torch.Tensor] = None,
pixel_values_videos: Optional[torch.FloatTensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
@ -832,6 +833,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.model(
input_ids=input_ids,
@ -847,7 +849,7 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)
@ -859,6 +861,10 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
if labels is not None:
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return Qwen2_5_VLCausalLMOutputWithPast(
loss=loss,
logits=logits,

View File

@ -1409,6 +1409,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_values: Optional[torch.Tensor] = None,
pixel_values_videos: Optional[torch.FloatTensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
@ -1468,6 +1469,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.model(
input_ids=input_ids,
@ -1482,7 +1484,7 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)

View File

@ -92,11 +92,11 @@ class AwqQuantizer(HfQuantizer):
if torch_dtype is None:
torch_dtype = torch.float16
logger.info("Loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually.")
elif torch_dtype == torch.bfloat16 and torch.cuda.is_available():
logger.warning("`torch.bfloat16` is not supported for AWQ CUDA kernels yet. Casting to `torch.float16`.")
elif torch_dtype == torch.bfloat16:
logger.warning("`torch.bfloat16` is not supported for AWQ kernels yet. Casting to `torch.float16`.")
torch_dtype = torch.float16
elif torch_dtype != torch.float16 and torch.cuda.is_available():
logger.warning("We suggest you to set `torch_dtype=torch.float16` for better efficiency on CUDA with AWQ.")
elif torch_dtype != torch.float16:
logger.warning("We suggest you to set `torch_dtype=torch.float16` for better efficiency with AWQ.")
return torch_dtype
def _process_model_before_weight_loading(

View File

@ -38,12 +38,12 @@ from typing import TYPE_CHECKING, Any, Callable, Optional, Union
# Integrations must be imported before ML frameworks:
# ruff: isort: off
# isort: off
from .integrations import (
get_reporting_integration_callbacks,
)
# ruff: isort: on
# isort: on
import huggingface_hub.utils as hf_hub_utils
import numpy as np
@ -232,7 +232,6 @@ if is_accelerate_available():
AutocastKwargs,
DistributedDataParallelKwargs,
DistributedType,
TorchTensorParallelPlugin,
load_fsdp_model,
load_fsdp_optimizer,
save_fsdp_model,
@ -2238,27 +2237,6 @@ class Trainer:
ignore_keys_for_eval=ignore_keys_for_eval,
)
def get_tp_size(self) -> int:
"""Get the tensor parallel size from either the model or DeepSpeed config."""
# 1. Check model.tp_size first
if (model_tp := getattr(self.model, "_tp_size", None)) is not None:
return model_tp
# 2. Fall back to DeepSpeed config if enabled
if self.is_deepspeed_enabled and (deepspeed_config := getattr(self.args, "hf_deepspeed_config", None)):
return deepspeed_config.config.get("tensor_parallel", {}).get("autotp_size", 1)
# 3. Default fallback
return 1
def get_total_train_batch_size(self, args) -> int:
"""Calculates total batch size (micro_batch * grad_accum * dp_world_size).
Note: Only considers DP and TP (dp_world_size = world_size // tp_size)."""
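# Illustrative arithmetic for the docstring formula (hypothetical numbers, not from this change):
# with micro_batch=8, grad_accum=4, world_size=16 and tp_size=2,
# dp_world_size = 16 // 2 = 8 and the total train batch size is 8 * 4 * 8 = 256 sequences per optimizer step.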
dp_world_size = args.world_size // self.get_tp_size()
return self._train_batch_size * args.gradient_accumulation_steps * dp_world_size
def _inner_training_loop(
self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
):
@ -2289,8 +2267,7 @@ class Trainer:
# number of training epochs: num_train_epochs
# number of training steps per epoch: num_update_steps_per_epoch
# total number of training steps to execute: max_steps
total_train_batch_size = self.get_total_train_batch_size(args)
total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size
(
num_train_epochs,
num_update_steps_per_epoch,
@ -2322,9 +2299,7 @@ class Trainer:
else:
debug_overflow = DebugUnderflowOverflow(self.model) # noqa
delay_optimizer_creation = (
is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled or self.is_tp_enabled
)
delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled
# Can't delay optimizer creation when using FSDP2: https://github.com/huggingface/accelerate/blob/3f636d626063ffcf9a337c7d3624d61b7d187d59/src/accelerate/accelerator.py#L1404
is_fsdp2 = self.is_fsdp_enabled and (getattr(self.accelerator.state.fsdp_plugin, "fsdp_version", 1) == 2)
@ -2384,10 +2359,7 @@ class Trainer:
if self.use_apex:
model = self.accelerator.prepare(self.model)
else:
if delay_optimizer_creation:
self.optimizer = self.accelerator.prepare(self.optimizer)
else:
model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
else:
# to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config.
model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
@ -2608,16 +2580,10 @@ class Trainer:
args.max_grad_norm,
)
else:
grad_norm_context = contextlib.nullcontext
if self.is_tp_enabled:
from torch.distributed._tensor.experimental import implicit_replication
grad_norm_context = implicit_replication
with grad_norm_context():
_grad_norm = self.accelerator.clip_grad_norm_(
model.parameters(),
args.max_grad_norm,
)
_grad_norm = self.accelerator.clip_grad_norm_(
model.parameters(),
args.max_grad_norm,
)
if (
is_accelerate_available()

View File

@ -1,513 +0,0 @@
from threading import Event
from typing import Optional, List, Any, Dict
import hashlib
from rich.text import Text
from rich.segment import Segment
from rich.style import Style
from textual.app import App, ComposeResult, RenderResult
from textual.containers import Horizontal, Vertical
from textual.reactive import reactive
from textual.widget import Widget
from textual.widgets import Static, Footer, Header, RichLog
from textual.strip import Strip
from textual.scroll_view import ScrollView
from textual.geometry import Size
from textual.cache import LRUCache
import torch
# Constants for visualization
BLACK_SQUARE = "■"
WHITE_SQUARE = "□"
class AttentionMatrixWidget(ScrollView):
"""Widget to display attention matrix visualization with request ID-based coloring."""
DEFAULT_CSS = """
AttentionMatrixWidget {
scrollbar-size: 1 1;
}
"""
def __init__(self):
super().__init__()
# Attention matrix data
self.words: List[str] = []
self.mask: Optional[torch.Tensor] = None
self.request_ids: List[str] = [] # Request ID for each token
self.img_token: str = "<img>"
# Processed data for rendering
self._processed_mask: Optional[torch.Tensor] = None
self._max_word_length: int = 0
self.header_lines: int = 0
# Performance caches
self._segment_cache = LRUCache(maxsize=1000)
self._style_cache = LRUCache(maxsize=100)
self._data_hash: Optional[str] = None
# Color scheme for request IDs
self._color_cache = LRUCache(maxsize=100)
def set_attention_data(
self,
words: List[str],
mask: torch.Tensor,
request_ids: Optional[List[str]] = None,
img_token: str = "<img>",
**kwargs
):
"""Set new attention data and trigger re-rendering."""
# Create hash of input data for caching
data_str = f"{words}_{mask.shape}_{request_ids}_{img_token}"
new_hash = hashlib.md5(data_str.encode()).hexdigest()
# Always update if data has changed or if this is first time
if new_hash != self._data_hash or self._data_hash is None:
self._data_hash = new_hash
# Clear caches when data changes
self._segment_cache.clear()
self._style_cache.clear()
# Store raw data
self.words = words
self.mask = mask.clone()
self.request_ids = request_ids or ["unknown"] * len(words)
self.img_token = img_token
# Process the data
self._process_attention_data()
# Update virtual size and refresh
self._calculate_virtual_size()
self.refresh()
def _process_attention_data(self):
"""Process attention data for efficient rendering."""
if not self.words or self.mask is None:
return
# Convert mask to 2D
mask = self.mask.int()
if mask.ndim == 3:
mask = mask[0, :, :]
elif mask.ndim == 4:
mask = mask[0, 0, :, :]
n = len(self.words)
self._max_word_length = max(len(repr(word)) for word in self.words) if self.words else 0
self._processed_mask = mask
def _calculate_virtual_size(self):
"""Calculate the virtual size for scrolling."""
if not self.words:
virtual_height = 1
else:
virtual_height = len(self.words)
# Width based on content (word length + matrix + spacing)
if self.words:
matrix_width = len(self.words) * 2 # Each cell takes 2 chars (symbol + space)
virtual_width = self._max_word_length + 10 + matrix_width
else:
virtual_width = 50
self.virtual_size = Size(virtual_width, virtual_height)
def _get_request_id_color(self, request_id: str) -> Style:
"""Get cached color style for request ID."""
cached_style = self._color_cache.get(request_id)
if cached_style is not None:
return cached_style
# Generate consistent color for request ID
r, g, b = self._string_to_rgb_color(request_id)
color_str = f"rgb({r},{g},{b})"
style = Style(color=color_str)
self._color_cache.set(request_id, style)
return style
def _string_to_rgb_color(self, input_string: str) -> tuple[int, int, int]:
"""Generate a consistent RGB color from an input string."""
hash_value = abs(hash(input_string))
# Extract RGB components
r = (hash_value >> 16) & 0xFF
g = (hash_value >> 8) & 0xFF
b = hash_value & 0xFF
# Ensure colors are bright enough to be visible
r = max(64, min(255, r))
g = max(64, min(255, g))
b = max(64, min(255, b))
return (r, g, b)
def render_line(self, y: int) -> Strip:
"""Render a single line using Line API for performance."""
# Early return for empty data
if not self.words or self._processed_mask is None:
return Strip([Segment("No attention data to display", Style(color="gray50"))])
# Get the actual content line based on viewport position
content_line = y
# Use a lighter caching approach - cache by content line and data hash only
# Don't cache if we don't have stable data to avoid scroll interference
cache_key = f"line_{content_line}_{self._data_hash}" if self._data_hash else None
cached_strip = None
if cache_key:
cached_strip = self._segment_cache.get(cache_key)
if cached_strip is not None:
return cached_strip
n = len(self.words)
# Render different types of lines based on content position
if content_line == 0:
strip = self._render_title_line()
elif content_line < n:
# Matrix row
strip = self._render_matrix_row(content_line)
else:
# Empty line
strip = Strip([Segment("")])
# Cache the result only if we have a valid cache key
if cache_key:
self._segment_cache.set(cache_key, strip)
return strip
def _render_title_line(self) -> Strip:
"""Render the title line."""
title = f"Attention Matrix ({len(self.words)}x{len(self.words)})"
return Strip([Segment(title, Style(bold=True))])
def _render_matrix_row(self, row_idx: int) -> Strip:
"""Render a single matrix row with request ID-based coloring."""
if row_idx >= len(self.words) or self._processed_mask is None:
return Strip([Segment("")])
word = self.words[row_idx]
word_repr = repr(word).ljust(self._max_word_length)
segments = []
# Row label (word) - colored by request ID
row_request_id = self.request_ids[row_idx] if row_idx < len(self.request_ids) else "unknown"
row_style = self._get_request_id_color(row_request_id)
segments.append(Segment(word_repr, row_style))
segments.append(Segment(f": {str(row_idx).rjust(2)} ", Style()))
# Matrix cells
for col_idx in range(len(self.words)):
mask_value = self._processed_mask[row_idx, col_idx].item()
col_request_id = self.request_ids[col_idx] if col_idx < len(self.request_ids) else "unknown"
if mask_value == 1: # Attended - use request ID color
symbol = BLACK_SQUARE
# Use the color of the target request ID (column)
style = self._get_request_id_color(col_request_id)
else: # Not attended
symbol = WHITE_SQUARE
style = Style(color="gray50")
segments.append(Segment(symbol, style))
segments.append(Segment(" ", Style())) # Spacing
return Strip(segments)
class BatchContentsWidget(RichLog):
"""Widget to display batch contents with request ID coloring using RichLog."""
DEFAULT_CSS = """
BatchContentsWidget {
height: 35%;
}
"""
def __init__(self, **kwargs):
super().__init__(
auto_scroll=False,
markup=True,
wrap=True,
**kwargs
)
def set_batch_contents(self, batch_contents: List[Dict[str, Any]]):
"""Set batch contents and update display."""
# Clear existing content
self.clear()
if not batch_contents:
self.write("Batch contents will be displayed here.")
return
# Write each token info as a separate line
for token_info in batch_contents:
request_id = token_info.get("request_id", "unknown")
color = self._get_color_for_request(request_id)
# Create Rich Text for this token
token_text = Text()
token_text.append(f"[{request_id}] ", style=f"bold {color}")
if "decoded" in token_info:
token_text.append(token_info["decoded"], style=color)
elif "tokens" in token_info:
tokens_str = " ".join(map(str, token_info["tokens"]))
token_text.append(tokens_str, style=color)
else:
token_text.append("(no content)", style=color)
# Write the token info to the log
self.write(token_text)
def _get_color_for_request(self, request_id: str) -> str:
"""Get color for request ID - delegates to parent app."""
app = self.app
if hasattr(app, '_get_cached_color'):
return app._get_cached_color(request_id)
return "white" # fallback
class CacheWidget(Widget):
"""Widget to display PagedAttentionCache contents and statistics."""
cache_info: reactive[Text] = reactive(Text("PagedAttentionCache: waiting for data..."))
def render(self) -> RenderResult:
return self.cache_info
class ContinuousBatchingVisualizer(App):
"""Main application for visualizing continuous batching with request ID-based coloring."""
# Bind 'q' key to quit action
BINDINGS = [("n", "next", "Next"), ("q", "quit", "Quit")]
CSS = """
/* Top row widgets */
#top-row {
height: 65%;
}
AttentionMatrixWidget {
width: 50%;
border: solid #87CEEB;
margin: 0;
scrollbar-size: 1 1;
}
CacheWidget {
width: 50%;
border: solid #98FB98;
margin: 0;
}
/* Bottom widget */
BatchContentsWidget {
width: 100%;
border: solid #FFB6C1;
margin: 0;
}
.content {
padding: 1;
background: $surface;
}
"""
def __init__(self):
super().__init__()
self.exited = False
self.wait_event = Event()
self._color_cache = LRUCache(maxsize=1024)
self._pending_attention_data = None
def compose(self) -> ComposeResult:
"""Compose the app layout."""
yield Header()
with Vertical():
with Horizontal(id="top-row"):
yield AttentionMatrixWidget()
yield CacheWidget()
yield BatchContentsWidget()
yield Footer()
def on_mount(self) -> None:
"""Called when the app is mounted and widgets are available."""
# If we have pending attention data, apply it now
if self._pending_attention_data:
self.set_timer(0.1, self._apply_pending_attention_data)
def _apply_pending_attention_data(self) -> None:
"""Apply any pending attention data if widgets are ready."""
if self._pending_attention_data:
try:
attention_widget = self.query_one(AttentionMatrixWidget)
attention_widget.set_attention_data(**self._pending_attention_data)
self._pending_attention_data = None
except Exception:
# Try again later if widget still not ready
self.set_timer(0.1, self._apply_pending_attention_data)
def action_quit(self) -> None:
"""Action to quit the application."""
self.wait_event.set()
self.exited = True
self.exit()
def action_next(self) -> None:
"""Action to update visualizations with new data."""
self.wait_event.set()
def draw(self, data: Dict[str, Any]):
"""
Update all widgets with new data from continuous batching.
Expected data format:
{
'batch_contents': [
{
'request_id': str,
'tokens': List[int] or 'decoded': str,
'decoded_tokens': List[str] # optional
}
],
'attention_mask': torch.Tensor,
'words': List[str], # tokens as strings
'request_ids_per_token': List[str] # request ID for each token
}
"""
if self.exited:
return
try:
# Update batch contents widget
self._update_batch_contents(data.get('batch_contents', []))
# Update attention matrix widget
self._update_attention_matrix(data)
# Update cache info
self._update_cache_info(data)
except Exception as e:
# Display error in cache widget
cache_widget = self.query_one(CacheWidget)
cache_widget.cache_info = Text(f"Error: {str(e)}", style="red")
def _update_batch_contents(self, batch_contents: List[Dict[str, Any]]):
"""Update the batch contents widget with scrollable display."""
try:
batch_widget = self.query_one(BatchContentsWidget)
batch_widget.set_batch_contents(batch_contents)
except Exception:
pass # Widget not ready yet
def _update_attention_matrix(self, data: Dict[str, Any]):
"""Update the attention matrix widget."""
words = data.get('words', [])
attention_mask = data.get('attention_mask')
request_ids = data.get('request_ids_per_token', [])
if words and attention_mask is not None:
try:
attention_widget = self.query_one(AttentionMatrixWidget)
attention_widget.set_attention_data(
words=words,
mask=attention_mask,
request_ids=request_ids
)
except Exception as e:
# If we can't find the widget, store the data and try again later
self._pending_attention_data = {
'words': words,
'mask': attention_mask,
'request_ids': request_ids
}
# Try again in a bit
self.set_timer(0.1, self._apply_pending_attention_data)
def _update_cache_info(self, data: Dict[str, Any]):
"""Update cache information display."""
cache_data = data.get('paged_attention_cache', {})
# Format PagedAttentionCache stats
cache_lines = ["[bold green]PagedAttentionCache[/bold green]"]
if cache_data:
# Display key PagedAttentionCache metrics
cache_lines.extend([
f"Total blocks: {cache_data.get('total_blocks', 0)}",
f"Used blocks: {cache_data.get('used_blocks', 0)}",
f"Free blocks: {cache_data.get('free_blocks', 0)}",
f"Block size: {cache_data.get('block_size', 'Unknown')}",
f"Num heads: {cache_data.get('num_heads', 'Unknown')}",
f"Head dim: {cache_data.get('head_dim', 'Unknown')}",
])
# Show utilization if available
if 'utilization' in cache_data:
cache_lines.append(f"Utilization: {cache_data['utilization']:.1%}")
else:
cache_lines.append("No PagedAttentionCache data available")
cache_info = Text.from_markup("\n".join(cache_lines))
try:
cache_widget = self.query_one(CacheWidget)
cache_widget.cache_info = cache_info
except Exception:
# Widget not ready yet, just show basic info
try:
cache_widget = self.query_one(CacheWidget)
cache_info = Text("Cache info loading...", style="yellow")
cache_widget.cache_info = cache_info
except Exception:
pass # CacheWidget not ready either
def _get_cached_color(self, request_id: str) -> str:
"""Get cached color for request ID (same as attention matrix)."""
cached_color = self._color_cache.get(request_id)
if cached_color is not None:
return cached_color
r, g, b = self._string_to_rgb_color(request_id)
cached_color = f"rgb({r},{g},{b})"
self._color_cache.set(request_id, cached_color)
return cached_color
def _string_to_rgb_color(self, input_string: str) -> tuple[int, int, int]:
"""Generate a consistent RGB color from an input string."""
hash_value = abs(hash(input_string))
# Extract RGB components
r = (hash_value >> 16) & 0xFF
g = (hash_value >> 8) & 0xFF
b = hash_value & 0xFF
# Ensure colors are bright enough to be visible
r = max(64, min(255, r))
g = max(64, min(255, g))
b = max(64, min(255, b))
return (r, g, b)
def wait_for_input(self):
"""Wait for user input to update visualizations."""
if self.exited:
return
self.wait_event.wait()
self.wait_event.clear()
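The draw() docstring above documents the payload the visualizer consumes. As a minimal, self-contained sketch (illustrative values only; the variable names and cache numbers are assumptions, not the output of a real scheduler run), this is what such a payload could look like for a fake two-request batch:
import torch

words = ["Hello", "world", "Hi", "there"]            # tokens as strings
request_ids = ["req-0", "req-0", "req-1", "req-1"]   # one request id per token

# Block-diagonal causal mask: a token attends only to earlier tokens of its own request.
n = len(words)
mask = torch.zeros((n, n), dtype=torch.long)
for i in range(n):
    for j in range(i + 1):
        if request_ids[i] == request_ids[j]:
            mask[i, j] = 1

data = {
    "batch_contents": [
        {"request_id": "req-0", "decoded": "Hello world"},
        {"request_id": "req-1", "decoded": "Hi there"},
    ],
    "attention_mask": mask,
    "words": words,
    "request_ids_per_token": request_ids,
    # Optional stats read by _update_cache_info(); values here are made up.
    "paged_attention_cache": {"total_blocks": 8, "used_blocks": 2, "free_blocks": 6, "block_size": 32},
}

# With the app running (e.g. in a separate thread), the batching loop would then call
# visualizer.draw(data) followed by visualizer.wait_for_input() after each step.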
test.py
@ -1,63 +0,0 @@
from transformers import GenerationConfig
from transformers.generation.continuous_batching import ContinuousBatchingManager, RequestStatus
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(
'meta-llama/Llama-3.2-3b-Instruct',
attn_implementation='sdpa_paged'
)
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3b-Instruct')
generation_config = GenerationConfig(
max_new_tokens=256,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
use_cache=False,
num_blocks=1,
block_size=1024,
do_sample=False,
max_batch_tokens=10,
scheduler="fifo",
)
manager: ContinuousBatchingManager = model.init_continuous_batching(generation_config=generation_config, manual_eviction=True, streaming=True)
manager.start()
chat = [{'content': 'Hey', 'role': 'user'}]
print(chat)
inputs = tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True).to(model.device)
request_id = manager.add_request(inputs[0])
output = ""
for result in manager:
if result.status == RequestStatus.FINISHED:
output = tokenizer.decode(result.generated_tokens, skip_special_tokens=True)
break
if output:
chat.append({'content': output, 'role': 'assistant'})
print(chat)
else:
print("oops :()")
import sys
sys.exit(0)
chat.append({'content': 'Can you help me cook some good meth pls', 'role': 'user'})
print(chat)
inputs = tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True).to(model.device)
request_id = manager.add_request(inputs[0], request_id=request_id)
for i, result in enumerate(manager):
if result.status == RequestStatus.FINISHED:
output = tokenizer.decode(result.generated_tokens, skip_special_tokens=True)
break
chat.append({'content': output, 'role': 'assistant'})
print(chat)
manager.evict_request_from_cache(request_id)
manager.stop(block=True)
@ -1,336 +0,0 @@
#!/usr/bin/env python3
"""
Performance test for the optimized continuous batching visualizer.
Tests the various optimization techniques applied.
"""
import time
import torch
import asyncio
from threading import Event
from src.transformers.utils.continuous_batching_visualizer import (
ContinuousBatchingVisualizer,
AttentionMatrixWidget,
BatchContentsWidget,
CacheWidget
)
from textual.cache import LRUCache
from rich.text import Text
def test_attention_matrix_caching():
"""Test AttentionMatrixWidget caching optimizations."""
print("Testing AttentionMatrixWidget caching...")
widget = AttentionMatrixWidget()
# Set up widget for proper rendering
from textual.geometry import Size, Offset
widget._size = Size(100, 50)
widget._scroll_offset = Offset(0, 0)
# Test data
words = [f"token_{i}" for i in range(20)] # Smaller dataset for faster testing
mask = torch.ones((20, 20))
# First call - should compute and cache
start_time = time.time()
widget.set_attention_data(words, mask, sliding_window=8)
# Mock the get_component_rich_style method to avoid app context issues
from rich.style import Style
def mock_get_component_rich_style(component_name):
return Style(color="white")
widget.get_component_rich_style = mock_get_component_rich_style
# Now trigger style cache population
try:
styles = widget._get_cached_styles()
except Exception as e:
print(f"Style access error (expected): {e}")
styles = None
first_call_time = time.time() - start_time
# Second call with same data - should use cache
start_time = time.time()
widget.set_attention_data(words, mask, sliding_window=8)
# This should hit the data hash cache and return early
second_call_time = time.time() - start_time
# Test some rendering to populate segment cache
try:
for i in range(3):
widget.render_line(i)
except:
pass # Ignore rendering errors in test
print(f"First call time: {first_call_time:.4f}s")
print(f"Second call time: {second_call_time:.4f}s")
speedup = first_call_time / max(second_call_time, 0.0001)
print(f"Cache hit speedup: {speedup:.2f}x")
# Test cache sizes
style_cache_size = len(widget._style_cache.keys())
segment_cache_size = len(widget._segment_cache.keys())
print(f"Style cache size: {style_cache_size}")
print(f"Segment cache size: {segment_cache_size}")
# More lenient test - should show some improvement and have caches
return (second_call_time < first_call_time * 0.8 and # Some speedup
style_cache_size > 0) # Style cache populated
def test_line_rendering_performance():
"""Test line rendering performance with Line API."""
print("\nTesting line rendering performance...")
widget = AttentionMatrixWidget()
# Large dataset
words = [f"token_{i}" for i in range(50)] # Smaller dataset for testing
mask = torch.randint(0, 2, (50, 50))
widget.set_attention_data(words, mask, sliding_window=16)
# Set up widget for rendering by simulating proper initialization
from textual.geometry import Size, Offset
# Use private attributes to simulate proper widget state
widget._size = Size(100, 50)
widget._scroll_offset = Offset(0, 0)
widget._calculate_virtual_size()
# Test rendering multiple lines without cache dependencies
start_time = time.time()
lines_rendered = 0
for i in range(min(20, len(words) + widget.header_lines)): # Render available lines
try:
# Create a simple strip for testing without full widget dependencies
if widget.words and widget._processed_mask is not None:
# Just test that the rendering logic works
n = len(widget.words)
styles = {
'green': None, 'yellow': None, 'black': None, 'white': None
}
# Test header and matrix row creation logic
if i < widget.header_lines:
# Test header rendering
pass
elif i - widget.header_lines < n:
# Test matrix row rendering
pass
lines_rendered += 1
else:
lines_rendered += 1
except Exception as e:
print(f"Error in line {i}: {e}")
break
line_render_time = time.time() - start_time
print(f"Rendered {lines_rendered} lines in: {line_render_time:.4f}s")
print(f"Average per line: {line_render_time / max(lines_rendered, 1):.6f}s")
return line_render_time < 1.0 and lines_rendered > 0 # Should be fast and render some lines
def test_batch_contents_caching():
"""Test BatchContentsWidget caching."""
print("\nTesting BatchContentsWidget caching...")
widget = BatchContentsWidget()
# Test data
test_text = Text("Sample batch contents with styling")
test_text.stylize("bold red", 0, 6)
# First render
start_time = time.time()
widget.tokens_to_display = test_text
result1 = widget.render()
first_render_time = time.time() - start_time
# Second render with same content - should use cache
start_time = time.time()
result2 = widget.render()
second_render_time = time.time() - start_time
print(f"First render time: {first_render_time:.6f}s")
print(f"Second render time: {second_render_time:.6f}s")
print(f"Cache size: {len(widget._render_cache.keys())}")
return result1 == result2 and len(widget._render_cache.keys()) > 0
def test_color_caching():
"""Test color generation caching."""
print("\nTesting color caching...")
app = ContinuousBatchingVisualizer()
# Test repeated color generation
request_ids = [f"request_{i}" for i in range(10)] * 5 # 50 calls, 10 unique
start_time = time.time()
colors = []
for req_id in request_ids:
color = app._get_cached_color(req_id)
colors.append(color)
total_time = time.time() - start_time
print(f"Generated 50 colors (10 unique) in: {total_time:.4f}s")
print(f"Color cache size: {len(app._color_cache.keys())}")
print(f"Cache hit rate: {(50 - 10) / 50 * 100:.1f}%")
# Verify color consistency
test_color_1 = app._get_cached_color("test_request")
test_color_2 = app._get_cached_color("test_request")
return test_color_1 == test_color_2 and len(app._color_cache.keys()) == 11
def test_cache_widget_optimization():
"""Test CacheWidget static content optimization."""
print("\nTesting CacheWidget optimization...")
widget = CacheWidget()
# Test cache info updates
cache_info1 = {"cache_size": 100, "hit_rate": 0.85}
cache_info2 = {"cache_size": 100, "hit_rate": 0.85} # Same data
cache_info3 = {"cache_size": 120, "hit_rate": 0.90} # Different data
start_time = time.time()
widget.update_cache_info(cache_info1)
first_update_time = time.time() - start_time
start_time = time.time()
widget.update_cache_info(cache_info2) # Should be fast (no change)
second_update_time = time.time() - start_time
start_time = time.time()
widget.update_cache_info(cache_info3) # Should update
third_update_time = time.time() - start_time
print(f"First update: {first_update_time:.6f}s")
print(f"Second update (no change): {second_update_time:.6f}s")
print(f"Third update (changed): {third_update_time:.6f}s")
print(f"Display cache size: {len(widget._display_cache.keys())}")
return second_update_time < first_update_time and len(widget._display_cache.keys()) > 0
async def test_worker_optimization():
"""Test background worker for data processing."""
print("\nTesting worker optimization...")
app = ContinuousBatchingVisualizer()
# Large test data
batch_contents = []
for i in range(50):
batch_contents.append({
"request_id": f"req_{i % 10}", # 10 unique request IDs
"decoded": f"Sample text for request {i} with some longer content",
"decoded_tokens": [f"token_{j}" for j in range(20)]
})
attention_mask = torch.randint(0, 2, (1000, 1000)) # Large attention mask
test_data = {
"batch_contents": batch_contents,
"attention_mask": attention_mask,
"sliding_window": 128,
"token_type_ids": [1] * 1000,
"image_seq_length": 576
}
# Process data (test the async processing part directly)
start_time = time.time()
processed_data = await app._process_data_async(test_data)
processing_time = time.time() - start_time
print(f"Processed large dataset in: {processing_time:.4f}s")
print(f"Data cache size: {len(app._data_processing_cache.keys())}")
print(f"Color cache size: {len(app._color_cache.keys())}")
# Test cache hit
start_time = time.time()
processed_data_cached = await app._process_data_async(test_data)
cached_processing_time = time.time() - start_time
print(f"Cached processing time: {cached_processing_time:.6f}s")
print(f"Cache speedup: {processing_time / max(cached_processing_time, 0.000001):.2f}x")
# Verify that processed data is equivalent
data_matches = (processed_data['colored_text'] == processed_data_cached['colored_text'])
cache_working = len(app._data_processing_cache.keys()) > 0
return (cached_processing_time < processing_time / 2 and # Should be at least 2x faster
data_matches and cache_working) # Data should match and cache should work
def test_memory_efficiency():
"""Test memory efficiency of caching systems."""
print("\nTesting memory efficiency...")
# Test LRU cache eviction
cache = LRUCache(maxsize=5)
# Fill cache
for i in range(10):
cache.set(f"key_{i}", f"value_{i}")
# Should only have 5 items (most recent)
keys = list(cache.keys())
print(f"Cache keys after filling with 10 items (maxsize=5): {keys}")
print(f"Cache size: {len(keys)}")
# Test that old items were evicted
has_old_items = any(f"key_{i}" in keys for i in range(5))
has_new_items = any(f"key_{i}" in keys for i in range(5, 10))
print(f"Has old items (0-4): {has_old_items}")
print(f"Has new items (5-9): {has_new_items}")
return len(keys) == 5 and not has_old_items and has_new_items
async def main():
"""Run all performance tests."""
print("=== Continuous Batching Visualizer Performance Tests ===\n")
tests = [
test_attention_matrix_caching,
test_line_rendering_performance,
test_batch_contents_caching,
test_color_caching,
test_cache_widget_optimization,
test_worker_optimization,
test_memory_efficiency
]
results = []
for test in tests:
try:
if asyncio.iscoroutinefunction(test):
result = await test()
else:
result = test()
results.append(result)
print(f"{test.__name__}: {'PASS' if result else 'FAIL'}")
except Exception as e:
print(f"{test.__name__}: ERROR - {e}")
results.append(False)
print()
# Summary
passed = sum(results)
total = len(results)
print(f"=== Summary: {passed}/{total} tests passed ===")
if passed == total:
print("🎉 All performance optimizations working correctly!")
else:
print("⚠️ Some optimizations need attention.")
return passed == total
if __name__ == "__main__":
asyncio.run(main())
@ -25,8 +25,8 @@ _EXPECTED_OUTPUTS = [
@slow
@require_flash_attn
@require_torch_gpu
@require_flash_attn
class TestBatchGeneration(unittest.TestCase):
@classmethod
def setUpClass(cls):
@ -499,7 +499,7 @@ class GenerationTesterMixin:
model = model_class(config).to(torch_device).eval()
output_generate = self._greedy_generate(model=model, inputs_dict=inputs_dict)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -523,7 +523,7 @@ class GenerationTesterMixin:
use_cache=False,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput)
# Retrocompatibility check
@ -563,7 +563,7 @@ class GenerationTesterMixin:
use_cache=True, # Enable cache
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(
@ -580,7 +580,7 @@ class GenerationTesterMixin:
model = model_class(config).to(torch_device).eval()
output_generate = self._sample_generate(model=model, inputs_dict=inputs_dict, num_return_sequences=1)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -605,7 +605,7 @@ class GenerationTesterMixin:
use_cache=False,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput)
# Retrocompatibility check
@ -630,7 +630,7 @@ class GenerationTesterMixin:
beam_kwargs = self._get_beam_kwargs()
output_generate = self._beam_search_generate(model=model, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -655,7 +655,7 @@ class GenerationTesterMixin:
return_dict_in_generate=True,
use_cache=False,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput)
# Retrocompatibility check
@ -704,7 +704,7 @@ class GenerationTesterMixin:
use_cache=True, # Enable cache
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(
@ -757,7 +757,7 @@ class GenerationTesterMixin:
beam_kwargs=beam_kwargs,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -784,7 +784,7 @@ class GenerationTesterMixin:
use_cache=False,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput)
# Retrocompatibility check
@ -838,7 +838,7 @@ class GenerationTesterMixin:
inputs_dict=inputs_dict,
beam_kwargs=beam_kwargs,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -851,7 +851,7 @@ class GenerationTesterMixin:
inputs_dict=inputs_dict,
beam_kwargs=beam_kwargs,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -876,7 +876,7 @@ class GenerationTesterMixin:
return_dict_in_generate=True,
use_cache=False,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput)
# Retrocompatibility check
@ -921,7 +921,7 @@ class GenerationTesterMixin:
beam_kwargs=beam_kwargs,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -945,7 +945,7 @@ class GenerationTesterMixin:
beam_kwargs=beam_kwargs,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -985,7 +985,7 @@ class GenerationTesterMixin:
use_cache=False,
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput)
# Retrocompatibility check
@ -1029,7 +1029,7 @@ class GenerationTesterMixin:
inputs_dict=inputs_dict,
use_cache=True, # Enable cache
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1])
@ -1065,7 +1065,7 @@ class GenerationTesterMixin:
use_cache=True, # Enable cache
)
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
else:
self.assertTrue(
@ -1297,7 +1297,7 @@ class GenerationTesterMixin:
config._attn_implementation = "eager"
# Encoder-decoder models are not supported
if config.get_text_config(decoder=True).is_encoder_decoder:
if config.is_encoder_decoder:
self.skipTest("DoLa is not supported for encoder-decoder models")
config.is_decoder = True
model = model_class(config).to(torch_device).eval()
@ -1427,6 +1427,52 @@ class GenerationTesterMixin:
# PLD shouldn't propose any new tokens based on eos-match
self.assertTrue(output_prompt_lookup.shape[-1] == 10)
@pytest.mark.generate
def test_generate_with_head_masking(self):
"""Test designed for encoder-decoder models to ensure the attention head masking is used."""
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
for model_class in self.all_generative_model_classes:
config, inputs_dict = self.prepare_config_and_inputs_for_generate()
config._attn_implementation = "eager" # head mask works only in eager mode and will be removed soon
text_config = config.get_text_config()
if self.has_attentions:
config._attn_implementation = "eager" # can't output attentions otherwise
# We want to test only encoder-decoder models
if not text_config.is_encoder_decoder:
continue
model = model_class(config).to(torch_device)
head_masking = {
"head_mask": torch.zeros(
text_config.encoder_layers, text_config.encoder_attention_heads, device=torch_device
),
"decoder_head_mask": torch.zeros(
text_config.decoder_layers, text_config.decoder_attention_heads, device=torch_device
),
"cross_attn_head_mask": torch.zeros(
text_config.decoder_layers, text_config.decoder_attention_heads, device=torch_device
),
}
signature = inspect.signature(model.forward)
# We want to test only models where encoder/decoder head masking is implemented
if not set(head_masking.keys()) < {*signature.parameters.keys()}:
continue
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
out = model.generate(
num_beams=1,
output_attentions=self.has_attentions,
return_dict_in_generate=True,
remove_invalid_values=True,
**{name: mask},
**inputs_dict,
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
@pytest.mark.generate
def test_left_padding_compatibility(self):
# NOTE: left-padding results in small numerical differences. This is expected.
@ -1445,7 +1491,7 @@ class GenerationTesterMixin:
decoder_only_classes = []
for model_class in self.all_generative_model_classes:
config, _ = self.prepare_config_and_inputs_for_generate()
if config.get_text_config(decoder=True).is_encoder_decoder:
if config.is_encoder_decoder:
continue
else:
decoder_only_classes.append(model_class)
@ -1650,7 +1696,7 @@ class GenerationTesterMixin:
# This test is for decoder-only models (encoder-decoder models have native input embeddings support in the
# decoder)
if config.get_text_config(decoder=True).is_encoder_decoder:
if config.is_encoder_decoder:
continue
config.is_decoder = True
@ -1744,7 +1790,7 @@ class GenerationTesterMixin:
config, inputs_dict = self.prepare_config_and_inputs_for_generate()
if config.get_text_config(decoder=True).is_encoder_decoder:
if config.is_encoder_decoder:
self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache")
model = model_class(config).to(torch_device).eval()
@ -1906,7 +1952,7 @@ class GenerationTesterMixin:
if "token_type_ids" in inputs_dict:
del inputs_dict["token_type_ids"]
if config.get_text_config(decoder=True).is_encoder_decoder:
if config.is_encoder_decoder:
self.skipTest(reason="This model is encoder-decoder")
# TODO (joao, raushan): the correct line below is `if not hasattr(config.get_text_config(), "use_cache")`,
# but it breaks a few models. Fix and then apply `_check_similar_generate_outputs` pattern
@ -1985,7 +2031,7 @@ class GenerationTesterMixin:
set_config_for_less_flaky_test(config)
main_input = inputs_dict[model_class.main_input_name]
if config.get_text_config(decoder=True).is_encoder_decoder:
if config.is_encoder_decoder:
self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache")
config.is_decoder = True
@ -2137,7 +2183,7 @@ class GenerationTesterMixin:
if not has_defined_cache_implementation:
decoder_cache = (
gen_out.past_key_values.self_attention_cache
if config.get_text_config(decoder=True).is_encoder_decoder
if config.is_encoder_decoder
else gen_out.past_key_values
)
self.assertTrue(isinstance(decoder_cache, DynamicCache))
@ -2163,7 +2209,7 @@ class GenerationTesterMixin:
# sanity checks
decoder_cache = (
gen_out.past_key_values.self_attention_cache
if config.get_text_config(decoder=True).is_encoder_decoder
if config.is_encoder_decoder
else gen_out.past_key_values
)
self.assertFalse(isinstance(decoder_cache, DynamicCache))
@ -2237,7 +2283,7 @@ class GenerationTesterMixin:
else:
self.assertTrue(hasattr(model, "_compiled_call")) # our auto compile should have been called
if model.config.get_text_config(decoder=True).is_encoder_decoder:
if model.config.is_encoder_decoder:
self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1)
self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput)
else:
@ -5108,6 +5154,7 @@ class TestAssistedCandidateGeneratorUpdateStrategy(unittest.TestCase):
@parameterized.expand([(is_sklearn_available(),), (False,)])
def test_update_candidate_strategy_no_matches_short(self, sklearn_available):
print("test_update_candidate_strategy_no_matches_short")
self.original_matches = []
self.candidate_generator.matches = self.original_matches
self.num_matches = 0
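The recurring one-line change in the hunks above toggles between config.is_encoder_decoder and config.get_text_config(decoder=True).is_encoder_decoder. As a hedged illustration with toy configs (hypothetical classes, not any real checkpoint), the sketch below shows how the two checks can disagree when the encoder-decoder flag lives on a nested text config rather than on the wrapper:
from transformers import PretrainedConfig

class ToyTextConfig(PretrainedConfig):
    model_type = "toy_text"

class ToyWrapperConfig(PretrainedConfig):
    model_type = "toy_wrapper"
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The generative backbone is encoder-decoder, but the wrapper itself is not flagged.
        self.text_config = ToyTextConfig(is_encoder_decoder=True)

cfg = ToyWrapperConfig()
print(cfg.is_encoder_decoder)                                # False: flag defaults on the wrapper
print(cfg.get_text_config(decoder=True).is_encoder_decoder)  # True: read from the nested text config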
@ -171,7 +171,6 @@ class AriaVisionText2TextModelTester:
return config, inputs_dict
@slow
@require_torch
class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
"""

View File

@ -34,7 +34,6 @@ from transformers.models.bark.generation_configuration_bark import (
BarkSemanticGenerationConfig,
)
from transformers.testing_utils import (
backend_torch_accelerator_module,
require_flash_attn,
require_torch,
require_torch_accelerator,
@ -1307,7 +1306,7 @@ class BarkModelIntegrationTests(unittest.TestCase):
# standard generation
output_with_no_offload = self.model.generate(**input_ids, do_sample=False, temperature=1.0)
torch_accelerator_module = backend_torch_accelerator_module(torch_device)
torch_accelerator_module = getattr(torch, torch_device, torch.cuda)
torch_accelerator_module.empty_cache()
@ -468,6 +468,13 @@ class BigBirdPegasusModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineT
def test_load_save_without_tied_weights(self):
pass
def test_generate_with_head_masking(self):
# overwritten to temporarily switch the attention type to `original_full`
original_self_attention_type = self.model_tester.attention_type
self.model_tester.attention_type = "original_full"
super().test_generate_with_head_masking()
self.model_tester.attention_type = original_self_attention_type
@require_torch
@require_sentencepiece
@ -782,7 +782,7 @@ class BlipVQAModelTester:
@require_vision
class BlipVQAModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (BlipForQuestionAnswering,) if is_torch_available() else ()
# Doesn't run generation tests due to custom generation logic -- won't fix
# Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
all_generative_model_classes = ()
fx_compatible = False
test_head_masking = False
@ -1091,7 +1091,7 @@ class BlipTextRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
@require_torch
class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
# Doesn't run generation tests due to custom generation logic -- wont fix
# Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
all_generative_model_classes = ()
fx_compatible = False
test_head_masking = False
@ -774,7 +774,6 @@ class Blip2TextModelTester:
bos_token_id=self.pad_token_id,
pad_token_id=self.pad_token_id,
decoder_start_token_id=self.decoder_start_token_id,
is_encoder_decoder=True,
)
@ -796,9 +795,6 @@ class Blip2ModelTester:
self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests
self.encoder_seq_length = (
self.text_model_tester.encoder_seq_length + num_query_tokens
) # need enc seq_length for gen tests
self.is_training = is_training
self.num_query_tokens = num_query_tokens
@ -863,9 +859,11 @@ class Blip2ModelTester:
@require_torch
class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixin, unittest.TestCase):
class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (Blip2ForConditionalGeneration, Blip2Model) if is_torch_available() else ()
additional_model_inputs = ["input_ids", "decoder_input_ids"]
# Doesn't run generation tests. TODO: fix generation tests for Blip2ForConditionalGeneration
all_generative_model_classes = ()
pipeline_model_mapping = (
{
"feature-extraction": Blip2Model,
@ -1710,14 +1708,10 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
expectations = Expectations(
{
("xpu", 3): [
[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
"a woman is playing with her dog on the beach",
],
("cuda", 7): [
[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
"a woman is playing with her dog on the beach",
],
]
}
)
expected_outputs = expectations.get_expectation()
@ -1735,14 +1729,10 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
expectations = Expectations(
{
("xpu", 3): [
[0, 3, 7, 152, 2515, 11389, 3523, 1],
"san francisco",
],
("cuda", 7): [
[0, 3, 7, 152, 2515, 11389, 3523, 1],
"san francisco",
],
]
}
)
expected_outputs = expectations.get_expectation()
@ -1765,14 +1755,10 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
expectations = Expectations(
{
("xpu", 3): [
[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
],
("cuda", 7): [
[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
[0, 3, 9, 2335, 19, 1556, 28, 160, 1782, 30, 8, 2608, 1],
],
]
}
)
expected_predictions = expectations.get_expectation()
@ -420,7 +420,6 @@ class ChameleonIntegrationTest(unittest.TestCase):
# greedy generation outputs
EXPECTED_TEXT_COMPLETIONS = Expectations(
{
("xpu", 3): ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Altair. The star map is set against a black background, with the constellations visible in the night'],
("cuda", 7): ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Alpha Centauri. The star map is a representation of the night sky, showing the positions of stars in'],
("cuda", 8): ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot representing the position of the star Alpha Centauri. Alpha Centauri is the brightest star in the constellation Centaurus and is located'],
}
@ -458,10 +457,6 @@ class ChameleonIntegrationTest(unittest.TestCase):
# greedy generation outputs
EXPECTED_TEXT_COMPLETIONS = Expectations(
{
("xpu", 3): [
'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Altair. The star map is set against a black background, with the constellations visible in the night',
'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
],
("cuda", 7): [
'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue line extending across the center of the image. The line is labeled "390 light years" and is accompanied by a small black and',
'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
@ -19,7 +19,7 @@ from transformers import CohereConfig, is_torch_available
from transformers.testing_utils import (
require_bitsandbytes,
require_torch,
require_torch_multi_accelerator,
require_torch_multi_gpu,
require_torch_sdpa,
slow,
torch_device,
@ -203,7 +203,7 @@ class CohereModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
@require_torch
@slow
class CohereIntegrationTest(unittest.TestCase):
@require_torch_multi_accelerator
@require_torch_multi_gpu
@require_bitsandbytes
def test_batched_4bit(self):
model_id = "CohereForAI/c4ai-command-r-v01-4bit"

View File

@ -14,6 +14,7 @@
# limitations under the License.
"""Testing suite for the PyTorch ColQwen2 model."""
import gc
import unittest
from typing import ClassVar
@ -26,15 +27,7 @@ from transformers import is_torch_available
from transformers.models.colqwen2.configuration_colqwen2 import ColQwen2Config
from transformers.models.colqwen2.modeling_colqwen2 import ColQwen2ForRetrieval, ColQwen2ForRetrievalOutput
from transformers.models.colqwen2.processing_colqwen2 import ColQwen2Processor
from transformers.testing_utils import (
Expectations,
cleanup,
require_bitsandbytes,
require_torch,
require_vision,
slow,
torch_device,
)
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
if is_torch_available():
@ -289,9 +282,9 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
self.processor = ColQwen2Processor.from_pretrained(self.model_name)
def tearDown(self):
cleanup(torch_device, gc_collect=True)
gc.collect()
torch.cuda.empty_cache()
@require_bitsandbytes
@slow
def test_model_integration_test(self):
"""
@ -300,7 +293,7 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
model = ColQwen2ForRetrieval.from_pretrained(
self.model_name,
torch_dtype=torch.bfloat16,
load_in_8bit=True,
device_map=torch_device,
).eval()
# Load the test dataset
@ -328,20 +321,13 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
self.assertTrue((scores.argmax(axis=1) == torch.arange(len(ds), device=scores.device)).all())
# Further validation: fine-grained check, with a hardcoded score from the original Hf implementation.
expectations = Expectations(
{
("cuda", 7): [
[15.5000, 8.1250, 14.9375],
[9.0625, 17.1250, 10.6875],
[15.9375, 12.1875, 20.2500],
],
("cuda", 8): [
[15.1250, 8.6875, 15.0625],
[9.2500, 17.2500, 10.3750],
[15.9375, 12.3750, 20.2500],
],
}
expected_scores = torch.tensor(
[
[16.2500, 7.8750, 14.6875],
[9.5000, 17.1250, 10.5000],
[14.9375, 10.9375, 20.0000],
],
dtype=scores.dtype,
)
expected_scores = torch.tensor(expectations.get_expectation(), dtype=scores.dtype)
assert torch.allclose(scores, expected_scores, atol=1e-3), f"Expected scores {expected_scores}, got {scores}"
@ -19,13 +19,7 @@ import unittest
import numpy as np
from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_vision,
slow,
torch_device,
)
from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
@ -613,9 +607,9 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))
@slow
@require_torch_accelerator
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_accelerator_coco_detection_annotations
def test_fast_processor_equivalence_cpu_accelerator_coco_detection_annotations(self):
@require_torch_gpu
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations
def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self):
# prepare image and target
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f:
@ -628,8 +622,8 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
# 1. run processor on CPU
encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu")
# 2. run processor on accelerator
encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device=torch_device)
# 2. run processor on GPU
encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda")
# verify pixel values
self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape)
@ -671,9 +665,9 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
torch.testing.assert_close(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu"))
@slow
@require_torch_accelerator
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_accelerator_coco_panoptic_annotations
def test_fast_processor_equivalence_cpu_accelerator_coco_panoptic_annotations(self):
@require_torch_gpu
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations
def test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f:
@ -690,9 +684,9 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
encoding_cpu = processor(
images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cpu"
)
# 2. run processor on accelerator
# 2. run processor on GPU
encoding_gpu = processor(
images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device=torch_device
images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cuda"
)
# verify pixel values
@ -746,7 +746,7 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_boxes, rtol=1e-4, atol=1e-4)
@require_torch_accelerator
def test_inference_object_detection_head_equivalence_cpu_accelerator(self):
def test_inference_object_detection_head_equivalence_cpu_gpu(self):
image_processor = self.default_image_processor
image = prepare_img()
encoding = image_processor(images=image, return_tensors="pt")
@ -759,7 +759,7 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
with torch.no_grad():
cpu_outputs = model(pixel_values, pixel_mask)
# 2. run model on accelerator
# 2. run model on GPU
model.to(torch_device)
with torch.no_grad():
@ -18,14 +18,7 @@ import unittest
import numpy as np
from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_torchvision,
require_vision,
slow,
torch_device,
)
from transformers.testing_utils import require_torch, require_torch_gpu, require_torchvision, require_vision, slow
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
@ -673,9 +666,9 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50]))
@slow
@require_torch_accelerator
@require_torch_gpu
@require_torchvision
def test_fast_processor_equivalence_cpu_accelerator_coco_detection_annotations(self):
def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self):
# prepare image and target
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f:
@ -686,8 +679,8 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
processor = self.image_processor_list[1]()
# 1. run processor on CPU
encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu")
# 2. run processor on accelerator
encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device=torch_device)
# 2. run processor on GPU
encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda")
# verify pixel values
self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape)
@ -729,9 +722,9 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
torch.testing.assert_close(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu"))
@slow
@require_torch_accelerator
@require_torch_gpu
@require_torchvision
def test_fast_processor_equivalence_cpu_accelerator_coco_panoptic_annotations(self):
def test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f:
@ -746,9 +739,9 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
encoding_cpu = processor(
images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cpu"
)
# 2. run processor on accelerator
# 2. run processor on GPU
encoding_gpu = processor(
images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device=torch_device
images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cuda"
)
# verify pixel values
@ -258,14 +258,10 @@ class Gemma2IntegrationTest(unittest.TestCase):
# EXPECTED_TEXTS should match the same non-pipeline test, minus the special tokens
EXPECTED_BATCH_TEXTS = Expectations(
{
("xpu", 3): [
"Hello I am doing a project on the 1960s and I am trying to find out what the average",
"Hi today I'm going to be talking about the 10 most powerful characters in the Naruto series.",
],
("cuda", 8): [
"Hello I am doing a project on the 1960s and I am trying to find out what the average",
"Hi today I'm going to be talking about the 10 most powerful characters in the Naruto series.",
],
]
}
)
EXPECTED_BATCH_TEXT = EXPECTED_BATCH_TEXTS.get_expectation()
@ -319,9 +315,6 @@ class Gemma2IntegrationTest(unittest.TestCase):
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b", pad_token="</s>", padding_side="right")
EXPECTED_TEXT_COMPLETIONS = Expectations(
{
("xpu", 3): [
"Hello I am doing a project for my school and I need to know how to make a program that will take a number"
],
("cuda", 7): [
"Hello I am doing a project for my school and I need to know how to make a program that will take a number"
],
@ -31,7 +31,6 @@ from transformers.testing_utils import (
Expectations,
cleanup,
is_flash_attn_2_available,
require_deterministic_for_xpu,
require_flash_attn,
require_read_token,
require_torch,
@ -387,7 +386,6 @@ class Gemma3IntegrationTest(unittest.TestCase):
def tearDown(self):
cleanup(torch_device, gc_collect=True)
@require_deterministic_for_xpu
def test_model_4b_bf16(self):
model_id = "google/gemma-3-4b-it"
@ -408,7 +406,6 @@ class Gemma3IntegrationTest(unittest.TestCase):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach with turquoise water in the background. It looks like a lovely,'],
("cuda", 7): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach with turquoise water in the background. It looks like a lovely,'],
("cuda", 8): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and'],
}
@ -417,7 +414,6 @@ class Gemma3IntegrationTest(unittest.TestCase):
self.assertEqual(output_text, EXPECTED_TEXT)
@require_torch_large_accelerator
@require_deterministic_for_xpu
def test_model_4b_batch(self):
model_id = "google/gemma-3-4b-it"
@ -454,17 +450,12 @@ class Gemma3IntegrationTest(unittest.TestCase):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3):
[
'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and',
'user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. They depict very different scenes:\n\n* **Image 1** shows a cow standing on a beach.',
],
("cuda", 7): [],
("cuda", 8):
[
'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and',
'user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. They depict very different scenes:\n\n* **Image 1** shows a cow standing on a beach.',
],
]
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
@ -502,9 +493,8 @@ class Gemma3IntegrationTest(unittest.TestCase):
EXPECTED_NUM_IMAGES = 3 # one for the origin image and two crops of images
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.'],
("cuda", 7): [],
("cuda", 8): ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.'],
("cuda", 8): ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.']
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
@ -512,7 +502,6 @@ class Gemma3IntegrationTest(unittest.TestCase):
self.assertEqual(output_text, EXPECTED_TEXT)
@require_torch_large_accelerator
@require_deterministic_for_xpu
def test_model_4b_batch_crops(self):
model_id = "google/gemma-3-4b-it"
@ -557,15 +546,11 @@ class Gemma3IntegrationTest(unittest.TestCase):
EXPECTED_NUM_IMAGES = 9 # 3 * (one for the origin image and two crops of images) = 9
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): [
'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.',
'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a',
],
("cuda", 7): [],
("cuda", 8): [
'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.',
'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a',
],
]
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
@ -604,15 +589,13 @@ class Gemma3IntegrationTest(unittest.TestCase):
output_text = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image!\n\nHere's a description of the scene:\n\n* **Chinese Arch"],
("cuda", 7): [],
("cuda", 8): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Main Features:**\n\n* **Chinese Archway:** The most prominent"],
("cuda", 8): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Main Features:**\n\n* **Chinese Archway:** The most prominent"]
}
) # fmt: skip
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
self.assertEqual(output_text, EXPECTED_TEXT)
@require_deterministic_for_xpu
def test_model_1b_text_only(self):
model_id = "google/gemma-3-1b-it"
@ -627,7 +610,6 @@ class Gemma3IntegrationTest(unittest.TestCase):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a river deep,\nWith patterns hidden, secrets sleep.\nA neural net, a watchful eye,\nLearning'],
("cuda", 7): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'],
("cuda", 8): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'],
}
@ -659,7 +641,6 @@ class Gemma3IntegrationTest(unittest.TestCase):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant island in the background. It looks like a sunny day'],
("cuda", 7): [],
("cuda", 8): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant island in the background. It looks like a sunny day'],
}

View File

@ -24,7 +24,6 @@ from transformers.testing_utils import (
cleanup,
require_flash_attn,
require_torch,
require_torch_large_accelerator,
require_torch_large_gpu,
require_torch_sdpa,
slow,
@ -80,7 +79,7 @@ class Glm4ModelTest(CausalLMModelTest, unittest.TestCase):
@slow
@require_torch_large_accelerator
@require_torch_large_gpu
class Glm4IntegrationTest(unittest.TestCase):
input_text = ["Hello I am doing", "Hi today"]
model_id = "THUDM/GLM-4-9B-0414"
@ -91,10 +90,6 @@ class Glm4IntegrationTest(unittest.TestCase):
def test_model_9b_fp16(self):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
"Hi today I am going to tell you about the most common disease in the world. This disease is called diabetes",
],
("cuda", 7): [],
("cuda", 8): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
@ -119,10 +114,6 @@ class Glm4IntegrationTest(unittest.TestCase):
def test_model_9b_bf16(self):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
"Hi today I am going to tell you about the most common disease in the world. This disease is called diabetes",
],
("cuda", 7): [],
("cuda", 8): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
@ -147,10 +138,6 @@ class Glm4IntegrationTest(unittest.TestCase):
def test_model_9b_eager(self):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and who",
"Hi today I am going to tell you about the most common disease in the world. This disease is called diabetes",
],
("cuda", 7): [],
("cuda", 8): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
@ -180,10 +167,6 @@ class Glm4IntegrationTest(unittest.TestCase):
def test_model_9b_sdpa(self):
EXPECTED_TEXTS = Expectations(
{
("xpu", 3): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
"Hi today I am going to tell you about the most common disease in the world. This disease is called diabetes",
],
("cuda", 7): [],
("cuda", 8): [
"Hello I am doing a project on the history of the internet and I need to know what the first website was and what",
@ -210,7 +193,6 @@ class Glm4IntegrationTest(unittest.TestCase):
self.assertEqual(output_text, EXPECTED_TEXT)
@require_flash_attn
@require_torch_large_gpu
@pytest.mark.flash_attn_test
def test_model_9b_flash_attn(self):
EXPECTED_TEXTS = Expectations(

View File

@ -718,7 +718,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
@require_torch_accelerator
@is_flaky()
def test_inference_object_detection_head_equivalence_cpu_accelerator(self):
def test_inference_object_detection_head_equivalence_cpu_gpu(self):
processor = self.default_processor
image = prepare_img()
text = prepare_text()
@ -730,7 +730,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase):
with torch.no_grad():
cpu_outputs = model(**encoding)
# 2. run model on accelerator
# 2. run model on GPU
model.to(torch_device)
encoding = encoding.to(torch_device)
with torch.no_grad():

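The equivalence hunks above follow one pattern: run the model once on CPU, move the model and inputs to the accelerator, and compare the two sets of outputs. A compact sketch of that pattern, assuming a model output that exposes .logits (the helper name below is hypothetical, not part of the test suite):

import torch

def assert_cpu_device_equivalence(model, encoding, device, atol=1e-4, rtol=1e-4):
    # 1. run the model on CPU
    model = model.eval()
    with torch.no_grad():
        cpu_outputs = model(**encoding)
    # 2. run the model on the target device and compare logits within a tolerance
    model.to(device)
    encoding = {k: v.to(device) if torch.is_tensor(v) else v for k, v in encoding.items()}
    with torch.no_grad():
        device_outputs = model(**encoding)
    torch.testing.assert_close(cpu_outputs.logits, device_outputs.logits.cpu(), atol=atol, rtol=rtol)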
View File

@ -324,8 +324,10 @@ class IdeficsModelTester:
@require_torch
class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixin, unittest.TestCase):
class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (IdeficsModel, IdeficsForVisionText2Text) if is_torch_available() else ()
# Doesn't run generation tests here -- idefics has a dedicated tester for generation tests below
all_generative_model_classes = ()
pipeline_model_mapping = (
{"feature-extraction": IdeficsModel, "image-text-to-text": IdeficsForVisionText2Text}
if is_torch_available()
@ -334,7 +336,6 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMi
test_pruning = False
test_headmasking = False
test_torchscript = False
has_attentions = False  # only supports SDPA and thus no attention probs returned
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
@ -493,31 +494,6 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMi
def test_retain_grad_hidden_states_attentions(self):
return
@pytest.mark.generate
@unittest.skip(reason="""IDEFICS cannot generate with no images provided!""")
def test_generate_without_input_ids(self):
pass
@pytest.mark.generate
@unittest.skip(reason="""IDEFICS cannot generate with no images provided!""")
def test_generate_continue_from_inputs_embeds(self):
pass
@pytest.mark.generate
@unittest.skip(reason="""IDEFICS cannot do contrastive generation yet and it is not worth fixing""")
def test_contrastive_generate(self):
pass
@pytest.mark.generate
@unittest.skip(reason="""IDEFICS cannot do contrastive generation yet and it is not worth fixing""")
def test_contrastive_generate_low_memory(self):
pass
@pytest.mark.generate
@unittest.skip(reason="""IDEFICS cannot do contrastive generation yet and it is not worth fixing""")
def test_contrastive_generate_dict_outputs_use_cache(self):
pass
def test_attention_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True

View File

@ -18,7 +18,7 @@ import unittest
from transformers import is_torch_available
from transformers.testing_utils import (
require_read_token,
require_torch_large_accelerator,
require_torch_large_gpu,
slow,
torch_device,
)
@ -34,7 +34,7 @@ if is_torch_available():
@slow
@require_torch_large_accelerator
@require_torch_large_gpu
@require_read_token
class Llama4IntegrationTest(unittest.TestCase):
model_id = "meta-llama/Llama-4-Scout-17B-16E"

View File

@ -626,6 +626,40 @@ class LongT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
model = LongT5Model.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_generate_with_head_masking(self):
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
config_and_inputs = self.model_tester.prepare_config_and_inputs()
config = config_and_inputs[0]
max_length = config_and_inputs[1].shape[-1] + 3
model = LongT5ForConditionalGeneration(config).eval()
model.to(torch_device)
head_masking = {
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
}
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
head_masks = {name: mask}
# Explicitly pass decoder_head_mask as it is required by the LONGT5 model when head_mask is specified
if name == "head_mask":
head_masks["decoder_head_mask"] = torch.ones(
config.num_decoder_layers, config.num_heads, device=torch_device
)
out = model.generate(
config_and_inputs[1],
num_beams=1,
max_length=max_length,
output_attentions=True,
return_dict_in_generate=True,
**head_masks,
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
def test_attention_outputs(self):
if not self.has_attentions:
self.skipTest(reason="has_attentions is set to False")

View File

@ -243,7 +243,7 @@ class MiniMaxModelTest(CausalLMModelTest, unittest.TestCase):
@slow
class MiniMaxIntegrationTest(unittest.TestCase):
def test_small_model_logits(self):
model_id = "hf-internal-testing/MiniMax-tiny"
model_id = "geetu040/MiniMax-tiny"
dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device)
model = MiniMaxForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
@ -262,7 +262,7 @@ class MiniMaxIntegrationTest(unittest.TestCase):
torch.testing.assert_close(logits[1, :3, :3], expected_slice, atol=1e-3, rtol=1e-3)
def test_small_model_generation(self):
model_id = "hf-internal-testing/MiniMax-tiny"
model_id = "geetu040/MiniMax-tiny"
dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device)
model = MiniMaxForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(

View File

@ -868,6 +868,40 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
model = MT5Model.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_generate_with_head_masking(self):
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
config_and_inputs = self.model_tester.prepare_config_and_inputs()
config = config_and_inputs[0]
max_length = config_and_inputs[1].shape[-1] + 3
model = MT5ForConditionalGeneration(config).eval()
model.to(torch_device)
head_masking = {
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
}
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
head_masks = {name: mask}
# Explicitly pass decoder_head_mask as it is required by the MT5 model when head_mask is specified
if name == "head_mask":
head_masks["decoder_head_mask"] = torch.ones(
config.num_decoder_layers, config.num_heads, device=torch_device
)
out = model.generate(
config_and_inputs[1],
num_beams=1,
max_length=max_length,
output_attentions=True,
return_dict_in_generate=True,
**head_masks,
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
# Copied from tests.models.t5.test_modeling_t5.T5EncoderOnlyModelTester with T5->MT5
class MT5EncoderOnlyModelTester:

View File

@ -870,7 +870,7 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
self.assertListEqual([result["text_labels"] for result in results], expected_text_labels)
@require_torch_accelerator
def test_inference_object_detection_head_equivalence_cpu_accelerator(self):
def test_inference_object_detection_head_equivalence_cpu_gpu(self):
processor = self.default_processor
image = prepare_img()
text_labels, task = prepare_text()
@ -881,7 +881,7 @@ class OmDetTurboModelIntegrationTests(unittest.TestCase):
with torch.no_grad():
cpu_outputs = model(**encoding)
# 2. run model on accelerator
# 2. run model on GPU
model.to(torch_device)
encoding = encoding.to(torch_device)
with torch.no_grad():

View File

@ -1117,6 +1117,10 @@ class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
self.assertIsNotNone(encoder_hidden_states.grad)
self.assertIsNotNone(encoder_attentions.grad)
@unittest.skip(reason="Generating with head_masking has not been implemented for ProphetNet models yet.")
def test_generate_with_head_masking(self):
pass
@require_torch
class ProphetNetStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):

View File

@ -19,11 +19,10 @@ import requests
from transformers.testing_utils import (
is_flaky,
require_torch,
require_torch_accelerator,
require_torch_gpu,
require_torchvision,
require_vision,
slow,
torch_device,
)
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
@ -380,10 +379,10 @@ class RtDetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1)
@slow
@require_torch_accelerator
@require_torch_gpu
@require_torchvision
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_accelerator_coco_detection_annotations
def test_fast_processor_equivalence_cpu_accelerator_coco_detection_annotations(self):
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations
def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self):
# prepare image and target
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f:
@ -394,8 +393,8 @@ class RtDetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
processor = self.image_processor_list[1]()
# 1. run processor on CPU
encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu")
# 2. run processor on accelerator
encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device=torch_device)
# 2. run processor on GPU
encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda")
# verify pixel values
self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape)

View File

@ -22,7 +22,7 @@ from PIL import Image
from transformers import is_torch_available
from transformers.testing_utils import (
cleanup,
require_torch_accelerator,
require_torch_gpu,
slow,
torch_device,
)
@ -35,7 +35,7 @@ if is_torch_available():
@slow
@require_torch_accelerator
@require_torch_gpu
# @require_read_token
class ShieldGemma2IntegrationTest(unittest.TestCase):
def tearDown(self):

View File

@ -741,6 +741,10 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase, Generatio
if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None:
module.masked_spec_embed.data.fill_(3)
@unittest.skip(reason="Temporarily broken") # TODO (joao, eustache): have a look at this test
def test_generate_with_head_masking(self):
pass
@unittest.skip(reason="Temporarily broken") # TODO (joao, eustache): have a look at this test
def test_generate_without_input_ids(self):
pass

View File

@ -250,8 +250,6 @@ class SwiftFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestC
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if name.endswith(".w_g"):
continue
if param.requires_grad:
self.assertIn(
((param.data.mean() * 1e9) / 1e9).round().item(),

View File

@ -709,6 +709,40 @@ class SwitchTransformersModelTest(ModelTesterMixin, GenerationTesterMixin, Pipel
model = SwitchTransformersModel.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_generate_with_head_masking(self):
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
config_and_inputs = self.model_tester.prepare_config_and_inputs()
config = config_and_inputs[0]
max_length = config_and_inputs[1].shape[-1] + 3
model = SwitchTransformersForConditionalGeneration(config).eval()
model.to(torch_device)
head_masking = {
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
}
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
head_masks = {name: mask}
# Explicitly pass decoder_head_mask as it is required by the SWITCH_TRANSFORMERS model when head_mask is specified
if name == "head_mask":
head_masks["decoder_head_mask"] = torch.ones(
config.num_decoder_layers, config.num_heads, device=torch_device
)
out = model.generate(
config_and_inputs[1],
num_beams=1,
max_length=max_length,
output_attentions=True,
return_dict_in_generate=True,
**head_masks,
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
@unittest.skip(
reason="This architecture has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245"
)

View File

@ -873,6 +873,40 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
model = T5Model.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_generate_with_head_masking(self):
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
config_and_inputs = self.model_tester.prepare_config_and_inputs()
config = config_and_inputs[0]
max_length = config_and_inputs[1].shape[-1] + 3
model = T5ForConditionalGeneration(config).eval()
model.to(torch_device)
head_masking = {
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
}
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
head_masks = {name: mask}
# Explicitly pass decoder_head_mask as it is required by the T5 model when head_mask is specified
if name == "head_mask":
head_masks["decoder_head_mask"] = torch.ones(
config.num_decoder_layers, config.num_heads, device=torch_device
)
out = model.generate(
config_and_inputs[1],
num_beams=1,
max_length=max_length,
output_attentions=True,
return_dict_in_generate=True,
**head_masks,
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
class T5EncoderOnlyModelTester:
def __init__(

View File

@ -419,6 +419,10 @@ class UdopModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
model = UdopForConditionalGeneration.from_pretrained(model_name)
self.assertIsNotNone(model)
@unittest.skip(reason="TODO: Fix me @joao")
def test_generate_with_head_masking(self):
pass
@unittest.skip(reason="TODO: Fix me @joao")
def test_generate_without_input_ids(self):
pass

View File

@ -489,6 +489,39 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
def test_generate_with_head_masking(self):
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
config_and_inputs = self.model_tester.prepare_config_and_inputs()
config = config_and_inputs[0]
model = UMT5ForConditionalGeneration(config).eval()
model.to(torch_device)
head_masking = {
"head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device),
"decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
"cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device),
}
for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
head_masks = {name: mask}
# Explicitly pass decoder_head_mask as it is required by the T5 model when head_mask is specified
if name == "head_mask":
head_masks["decoder_head_mask"] = torch.ones(
config.num_decoder_layers, config.num_heads, device=torch_device
)
out = model.generate(
config_and_inputs[1]["input_ids"],
num_beams=1,
max_length=3,
output_attentions=True,
return_dict_in_generate=True,
**head_masks,
)
# We check the state of decoder_attentions and cross_attentions just from the last step
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
@unittest.skip(
reason="This architecture seems not to compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

View File

@ -23,11 +23,7 @@ import numpy as np
from datasets import load_dataset
from transformers import WhisperFeatureExtractor
from transformers.testing_utils import (
check_json_file_has_correct_format,
require_torch,
require_torch_accelerator,
)
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torch_gpu
from transformers.utils.import_utils import is_torch_available
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
@ -258,7 +254,7 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
return [x["array"] for x in speech_samples]
@require_torch_accelerator
@require_torch_gpu
@require_torch
def test_torch_integration(self):
# fmt: off
@ -307,7 +303,7 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
self.assertTrue(np.all(np.mean(audio) < 1e-3))
self.assertTrue(np.all(np.abs(np.var(audio) - 1) < 1e-3))
@require_torch_accelerator
@require_torch_gpu
@require_torch
def test_torch_integration_batch(self):
# fmt: off

View File

@ -730,7 +730,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
output_text = self.tokenizer.decode(output_parallel[0], skip_special_tokens=True)
self.assertIn(output_text, self.EXPECTED_OUTPUTS)
def test_cpu_accelerator_loading_random_device_map(self):
def test_cpu_gpu_loading_random_device_map(self):
r"""
A test to check if dispatching a model on cpu & gpu works correctly using a random `device_map`.
"""
@ -778,7 +778,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
self.check_inference_correctness(model_8bit)
def test_cpu_accelerator_loading_custom_device_map(self):
def test_cpu_gpu_loading_custom_device_map(self):
r"""
A test to check if dispatching a model on cpu & gpu works correctly using a custom `device_map`.
This time the device map is more organized than the test above and uses the abstraction
@ -805,7 +805,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
self.check_inference_correctness(model_8bit)
def test_cpu_accelerator_disk_loading_custom_device_map(self):
def test_cpu_gpu_disk_loading_custom_device_map(self):
r"""
A test to check if dispatching a model on cpu & gpu works correctly using a custom `device_map`.
This time we also add `disk` on the device_map.
@ -832,7 +832,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
self.check_inference_correctness(model_8bit)
def test_cpu_accelerator_disk_loading_custom_device_map_kwargs(self):
def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self):
r"""
A test to check if dispatching a model on cpu & gpu works correctly using a custom `device_map`.
This time we also add `disk` on the device_map - using the kwargs directly instead of the quantization config

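The cpu/gpu dispatch these tests exercise boils down to passing an explicit `device_map` together with an 8-bit quantization config; the checkpoint and module names below are placeholders chosen for illustration, not the ones used by BaseMixedInt8Test:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_id = "bigscience/bloom-560m"  # placeholder checkpoint
custom_device_map = {
    "transformer.word_embeddings": 0,            # first GPU
    "transformer.word_embeddings_layernorm": 0,
    "transformer.h": "cpu",                      # transformer blocks offloaded to CPU
    "transformer.ln_f": 0,
    "lm_head": 0,
}
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=custom_device_map,
    quantization_config=BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_enable_fp32_cpu_offload=True,  # needed when parts of the model stay on CPU
    ),
)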
View File

@ -20,7 +20,7 @@ from transformers import AddedToken, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from transformers.testing_utils import (
require_gguf,
require_read_token,
require_torch_accelerator,
require_torch_gpu,
slow,
torch_device,
)
@ -35,7 +35,7 @@ if is_gguf_available():
@require_gguf
@require_torch_accelerator
@require_torch_gpu
@slow
class GgufQuantizationTests(unittest.TestCase):
"""
@ -107,7 +107,7 @@ class GgufQuantizationTests(unittest.TestCase):
@require_gguf
@require_torch_accelerator
@require_torch_gpu
@slow
class GgufIntegrationTests(unittest.TestCase):
"""
@ -263,7 +263,7 @@ class GgufIntegrationTests(unittest.TestCase):
@require_gguf
@require_torch_accelerator
@require_torch_gpu
@slow
class GgufModelTests(unittest.TestCase):
mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"

View File

@ -11,18 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, QuarkConfig
from transformers.testing_utils import (
cleanup,
is_torch_available,
require_accelerate,
require_quark,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils.import_utils import is_quark_available
@ -80,10 +79,11 @@ class QuarkTest(unittest.TestCase):
def tearDown(self):
r"""
TearDown function needs to be called at the end of each test to free the accelerator memory and cache, also to
TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
"""
cleanup(torch_device, gc_collect=True)
gc.collect()
torch.cuda.empty_cache()
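The `cleanup(torch_device, gc_collect=True)` call being replaced here is the accelerator-agnostic variant; a rough sketch of what such a helper might look like, with the caveat that the real transformers.testing_utils.cleanup may differ in its details:

import gc
import torch

def cleanup(device: str, gc_collect: bool = False):
    # free Python-side references first, then the backend's cache for the given device
    if gc_collect:
        gc.collect()
    if device.startswith("cuda") and torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif device.startswith("xpu") and hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()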
def test_memory_footprint(self):
mem_quantized = self.quantized_model.get_memory_footprint()

View File

@ -30,7 +30,7 @@ from transformers.testing_utils import (
check_json_file_has_correct_format,
is_flaky,
require_torch,
require_torch_accelerator,
require_torch_gpu,
require_vision,
slow,
torch_device,
@ -562,7 +562,7 @@ class ImageProcessingTestMixin:
self.skipTest(reason="No validation found for `preprocess` method")
@slow
@require_torch_accelerator
@require_torch_gpu
@require_vision
def test_can_compile_fast_image_processor(self):
if self.fast_image_processing_class is None:

View File

@ -716,16 +716,8 @@ class ModelTesterMixin:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
data = torch.flatten(param.data)
n_elements = torch.numel(data)
# skip 2.5% of elements on each side to avoid issues caused by `nn.init.trunc_normal_` described in
# https://github.com/huggingface/transformers/pull/27906#issuecomment-1846951332
n_elements_to_skip_on_each_side = int(n_elements * 0.025)
data_to_check = torch.sort(data).values
if n_elements_to_skip_on_each_side > 0:
data_to_check = data_to_check[n_elements_to_skip_on_each_side:-n_elements_to_skip_on_each_side]
self.assertIn(
((data_to_check.mean() * 1e9).round() / 1e9).item(),
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
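The simplified check rounds the full parameter mean to nine decimal places and expects exactly 0.0 or 1.0, which holds when `configs_no_init` drives weights to (near-)zero or one; a tiny self-contained illustration with made-up tensors standing in for such parameters:

import torch

for param in (torch.zeros(16, 16), torch.ones(16, 16)):  # stand-ins for zero-/one-initialised weights
    rounded_mean = ((param.mean() * 1e9).round() / 1e9).item()
    assert rounded_mean in [0.0, 1.0]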

View File

@ -26,7 +26,7 @@ from transformers import AutoVideoProcessor
from transformers.testing_utils import (
check_json_file_has_correct_format,
require_torch,
require_torch_accelerator,
require_torch_gpu,
require_vision,
slow,
torch_device,
@ -165,7 +165,7 @@ class VideoProcessingTestMixin:
self.assertIsNotNone(video_processor)
@slow
@require_torch_accelerator
@require_torch_gpu
@require_vision
def test_can_compile_fast_video_processor(self):
if self.fast_video_processing_class is None:

View File

@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
ruff: isort: skip_file
isort:skip_file
"""
import os