Compare commits

1 Commit

| Author | SHA1 | Message | Date |
|---|---|---|---|
| | 9d0c37f9d4 | another way | 2025-06-02 18:56:08 +02:00 |
317 changed files with 1223 additions and 5778 deletions

View File

@ -110,6 +110,7 @@ class CircleCIJob:
print(f"Using {self.docker_image} docker image")
if self.install_steps is None:
self.install_steps = ["uv venv && uv pip install ."]
self.install_steps.append("uv venv && uv pip install git+https://github.com/ydshieh/pytest.git@8.3.5-ydshieh git+https://github.com/ydshieh/pluggy.git@1.5.0-ydshieh")
if self.pytest_options is None:
self.pytest_options = {}
if isinstance(self.tests_to_run, str):
@ -213,7 +214,7 @@ generate_job = CircleCIJob(
docker_image=[{"image": "huggingface/transformers-torch-light"}],
# networkx==3.3 (after #36957) causes some issues
# TODO: remove this once it works directly
install_steps=["uv venv && uv pip install ."],
install_steps=["uv venv && uv pip install . && uv pip install networkx==3.2.1"],
marker="generate",
parallelism=6,
)
@ -309,7 +310,7 @@ onnx_job = CircleCIJob(
docker_image=[{"image":"huggingface/transformers-torch-tf-light"}],
install_steps=[
"uv venv",
"uv pip install .[testing,sentencepiece,onnxruntime,vision,rjieba]",
"uv pip install .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]",
],
pytest_options={"k onnx": None},
pytest_num_workers=1,
@ -338,7 +339,7 @@ non_model_job = CircleCIJob(
docker_image=[{"image": "huggingface/transformers-torch-light"}],
# networkx==3.3 (after #36957) causes some issues
# TODO: remove this once it works directly
install_steps=["uv venv && uv pip install ."],
install_steps=["uv venv && uv pip install . && uv pip install networkx==3.2.1"],
marker="not generate",
parallelism=6,
)

View File

@ -64,7 +64,7 @@ jobs:
commit_id=$GITHUB_SHA
fi
commit_msg=$(git show -s --format=%s | cut -c1-70)
python3 benchmark/benchmarks_entrypoint.py "huggingface/transformers" "$BRANCH_NAME" "$commit_id" "$commit_msg"
python3 benchmark/benchmarks_entrypoint.py "$BRANCH_NAME" "$commit_id" "$commit_msg"
env:
HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
# Enable this to see debug logs

View File

@ -19,7 +19,7 @@ concurrency:
jobs:
latest-docker:
name: "Latest PyTorch [dev]"
name: "Latest PyTorch + TensorFlow [dev]"
runs-on:
group: aws-general-8-plus
steps:
@ -267,6 +267,44 @@ jobs:
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-tensorflow:
name: "Latest TensorFlow [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-tensorflow-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-tensorflow-gpu
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-pytorch-deepspeed-amd:
name: "PyTorch + DeepSpeed (AMD) [dev]"
runs-on:

View File

@ -69,6 +69,18 @@ jobs:
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
tf-pipeline:
name: TF pipeline CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_pipelines_tf_gpu
slack_report_channel: "#transformers-ci-daily-pipeline-tf"
runner: daily-ci
docker: huggingface/transformers-tensorflow-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
example-ci:
name: Example CI
uses: ./.github/workflows/self-scheduled.yml

View File

@ -209,6 +209,75 @@ jobs:
name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
run_pipelines_tf_gpu:
if: ${{ inputs.job == 'run_pipelines_tf_gpu' }}
name: TensorFlow pipelines
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-tensorflow-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
run: |
git fetch && git checkout ${{ github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Run all pipeline tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports tests/pipelines
- name: Failure short reports
if: ${{ always() }}
run: |
cat /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports
run_examples_gpu:
if: ${{ inputs.job == 'run_examples_gpu' }}
name: Examples directory
@ -502,6 +571,7 @@ jobs:
run_models_gpu,
run_trainer_and_fsdp_gpu,
run_pipelines_torch_gpu,
run_pipelines_tf_gpu,
run_examples_gpu,
run_torch_cuda_extensions_gpu,
run_quantization_torch_gpu,

View File

@ -2,11 +2,11 @@ import argparse
import importlib.util
import logging
import os
from typing import Dict
import sys
from typing import Dict, Tuple
from psycopg2.extensions import register_adapter
from psycopg2.extras import Json
from psycopg2.extensions import register_adapter
register_adapter(dict, Json)
@ -17,13 +17,10 @@ class ImportModuleException(Exception):
class MetricsRecorder:
def __init__(
self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str
):
def __init__(self, connection, logger: logging.Logger, branch: str, commit_id: str, commit_msg: str):
self.conn = connection
self.conn.autocommit = True
self.logger = logger
self.repository = repository
self.branch = branch
self.commit_id = commit_id
self.commit_msg = commit_msg
@ -35,8 +32,8 @@ class MetricsRecorder:
# gpu_name: str, model_id: str
with self.conn.cursor() as cur:
cur.execute(
"INSERT INTO benchmarks (repository, branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s, %s) RETURNING benchmark_id",
(self.repository, self.branch, self.commit_id, self.commit_msg, metadata),
"INSERT INTO benchmarks (branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s) RETURNING benchmark_id",
(self.branch, self.commit_id, self.commit_msg, metadata),
)
benchmark_id = cur.fetchone()[0]
logger.debug(f"initialised benchmark #{benchmark_id}")
@ -85,18 +82,12 @@ handler.setFormatter(formatter)
logger.addHandler(handler)
def parse_arguments() -> Tuple[str, str, str, str]:
def parse_arguments():
"""
Parse command line arguments for the benchmarking CLI.
"""
parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.")
parser.add_argument(
"repository",
type=str,
help="The repository name on which the benchmarking is performed.",
)
parser.add_argument(
"branch",
type=str,
@ -117,7 +108,7 @@ def parse_arguments() -> Tuple[str, str, str, str]:
args = parser.parse_args()
return args.repository, args.branch, args.commit_id, args.commit_msg
return args.branch, args.commit_id, args.commit_msg
def import_from_path(module_name, file_path):
@ -134,7 +125,7 @@ def import_from_path(module_name, file_path):
if __name__ == "__main__":
benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__))
repository, branch, commit_id, commit_msg = parse_arguments()
branch, commit_id, commit_msg = parse_arguments()
for entry in os.scandir(benchmarks_folder_path):
try:
@ -145,7 +136,7 @@ if __name__ == "__main__":
logger.debug(f"loading: {entry.name}")
module = import_from_path(entry.name.split(".")[0], entry.path)
logger.info(f"running benchmarks in: {entry.name}")
module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
module.run_benchmark(logger, branch, commit_id, commit_msg)
except ImportModuleException as e:
logger.error(e)
except Exception as e:
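
The entrypoint above scans the benchmarks folder and calls `run_benchmark(logger, branch, commit_id, commit_msg)` on every module it can import. For reference, a hypothetical minimal module satisfying that contract — only the signature is taken from the diff, the body is illustrative:

```py
# Hypothetical minimal benchmark module for the scanned folder; only the
# run_benchmark signature comes from the entrypoint, the rest is illustrative.
import time
from logging import Logger


def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str):
    start = time.perf_counter()
    # ... actual workload goes here ...
    logger.info(
        f"dummy benchmark on {branch}@{commit_id[:7]} ({commit_msg!r}) "
        f"took {time.perf_counter() - start:.3f}s"
    )
```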

View File

@ -1,6 +1,5 @@
CREATE TABLE IF NOT EXISTS benchmarks (
benchmark_id SERIAL PRIMARY KEY,
repository VARCHAR(255),
branch VARCHAR(255),
commit_id VARCHAR(72),
commit_message VARCHAR(70),

View File

@ -33,15 +33,11 @@ def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
sleep(0.01)
def run_benchmark(
logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100
):
def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
continue_metric_collection = Event()
metrics_thread = None
model_id = "meta-llama/Llama-2-7b-hf"
metrics_recorder = MetricsRecorder(
psycopg2.connect("dbname=metrics"), logger, repository, branch, commit_id, commit_msg
)
metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg)
try:
gpu_stats = gpustat.GPUStatCollection.new_query()
gpu_name = gpu_stats[0]["name"]

View File

@ -5,7 +5,7 @@ ARG REF=main
RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
ENV UV_PYTHON=/usr/local/bin/python
RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
# tensorflow pin matching setup.py
RUN uv pip install --no-cache-dir pypi-kenlm
RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"

View File

@ -16,7 +16,7 @@ RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
RUN make install -j 10
RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache --upgrade 'torch==2.6.0' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
# spacy is not used so not tested. Causes to failures. TODO fix later

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
RUN uv pip uninstall transformers

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --no-deps timm accelerate
RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
# RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
RUN uv pip uninstall transformers

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
RUN uv pip uninstall transformers

View File

@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-de
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN git lfs install
RUN uv pip install --no-cache-dir pypi-kenlm

View File

@ -28,7 +28,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability
RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 "tensorflow_text<2.16" "tensorflow_probability<0.22" && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
RUN python3 -m pip uninstall -y flax jax
@ -45,7 +45,7 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
# For video model testing
RUN python3 -m pip install --no-cache-dir av
RUN python3 -m pip install --no-cache-dir av==9.2.0
# Some slow tests require bnb
RUN python3 -m pip install --no-cache-dir bitsandbytes

View File

@ -555,8 +555,6 @@
title: MegatronBERT
- local: model_doc/megatron_gpt2
title: MegatronGPT2
- local: model_doc/minimax
title: MiniMax
- local: model_doc/mistral
title: Mistral
- local: model_doc/mixtral

View File

@ -56,10 +56,10 @@ Attention is calculated independently in each layer of the model, and caching is
Refer to the table below to compare how caching improves efficiency.
| without caching | with caching |
|---|---|
| for each step, recompute all previous `K` and `V` | for each step, only compute current `K` and `V`
| attention cost per step is **quadratic** with sequence length | attention cost per step is **linear** with sequence length (memory grows linearly, but compute/token remains low) |
| without caching | with caching | | | |
|---|---|---|---|---|
| for each step, recompute all previous `K` and `V` | for each step, only compute current `K` and `V` | | | |
| attention cost per step is **quadratic** with sequence length | attention cost per step is **linear** with sequence length (memory grows linearly, but compute/token remains low) | | | |
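
To make the comparison in the table concrete, here is a minimal timing sketch; the `gpt2` checkpoint and the 100-token generation length are arbitrary illustrative choices, and toggling `use_cache` in `generate()` switches between the two columns:

```py
# Time generation with and without the KV cache (illustrative checkpoint and length).
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("The quick brown fox", return_tensors="pt")

for use_cache in (False, True):
    start = time.perf_counter()
    with torch.no_grad():
        model.generate(**inputs, max_new_tokens=100, do_sample=False, use_cache=use_cache)
    print(f"use_cache={use_cache}: {time.perf_counter() - start:.2f}s")
```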

View File

@ -19,9 +19,6 @@ Hyperparameter search discovers an optimal set of hyperparameters that produces
This guide will go over how to set up a hyperparameter search for each of the backends.
> [!WARNING]
> [SigOpt](https://github.com/sigopt/sigopt-server) is in public archive mode and is no longer actively maintained. Try using Optuna, Weights & Biases or Ray Tune instead.
```bash
pip install optuna/sigopt/wandb/ray[tune]
```
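
As a rough sketch of the Trainer-based setup this guide builds towards, using the Optuna backend — the model, dataset, and search space below are illustrative placeholders, not prescribed by the guide:

```py
# Minimal Optuna-backed hyperparameter search sketch (requires `pip install optuna`);
# the model, dataset, and search space are illustrative placeholders.
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
dataset = load_dataset("glue", "mrpc")
encoded = dataset.map(
    lambda batch: tokenizer(batch["sentence1"], batch["sentence2"], truncation=True),
    batched=True,
)

def model_init():
    # hyperparameter_search re-instantiates the model for every trial
    return AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
    }

trainer = Trainer(
    model_init=model_init,
    # `eval_strategy` is called `evaluation_strategy` on older transformers releases
    args=TrainingArguments(output_dir="hp_search", eval_strategy="epoch", report_to="none"),
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

best_run = trainer.hyperparameter_search(
    direction="minimize",  # minimize the default objective (evaluation loss)
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=10,
)
print(best_run)
```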

View File

@ -14,71 +14,60 @@ rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# Aria
[Aria](https://huggingface.co/papers/2410.05993) is a multimodal mixture-of-experts (MoE) model. The goal of this model is to open-source a training recipe for creating a multimodal native model from scratch. Aria has 3.9B and 3.5B activated parameters per visual and text token respectively. Text is handled by a MoE decoder and visual inputs are handled by a lightweight visual encoder. It is trained in 4 stages, language pretraining, multimodal pretraining, multimodal long-context pretraining, and multimodal post-training.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
You can find all the original Aria checkpoints under the [Aria](https://huggingface.co/rhymes-ai?search_models=aria) organization.
## Overview
> [!TIP]
> Click on the Aria models in the right sidebar for more examples of how to apply Aria to different multimodal tasks.
The Aria model was proposed in [Aria: An Open Multimodal Native Mixture-of-Experts Model](https://huggingface.co/papers/2410.05993) by Li et al. from the Rhymes.AI team.
The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
Aria is an open multimodal-native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. It has a Mixture-of-Experts architecture, with respectively 3.9B and 3.5B activated parameters per visual token and text token.
<hfoptions id="usage">
<hfoption id="Pipeline">
The abstract from the paper is the following:
*Information comes in diverse modalities. Multimodal native AI models are essential to integrate real-world information and deliver comprehensive understanding. While proprietary multimodal native models exist, their lack of openness imposes obstacles for adoptions, let alone adaptations. To fill this gap, we introduce Aria, an open multimodal native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. Aria is a mixture-of-expert model with 3.9B and 3.5B activated parameters per visual token and text token, respectively. It outperforms Pixtral-12B and Llama3.2-11B, and is competitive against the best proprietary models on various multimodal tasks. We pre-train Aria from scratch following a 4-stage pipeline, which progressively equips the model with strong capabilities in language understanding, multimodal understanding, long context window, and instruction following. We open-source the model weights along with a codebase that facilitates easy adoptions and adaptations of Aria in real-world applications.*
This model was contributed by [m-ric](https://huggingface.co/m-ric).
The original code can be found [here](https://github.com/rhymes-ai/Aria).
## Usage tips
Here's how to use the model for vision tasks:
```python
import requests
import torch
from transformers import pipeline
from PIL import Image
pipeline = pipeline(
"image-to-text",
model="rhymes-ai/Aria",
device=0,
torch_dtype=torch.bfloat16
)
pipeline(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
text="What is shown in this image?"
)
```
from transformers import AriaProcessor, AriaForConditionalGeneration
</hfoption>
<hfoption id="AutoModel">
model_id_or_path = "rhymes-ai/Aria"
```python
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
model = AutoModelForCausalLM.from_pretrained(
"rhymes-ai/Aria",
device_map="auto",
torch_dtype=torch.bfloat16,
attn_implementation="sdpa"
model = AriaForConditionalGeneration.from_pretrained(
model_id_or_path, device_map="auto"
)
processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
processor = AriaProcessor.from_pretrained(model_id_or_path)
image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
messages = [
{
"role": "user", "content": [
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
{"type": "text", "text": "What is shown in this image?"},
]
},
"role": "user",
"content": [
{"type": "image"},
{"text": "what is the image?", "type": "text"},
],
}
]
inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
inputs = inputs.to(model.device, torch.bfloat16)
inputs = inputs.to(model.device, torch.bfloat16)
text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=text, images=image, return_tensors="pt")
inputs.to(model.device)
output = model.generate(
**inputs,
@ -90,55 +79,6 @@ output = model.generate(
)
output_ids = output[0][inputs["input_ids"].shape[1]:]
response = processor.decode(output_ids, skip_special_tokens=True)
print(response)
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4 and the [rhymes-ai/Aria-sequential_mlp](https://huggingface.co/rhymes-ai/Aria-sequential_mlp) checkpoint. This checkpoint replaces grouped GEMM with `torch.nn.Linear` layers for easier quantization.
```py
# pip install torchao
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoProcessor
quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
model = AutoModelForCausalLM.from_pretrained(
"rhymes-ai/Aria-sequential_mlp",
torch_dtype=torch.bfloat16,
device_map="auto",
quantization_config=quantization_config
)
processor = AutoProcessor.from_pretrained(
"rhymes-ai/Aria-sequential_mlp",
)
messages = [
{
"role": "user", "content": [
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
{"type": "text", "text": "What is shown in this image?"},
]
},
]
inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
inputs = inputs.to(model.device, torch.bfloat16)
output = model.generate(
**inputs,
max_new_tokens=15,
stop_strings=["<|im_end|>"],
tokenizer=processor.tokenizer,
do_sample=True,
temperature=0.9,
)
output_ids = output[0][inputs["input_ids"].shape[1]:]
response = processor.decode(output_ids, skip_special_tokens=True)
print(response)
```

View File

@ -14,94 +14,93 @@ rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
</div>
# GPT Neo
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
</div>
## Overview
The GPTNeo model was released in the [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) repository by Sid
Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. It is a GPT2 like causal language model trained on the
[Pile](https://pile.eleuther.ai/) dataset.
The architecture is similar to GPT2 except that GPT Neo uses local attention in every other layer with a window size of
256 tokens.
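
As a quick sketch (not part of the original doc) of what this looks like in practice, the released config exposes the per-layer attention pattern and the window size, shown here for the `EleutherAI/gpt-neo-1.3B` checkpoint used below:

```python
# Inspect the alternating global/local attention pattern and the 256-token window.
from transformers import GPTNeoConfig

config = GPTNeoConfig.from_pretrained("EleutherAI/gpt-neo-1.3B")
print(config.attention_layers[:4])  # e.g. ['global', 'local', 'global', 'local']
print(config.window_size)           # 256 for the released checkpoints
```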
This model was contributed by [valhalla](https://huggingface.co/valhalla).
## Usage example
The `generate()` method can be used to generate text using GPT Neo model.
```python
>>> from transformers import GPTNeoForCausalLM, GPT2Tokenizer
>>> model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
>>> tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
>>> prompt = (
... "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
... "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
... "researchers was the fact that the unicorns spoke perfect English."
... )
>>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
>>> gen_tokens = model.generate(
... input_ids,
... do_sample=True,
... temperature=0.9,
... max_length=100,
... )
>>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
```
## Combining GPT-Neo and Flash Attention 2
First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature, and make sure your hardware is compatible with Flash-Attention 2. More details are available [here](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2) concerning the installation.
Make sure as well to load your model in half-precision (e.g. `torch.float16`).
To load and run a model using Flash Attention 2, refer to the snippet below:
```python
>>> import torch
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> device = "cuda" # the device to load the model onto
>>> model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
>>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
>>> prompt = "def hello_world():"
>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
>>> model.to(device)
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
>>> tokenizer.batch_decode(generated_ids)[0]
"def hello_world():\n >>> run_script("hello.py")\n >>> exit(0)\n<|endoftext|>"
```
### Expected speedups
Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `EleutherAI/gpt-neo-2.7B` checkpoint and the Flash Attention 2 version of the model.
Note that for GPT-Neo it is not possible to train / run on very long context as the max [position embeddings](https://huggingface.co/EleutherAI/gpt-neo-2.7B/blob/main/config.json#L58 ) is limited to 2048 - but this is applicable to all gpt-neo models and not specific to FA-2
<div style="text-align: center">
<img src="https://user-images.githubusercontent.com/49240599/272241893-b1c66b75-3a48-4265-bc47-688448568b3d.png">
</div>
## GPT-Neo
## Resources
[GPT-Neo](https://zenodo.org/records/5297715) is an open-source alternative to GPT-2 and GPT-3 models, built with Mesh TensorFlow for TPUs. GPT-Neo uses local attention in every other layer for more efficiency. It is trained on the [Pile](https://huggingface.co/datasets/EleutherAI/pile), a diverse dataset consisting of 22 smaller high-quality datasets.
You can find all the original GPT-Neo checkpoints under the [EleutherAI](https://huggingface.co/EleutherAI?search_models=gpt-neo) organization.
> [!TIP]
> Click on the GPT-Neo models in the right sidebar for more examples of how to apply GPT Neo to different language tasks.
The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`], and from the command line.
<hfoptions id="usage">
<hfoption id="Pipeline">
```py
import torch
from transformers import pipeline
pipeline = pipeline(task="text-generation", model="EleutherAI/gpt-neo-1.3B", torch_dtype=torch.float16, device=0)
pipeline("Hello, I'm a language model")
```
</hfoption>
<hfoption id="AutoModel">
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B", torch_dtype=torch.float16, device_map="auto", attn_implementation="flash_attention_2")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")
output = model.generate(**input_ids)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers CLI">
```bash
echo -e "Hello, I'm a language model" | transformers-cli run --task text-generation --model EleutherAI/gpt-neo-1.3B --device 0
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype="float16",
bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(
"EleutherAI/gpt-neo-2.7B",
quantization_config=quantization_config,
device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
inputs = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
## Notes
- Pad inputs on the right because GPT-Neo uses absolute position embeddings.
- [Text classification task guide](../tasks/sequence_classification)
- [Causal language modeling task guide](../tasks/language_modeling)
## GPTNeoConfig

View File

@ -216,12 +216,12 @@ processor.batch_decode(generate_ids, skip_special_tokens=True)
## Note regarding reproducing original implementation
In order to match the logits of the [original implementation](https://github.com/haotian-liu/LLaVA/tree/main), one needs to additionally specify `do_pad=True` when instantiating `LlavaImageProcessor`:
In order to match the logits of the [original implementation](https://github.com/haotian-liu/LLaVA/tree/main), one needs to additionally specify `do_pad=True` when instantiating `LLavaImageProcessor`:
```python
from transformers import LlavaImageProcessor
from transformers import LLavaImageProcessor
image_processor = LlavaImageProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", do_pad=True)
image_processor = LLavaImageProcessor.from_pretrained("https://huggingface.co/llava-hf/llava-1.5-7b-hf", do_pad=True)
```
### Using Flash Attention 2

View File

@ -1,189 +0,0 @@
<!--Copyright 2025 MiniMaxAI and The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# MiniMax
## Overview
The MiniMax-Text-01 model was proposed in [MiniMax-01: Scaling Foundation Models with Lightning Attention](https://arxiv.org/abs/2501.08313) by MiniMax, Aonian Li, Bangwei Gong, Bo Yang, Boji Shan, Chang Liu, Cheng Zhu, Chunhao Zhang, Congchao Guo, Da Chen, Dong Li, Enwei Jiao, Gengxin Li, Guojun Zhang, Haohai Sun, Houze Dong, Jiadai Zhu, Jiaqi Zhuang, Jiayuan Song, Jin Zhu, Jingtao Han, Jingyang Li, Junbin Xie, Junhao Xu, Junjie Yan, Kaishun Zhang, Kecheng Xiao, Kexi Kang, Le Han, Leyang Wang, Lianfei Yu, Liheng Feng, Lin Zheng, Linbo Chai, Long Xing, Meizhi Ju, Mingyuan Chi, Mozhi Zhang, Peikai Huang, Pengcheng Niu, Pengfei Li, Pengyu Zhao, Qi Yang, Qidi Xu, Qiexiang Wang, Qin Wang, Qiuhui Li, Ruitao Leng, Shengmin Shi, Shuqi Yu, Sichen Li, Songquan Zhu, Tao Huang, Tianrun Liang, Weigao Sun, Weixuan Sun, Weiyu Cheng, Wenkai Li, Xiangjun Song, Xiao Su, Xiaodong Han, Xinjie Zhang, Xinzhu Hou, Xu Min, Xun Zou, Xuyang Shen, Yan Gong, Yingjie Zhu, Yipeng Zhou, Yiran Zhong, Yongyi Hu, Yuanxiang Fan, Yue Yu, Yufeng Yang, Yuhao Li, Yunan Huang, Yunji Li, Yunpeng Huang, Yunzhi Xu, Yuxin Mao, Zehan Li, Zekang Li, Zewei Tao, Zewen Ying, Zhaoyang Cong, Zhen Qin, Zhenhua Fan, Zhihang Yu, Zhuo Jiang, Zijia Wu.
The abstract from the paper is the following:
*We introduce MiniMax-01 series, including MiniMax-Text-01 and MiniMax-VL-01, which are comparable to top-tier models while offering superior capabilities in processing longer contexts. The core lies in lightning attention and its efficient scaling. To maximize computational capacity, we integrate it with Mixture of Experts (MoE), creating a model with 32 experts and 456 billion total parameters, of which 45.9 billion are activated for each token. We develop an optimized parallel strategy and highly efficient computation-communication overlap techniques for MoE and lightning attention. This approach enables us to conduct efficient training and inference on models with hundreds of billions of parameters across contexts spanning millions of tokens. The context window of MiniMax-Text-01 can reach up to 1 million tokens during training and extrapolate to 4 million tokens during inference at an affordable cost. Our vision-language model, MiniMax-VL-01 is built through continued training with 512 billion vision-language tokens. Experiments on both standard and in-house benchmarks show that our models match the performance of state-of-the-art models like GPT-4o and Claude-3.5-Sonnet while offering 20-32 times longer context window.*
### Architectural details
MiniMax is a powerful language model with 456 billion total parameters, of which 45.9 billion are activated per token. To better unlock the long context capabilities of the model, MiniMax adopts a hybrid architecture that combines Lightning Attention, Softmax Attention and Mixture-of-Experts (MoE). Leveraging advanced parallel strategies and innovative compute-communication overlap methods—such as Linear Attention Sequence Parallelism Plus (LASP+), varlen ring attention, Expert Tensor Parallel (ETP), etc., MiniMax's training context length is extended to 1 million tokens, and it can handle a context of up to 4 million tokens during the inference. On various academic benchmarks, MiniMax also demonstrates the performance of a top-tier model.
The architecture of MiniMax is briefly described as follows:
- Total Parameters: 456B
- Activated Parameters per Token: 45.9B
- Number Layers: 80
- Hybrid Attention: a softmax attention is positioned after every 7 lightning attention.
- Number of attention heads: 64
- Attention head dimension: 128
- Mixture of Experts:
- Number of experts: 32
- Expert hidden dimension: 9216
- Top-2 routing strategy
- Positional Encoding: Rotary Position Embedding (RoPE) applied to half of the attention head dimension with a base frequency of 10,000,000
- Hidden Size: 6144
- Vocab Size: 200,064
For more details refer to the [release blog post](https://www.minimaxi.com/en/news/minimax-01-series-2).
### License
`MiniMax` is released under the MINIMAX MODEL LICENSE AGREEMENT.
## Usage tips
The pre-trained model can be used as follows:
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> model = AutoModelForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf", device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
>>> messages = [
... {"role": "user", "content": "What is your favourite condiment?"},
... {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
... {"role": "user", "content": "Do you have mayonnaise recipes?"}
... ]
>>> model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
>>> generated_ids = model.generate(model_inputs, max_new_tokens=100, do_sample=True)
>>> tokenizer.batch_decode(generated_ids)[0]
"Mayonnaise can be made as follows: (...)"
```
As can be seen, the instruction-tuned model requires a [chat template](../chat_templating) to be applied to make sure the inputs are prepared in the right format.
## Speeding up MiniMax by using Flash Attention
The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
```bash
pip install -U flash-attn --no-build-isolation
```
Make also sure that you have a hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of the [flash attention repository](https://github.com/Dao-AILab/flash-attention). Make also sure to load your model in half-precision (e.g. `torch.float16`)
To load and run a model using Flash Attention-2, refer to the snippet below:
```python
>>> import torch
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> model = AutoModelForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf", torch_dtype=torch.float16, attn_implementation="flash_attention_2", device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
>>> prompt = "My favourite condiment is"
>>> model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
>>> model.to(device)
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
>>> tokenizer.batch_decode(generated_ids)[0]
"The expected output"
```
### Sliding window Attention
The current implementation supports the sliding window attention mechanism and memory efficient cache management.
To enable sliding window attention, just make sure to have a `flash-attn` version that is compatible with sliding window attention (`>=2.3.0`).
The Flash Attention-2 model uses also a more memory efficient cache slicing mechanism - as recommended per the official implementation of Mistral model that use rolling cache mechanism we keep the cache size fixed (`self.config.sliding_window`), support batched generation only for `padding_side="left"` and use the absolute position of the current token to compute the positional embedding.
## Shrinking down MiniMax using quantization
As the MiniMax model has 456 billion parameters, that would require about 912GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization.md). If the model is quantized to 4 bits (or half a byte per parameter), about 228 GB of RAM is required.
Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization.md) for alternative quantization methods):
```python
>>> import torch
>>> from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
>>> # specify how to quantize the model
>>> quantization_config = BitsAndBytesConfig(
... load_in_4bit=True,
... bnb_4bit_quant_type="nf4",
... bnb_4bit_compute_dtype="torch.float16",
... )
>>> model = AutoModelForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf", quantization_config=True, device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
>>> prompt = "My favourite condiment is"
>>> messages = [
... {"role": "user", "content": "What is your favourite condiment?"},
... {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
... {"role": "user", "content": "Do you have mayonnaise recipes?"}
... ]
>>> model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
>>> generated_ids = model.generate(model_inputs, max_new_tokens=100, do_sample=True)
>>> tokenizer.batch_decode(generated_ids)[0]
"The expected output"
```
This model was contributed by [geetu040](https://github.com/geetu040) and [Shakib-IO](https://github.com/Shakib-IO).
The original code can be found [here](https://huggingface.co/MiniMaxAI/MiniMax-Text-01/blob/main/modeling_minimax_text_01.py).
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with MiniMax. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
<PipelineTag pipeline="text-generation"/>
- The [Alignment Handbook](https://github.com/huggingface/alignment-handbook) by Hugging Face includes scripts and recipes to perform supervised fine-tuning (SFT) and direct preference optimization with Mistral-7B. This includes scripts for full fine-tuning, QLoRa on a single GPU as well as multi-GPU fine-tuning.
- [Causal language modeling task guide](../tasks/language_modeling)
## MiniMaxConfig
[[autodoc]] MiniMaxConfig
## MiniMaxModel
[[autodoc]] MiniMaxModel
- forward
## MiniMaxForCausalLM
[[autodoc]] MiniMaxForCausalLM
- forward
## MiniMaxForSequenceClassification
[[autodoc]] MiniMaxForSequenceClassification
- forward
## MiniMaxForTokenClassification
[[autodoc]] MiniMaxForTokenClassification
- forward
## MiniMaxForQuestionAnswering
[[autodoc]] MiniMaxForQuestionAnswering
- forward

View File

@ -119,11 +119,6 @@ Image.fromarray(depth.astype("uint8"))
[[autodoc]] ZoeDepthImageProcessor
- preprocess
## ZoeDepthImageProcessorFast
[[autodoc]] ZoeDepthImageProcessorFast
- preprocess
## ZoeDepthForDepthEstimation
[[autodoc]] ZoeDepthForDepthEstimation

View File

@ -62,17 +62,16 @@ Install torchao from PyPi or the PyTorch index with the following commands.
# Stable release from Pypi which will default to CUDA 12.6
pip install --upgrade torchao transformers
```
</hfoption>
</hfoption>
<hfoption id="PyTorch Index">
Stable Release from the PyTorch index
```bash
pip install torchao --index-url https://download.pytorch.org/whl/cu126 # options are cpu/cu118/cu126/cu128
```
</hfoption>
</hfoptions>
If your torchao version is below 0.10.0, you need to upgrade it, please refer to the [deprecation notice](#deprecation-notice) for more details.
If your torcha version is below 0.10.0, you need to upgrade it, please refer to the [deprecation notice](#deprecation-notice) for more details.
## Quantization examples
@ -89,7 +88,6 @@ We'll show examples for recommended quantization methods based on hardwares, e.g
### H100 GPU
<hfoptions id="examples-H100-GPU">
<hfoption id="float8-dynamic-and-weight-only">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
@ -150,7 +148,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
### A100 GPU
<hfoptions id="examples-A100-GPU">
<hfoption id="int8-dynamic-and-weight-only">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
@ -218,7 +215,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
### CPU
<hfoptions id="examples-CPU">
<hfoption id="int8-dynamic-and-weight-only">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
@ -280,18 +276,18 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
### Per Module Quantization
#### 1. Skip quantization for certain layers
With `ModuleFqnToConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.
With `AOPerModuleConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
model_id = "meta-llama/Llama-3.1-8B-Instruct"
from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig
from torchao.quantization import Int4WeightOnlyConfig, AOPerModuleConfig
config = Int4WeightOnlyConfig(group_size=128)
# set default to int4 (for linears), and skip quantizing `model.layers.0.self_attn.q_proj`
quant_config = ModuleFqnToConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
quant_config = AOPerModuleConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
quantization_config = TorchAoConfig(quant_type=quant_config)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
# lm_head is not quantized and model.layers.0.self_attn.q_proj is not quantized
@ -315,7 +311,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
model_id = "facebook/opt-125m"
from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType
from torchao.quantization import Int4WeightOnlyConfig, AOPerModuleConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType
weight_dtype = torch.int8
granularity = PerAxis(0)
@ -326,7 +322,7 @@ embedding_config = IntxWeightOnlyConfig(
mapping_type=mapping_type,
)
linear_config = Int8DynamicActivationInt4WeightConfig(group_size=128)
quant_config = ModuleFqnToConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
quant_config = AOPerModuleConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
# set `include_embedding` to True in order to include embedding in quantization
# when `include_embedding` is True, we'll remove input embedding from `modules_not_to_convert` as well
quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True)
@ -389,7 +385,6 @@ To avoid arbitrary user code execution, torchao sets `weights_only=True` in [tor
<hfoptions id="serialization-examples">
<hfoption id="save-locally">
```py
# don't serialize model with Safetensors
output_dir = "llama3-8b-int4wo-128"
@ -397,7 +392,6 @@ quantized_model.save_pretrained("llama3-8b-int4wo-128", safe_serialization=False
```
</hfoption>
<hfoption id="push-to-huggingface-hub">
```py
# don't serialize model with Safetensors
USER_ID = "your_huggingface_user_id"
@ -433,8 +427,8 @@ quantized_model.save_pretrained(output_dir, safe_serialization=False)
# reload the quantized model
reloaded_model = AutoModelForCausalLM.from_pretrained(
output_dir,
device_map="auto",
output_dir,
device_map="auto",
torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
@ -469,8 +463,8 @@ quantized_model.save_pretrained(output_dir, safe_serialization=False)
# reload the quantized model
reloaded_model = AutoModelForCausalLM.from_pretrained(
output_dir,
device_map="cpu",
output_dir,
device_map="cpu",
torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
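# Sketch of a quick round trip with the reloaded CPU model (prompt is illustrative):
inputs = tokenizer("What are we having for dinner?", return_tensors="pt")
output = reloaded_model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))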

View File

@ -32,8 +32,8 @@ ocalhost:29504 test_train.py
import logging
import os
from collections.abc import Iterable
from contextlib import nullcontext
from typing import Iterable
import torch
import torch.distributed as dist

View File

@ -1,42 +0,0 @@
import datasets
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
torch.set_float32_matmul_precision("high")
model_id = "meta-llama/Llama-3.2-3b-Instruct"
model = AutoModelForCausalLM.from_pretrained(
model_id, attn_implementation="sdpa_paged", torch_dtype=torch.bfloat16, device_map=0
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
generation_config = GenerationConfig(
max_new_tokens=512,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
use_cache=False,
num_blocks=2048,
block_size=128,
do_sample=True,
max_batch_tokens=1024, # Maximum number of tokens to process in a single batch
scheduler="prefill_first",
)
train_dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
def tokenize_function(examples):
return tokenizer(examples["question"])
tokenized_datasets = train_dataset.map(tokenize_function, batched=True)
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
batch_outputs = model.generate_batch(
inputs=simple_batch_inputs,
generation_config=generation_config,
progress_bar=False,
enable_visualizer=True,
tokenizer=tokenizer,
)

View File

@ -3,6 +3,6 @@ jaxlib>=0.1.59
flax>=0.3.5
optax>=0.0.8
-f https://download.pytorch.org/whl/torch_stable.html
torch==2.7.1
torch==2.6.0
-f https://download.pytorch.org/whl/torch_stable.html
torchvision==0.12.0+cpu

View File

@ -12,52 +12,24 @@ from torch import nn
from ...cache_utils import Cache, HybridCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torchdynamo_compiling
from ..auto import AutoModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from ..auto import AutoModel, AutoModelForCausalLM
from .configuration_new_task_model import NewTaskModelConfig
@dataclass
class NewTaskModelModelOutputWithPast(BaseModelOutputWithPast):
"""
Base class for NewTaskModel outputs, with hidden states and attentions.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
image_hidden_states (`torch.FloatTensor`, *optional*):
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
"""
image_hidden_states: Optional[torch.FloatTensor] = None
_CONFIG_FOR_DOC = "NewTaskModelConfig"
@dataclass
class NewTaskModelCausalLMOutputWithPast(ModelOutput):
"""
Base class for NewTaskModel causal language model (or autoregressive) outputs.
Base class for NewTaskModelcausal language model (or autoregressive) outputs.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
@ -105,10 +77,30 @@ class NewTaskModelMultiModalProjector(nn.Module):
return hidden_states
@auto_docstring
NEW_TASK_MODEL_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`NewTaskModelConfig`] or [`NewTaskModelVisionConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
NEW_TASK_MODEL_START_DOCSTRING,
)
class NewTaskModelPreTrainedModel(PreTrainedModel):
config_class = NewTaskModelConfig
base_model_prefix = ""
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["NewTaskModelMultiModalProjector"]
_skip_keys_device_placement = "past_key_values"
@ -117,8 +109,6 @@ class NewTaskModelPreTrainedModel(PreTrainedModel):
_supports_static_cache = True
_supports_flash_attn_2 = True
_supports_sdpa = True
_supports_flex_attn = True
_supports_attention_backend = True
def _init_weights(self, module):
# important: this ported version of NewTaskModel isn't meant for training from scratch - only
@ -131,24 +121,102 @@ class NewTaskModelPreTrainedModel(PreTrainedModel):
module.bias.data.zero_()
@auto_docstring(
custom_intro="""
The Base NewTaskModel model which consists of a vision backbone and a language model without language modeling head.
"""
)
class NewTaskModelModel(NewTaskModelPreTrainedModel):
_checkpoint_conversion_mapping = {"language_model.model": "language_model"}
NEW_TASK_MODEL_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
def __init__(self, config: NewTaskModelConfig):
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
The tensors corresponding to the input images. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([`NewTaskModelProcessor`] uses
[`SiglipImageProcessor`] for processing images).
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@add_start_docstrings(
"""The NEW_TASK_MODEL model which consists of a vision backbone and a language model.""",
NEW_TASK_MODEL_START_DOCSTRING,
)
class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
main_input_name: ClassVar[str] = "doc_input_ids" # transformers-related
def __init__(self, config):
super().__init__(config)
self.vision_tower = AutoModel.from_config(config=config.vision_config)
self.multi_modal_projector = NewTaskModelMultiModalProjector(config)
self.vocab_size = config.text_config.vocab_size
language_model = AutoModel.from_config(config=config.text_config)
language_model = AutoModelForCausalLM.from_config(config=config.text_config)
if language_model._tied_weights_keys is not None:
self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
self.language_model = language_model
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
self.embedding_dim = self.config.embedding_dim
self.custom_text_proj = nn.Linear(self.config.text_config.hidden_size, self.embedding_dim)
if self.language_model._tied_weights_keys is not None:
self._tied_weights_keys = [f"model.language_model.{k}" for k in self.language_model._tied_weights_keys]
self.post_init()
def get_input_embeddings(self):
@ -157,6 +225,18 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def get_output_embeddings(self):
return self.language_model.get_output_embeddings()
def set_output_embeddings(self, new_embeddings):
self.language_model.set_output_embeddings(new_embeddings)
def set_decoder(self, decoder):
self.language_model.set_decoder(decoder)
def get_decoder(self):
return self.language_model.get_decoder()
def _update_causal_mask(
self,
attention_mask,
@ -241,191 +321,8 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
image_features = image_features / (self.config.text_config.hidden_size**0.5)
return image_features
@can_return_tuple
@auto_docstring
def forward(
self,
input_ids: torch.LongTensor = None,
pixel_values: torch.FloatTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
token_type_ids: Optional[torch.LongTensor] = None,
cache_position: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs: Unpack[FlashAttentionKwargs],
) -> Union[Tuple, NewTaskModelModelOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
Example:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, NewTaskModelForConditionalGeneration
>>> model = NewTaskModelForConditionalGeneration.from_pretrained("google/new_task_model2-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/new_task_model2-3b-mix-224")
>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, text=prompt, return_tensors="pt")
>>> # Generate
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Where is the cat standing?\nsnow"
```"""
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
is_training = token_type_ids is not None and labels is not None
# Replace image id with PAD if the image token is OOV, to avoid index-errors
if input_ids is not None and self.config.image_token_id >= self.vocab_size:
special_image_mask = input_ids == self.config.image_token_id
llm_input_ids = input_ids.clone()
llm_input_ids[special_image_mask] = 0
else:
llm_input_ids = input_ids
if inputs_embeds is None:
inputs_embeds = self.get_input_embeddings()(llm_input_ids)
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
cache_position = torch.arange(
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
)
if position_ids is None:
position_ids = cache_position.unsqueeze(0) + 1 # NewTaskModel positions are 1-indexed
# Merge text and images
if pixel_values is not None:
image_features = self.get_image_features(pixel_values)
if input_ids is None:
special_image_mask = inputs_embeds == self.get_input_embeddings()(
torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
)
else:
special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
image_tokens_in_text = (special_image_mask).sum(dim=1).sum(dim=0)[0]
raise ValueError(
f"Number of images does not match number of special image tokens in the input text. "
f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} "
"tokens from image embeddings."
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
causal_mask = self._update_causal_mask(
attention_mask, token_type_ids, past_key_values, cache_position, inputs_embeds, is_training
)
outputs = self.language_model(
attention_mask=causal_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
cache_position=cache_position,
**kwargs,
)
return NewTaskModelModelOutputWithPast(
last_hidden_state=outputs.last_hidden_state,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
image_hidden_states=image_features if pixel_values is not None else None,
)
@auto_docstring(
custom_intro="""
The Base NewTaskModel model which consists of a vision backbone and a language model without language modeling head.
"""
)
class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
_checkpoint_conversion_mapping = {
"^language_model.model": "model.language_model",
"^vision_tower": "model.vision_tower",
"^multi_modal_projector": "model.multi_modal_projector",
"^language_model.lm_head": "lm_head",
}
_tied_weights_keys = ["lm_head.weight"]
main_input_name: ClassVar[str] = "doc_input_ids" # transformers-related
def __init__(self, config):
super().__init__(config)
self.model = NewTaskModelModel(config)
self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
self.embedding_dim = self.config.embedding_dim
self.custom_text_proj = nn.Linear(self.config.text_config.hidden_size, self.embedding_dim)
if self.language_model._tied_weights_keys is not None:
self._tied_weights_keys = [f"model.language_model.{k}" for k in self.language_model._tied_weights_keys]
self.post_init()
def get_input_embeddings(self):
return self.model.get_input_embeddings()
def set_input_embeddings(self, value):
self.model.set_input_embeddings(value)
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
# Make modules available through conditional class for BC
@property
def language_model(self):
return self.model.language_model
@property
def vision_tower(self):
return self.model.vision_tower
@property
def multi_modal_projector(self):
return self.model.multi_modal_projector
@can_return_tuple
@auto_docstring
@add_start_docstrings_to_model_forward(NEW_TASK_MODEL_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=NewTaskModelCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
@ -444,10 +341,19 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
num_logits_to_keep: int = 0,
) -> Union[Tuple, NewTaskModelCausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
logits_to_keep (`int` or `torch.Tensor`, *optional*):
If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
This is useful when using packed tensor format (single dimension for batch and sequence length).
Returns:
Example:
@ -494,8 +400,7 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
# L2 normalization
embeddings = proj / proj.norm(dim=-1, keepdim=True) # (batch_size, sequence_length, dim)
if attention_mask is not None:
embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim)
embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim)
return (embeddings,) + vlm_outputs
@ -515,7 +420,7 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
**kwargs,
):
# Overwritten -- custom `position_ids` and `pixel_values` handling
model_inputs = super().prepare_inputs_for_generation(
model_inputs = self.language_model.prepare_inputs_for_generation(
input_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
@ -538,68 +443,13 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
is_training = token_type_ids is not None and labels is not None
if cache_position[0] == 0 and isinstance(past_key_values, HybridCache):
input_tensor = inputs_embeds if inputs_embeds is not None else input_ids
causal_mask = self.model._update_causal_mask(
causal_mask = self._update_causal_mask(
attention_mask, token_type_ids, past_key_values, cache_position, input_tensor, is_training
)
model_inputs["attention_mask"] = causal_mask
return model_inputs
@staticmethod
def _prepare_4d_causal_attention_mask_with_cache_position(
attention_mask: torch.Tensor,
sequence_length: int,
target_length: int,
dtype: torch.dtype,
cache_position: torch.Tensor,
batch_size: int,
**kwargs,
):
"""
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
Args:
attention_mask (`torch.Tensor`):
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
`(batch_size, 1, query_length, key_value_length)`.
sequence_length (`int`):
The sequence length being processed.
target_length (`int`):
The target length: when generating with static cache, the mask should be as long as the static cache,
to account for the 0 padding, the part of the cache that is not filled yet.
dtype (`torch.dtype`):
The dtype to use for the 4D attention mask.
cache_position (`torch.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`torch.Tensor`):
Batch size.
"""
if attention_mask is not None and attention_mask.dim() == 4:
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
causal_mask = attention_mask
else:
min_dtype = torch.finfo(dtype).min
causal_mask = torch.full(
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
mask_length = attention_mask.shape[-1]
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
causal_mask.device
)
padding_mask = padding_mask == 0
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
padding_mask, min_dtype
)
return causal_mask
def resize_token_embeddings(
self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None, mean_resizing=True
) -> nn.Embedding:

View File

@ -65,8 +65,7 @@ class NewTaskModelForNewTask(PaliGemmaForConditionalGeneration):
# L2 normalization
embeddings = proj / proj.norm(dim=-1, keepdim=True) # (batch_size, sequence_length, dim)
if attention_mask is not None:
embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim)
embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim)
return (embeddings,) + vlm_outputs

View File

@ -31,9 +31,8 @@ ocalhost:29504 test_train.py
import logging
import os
from collections.abc import Iterable
from contextlib import nullcontext
from typing import Dict, Optional
from typing import Dict, Iterable, Optional
import torch
import torch.distributed as dist

View File

@ -11,7 +11,7 @@ torch.set_float32_matmul_precision("high")
model_id = "meta-llama/Llama-3.2-3b-Instruct"
model = AutoModelForCausalLM.from_pretrained(
model_id, attn_implementation="sdpa_paged", torch_dtype=torch.bfloat16, device_map=0
model_id, attn_implementation="sdpa_paged", torch_dtype=torch.bfloat16, device_map="auto"
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

View File

@ -15,15 +15,10 @@ limitations under the License.
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -40,15 +40,10 @@ checkpoint: जाँच बिंदु
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -50,15 +50,10 @@ user: ユーザ
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -40,15 +40,10 @@ checkpoint: 检查点
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -52,15 +52,10 @@ user: 使用者
-->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
<source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
<img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
</picture>
<br/>
<br/>
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
<a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>

View File

@ -120,6 +120,7 @@ _deps = [
"huggingface-hub>=0.30.0,<1.0",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"isort>=5.5.4",
"jax>=0.4.1,<=0.4.13",
"jaxlib>=0.4.1,<=0.4.13",
"jieba",
@ -141,7 +142,6 @@ _deps = [
"optimum-benchmark>=0.3.0",
"optuna",
"optax>=0.0.8,<=0.1.4",
"pandas<2.3.0", # `datasets` requires `pandas` while `pandas==2.3.0` has issues with CircleCI on 2025/06/05
"packaging>=20.0",
"parameterized",
"phonemizer",
@ -204,7 +204,6 @@ _deps = [
"opentelemetry-api",
"opentelemetry-exporter-otlp",
"opentelemetry-sdk",
"textual",
]
@ -368,7 +367,7 @@ extras["testing"] = (
extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"]
extras["ruff"] = deps_list("ruff")
extras["quality"] = deps_list("datasets", "ruff", "GitPython", "urllib3", "libcst", "rich", "pandas")
extras["quality"] = deps_list("datasets", "isort", "ruff", "GitPython", "urllib3", "libcst", "rich")
extras["all"] = (
extras["tf"]
@ -442,8 +441,6 @@ extras["benchmark"] = deps_list("optimum-benchmark")
# OpenTelemetry dependencies for metrics collection in continuous batching
extras["open-telemetry"] = deps_list("opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk")
extras["continuous-batching-visualizer"] = deps_list("rich", "textual")
# when modifying the following list, make sure to update src/transformers/dependency_versions_check.py
install_requires = [
deps["filelock"], # filesystem locks, e.g., to prevent parallel downloads

View File

@ -2,9 +2,8 @@ import copy
import importlib.metadata
import json
import os
from collections.abc import Iterable
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import torch
from packaging import version

View File

@ -21,8 +21,7 @@ from dataclasses import dataclass
from datetime import date
from itertools import chain
from pathlib import Path
from re import Pattern
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple, Union
import yaml

View File

@ -1218,7 +1218,6 @@ ALLOWED_LAYER_TYPES = (
"full_attention",
"sliding_attention",
"chunked_attention",
"linear_attention", # used in minimax
)

View File

@ -824,7 +824,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
tokenizer: PreTrainedTokenizerBase
mlm: bool = True
mlm_probability: Optional[float] = 0.15
mlm_probability: float = 0.15
mask_replace_prob: float = 0.8
random_replace_prob: float = 0.1
pad_to_multiple_of: Optional[int] = None
@ -833,15 +833,13 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
seed: Optional[int] = None
def __post_init__(self):
if self.mlm:
if self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. "
"You should pass `mlm=False` to train on causal language modeling instead."
)
if self.mlm_probability is None or self.mlm_probability < 0 or self.mlm_probability > 1:
raise ValueError("mlm_probability should be between 0 and 1.")
self.mlm_probability = float(self.mlm_probability)
if self.mlm and self.tokenizer.mask_token is None:
raise ValueError(
"This tokenizer does not have a mask token which is necessary for masked language modeling. "
"You should pass `mlm=False` to train on causal language modeling instead."
)
if self.mlm_probability < 0 or self.mlm_probability > 1:
raise ValueError("mlm_probability should be between 0 and 1.")
if self.mask_replace_prob + self.random_replace_prob > 1:
raise ValueError("The sum of mask_replace_prob and random_replace_prob should not exceed 1")
if self.mask_replace_prob < 0 or self.mask_replace_prob > 1:
@ -849,6 +847,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
if self.random_replace_prob < 0 or self.random_replace_prob > 1:
raise ValueError("random_replace_prob should be between 0 and 1.")
self.mlm_probability = float(self.mlm_probability)
self.mask_replace_prob = float(self.mask_replace_prob)
self.random_replace_prob = float(self.random_replace_prob)
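# Illustrative usage (a sketch, not part of this module): the checks above run at construction time,
# so an out-of-range probability fails fast.
#
#     from transformers import AutoTokenizer, DataCollatorForLanguageModeling
#     tok = AutoTokenizer.from_pretrained("bert-base-uncased")
#     collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=True, mlm_probability=0.15)  # ok
#     DataCollatorForLanguageModeling(tokenizer=tok, mlm=True, mlm_probability=1.5)  # raises ValueError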

View File

@ -27,6 +27,7 @@ deps = {
"huggingface-hub": "huggingface-hub>=0.30.0,<1.0",
"importlib_metadata": "importlib_metadata",
"ipadic": "ipadic>=1.0.0,<2.0",
"isort": "isort>=5.5.4",
"jax": "jax>=0.4.1,<=0.4.13",
"jaxlib": "jaxlib>=0.4.1,<=0.4.13",
"jieba": "jieba",
@ -47,7 +48,6 @@ deps = {
"optimum-benchmark": "optimum-benchmark>=0.3.0",
"optuna": "optuna",
"optax": "optax>=0.0.8,<=0.1.4",
"pandas": "pandas<2.3.0",
"packaging": "packaging>=20.0",
"parameterized": "parameterized",
"phonemizer": "phonemizer",

View File

@ -25,7 +25,6 @@ from enum import Enum
from functools import partial
from typing import Deque, Dict, List, Optional, Set, Tuple, Union
from tokenizers import Tokenizer
import torch
import torch.nn as nn
from torch.profiler import profile, schedule, tensorboard_trace_handler
@ -34,7 +33,6 @@ from tqdm import tqdm
from ..cache_utils import Cache
from ..configuration_utils import PretrainedConfig
from ..generation.configuration_utils import GenerationConfig
from ..utils.continuous_batching_visualizer import ContinuousBatchingVisualizer
from ..utils.metrics import ContinuousBatchProcessorMetrics, attach_tracer, traced
@ -1104,7 +1102,6 @@ class ContinuousBatchingManager:
self.profile = getattr(generation_config, "profile", False)
self.manual_eviction = manual_eviction
self.batch_processor: Optional[ContinuousBatchProcessor] = None
self.visualizer = None
@traced
def start(self):
@ -1154,12 +1151,6 @@ class ContinuousBatchingManager:
logger.info("Continuous Batching Manager stopped.")
self._generation_thread = None
def set_tokenizer(self, tokenizer: Tokenizer):
self.tokenizer = tokenizer
def set_visualizer(self, visualizer: ContinuousBatchingVisualizer):
self.visualizer = visualizer
def add_request(
self, input_ids: List[int], request_id: Optional[str] = None, max_new_tokens: Optional[int] = None
) -> str:
@ -1321,13 +1312,13 @@ class ContinuousBatchingManager:
record_shapes=False,
with_stack=True,
) as prof:
while not self.stop_event.is_set():
while not self.stop_event.is_set() or batch_processor.has_pending_requests():
self._inner_generation_loop(batch_processor, is_first)
if is_first:
is_first = False
prof.step()
else:
while not self.stop_event.is_set():
while not self.stop_event.is_set() or batch_processor.has_pending_requests():
self._inner_generation_loop(batch_processor, is_first)
if is_first:
is_first = False
@ -1343,10 +1334,6 @@ class ContinuousBatchingManager:
if torch.cuda.is_available():
torch.cuda.synchronize()
batch_processor.prepare_next_batch()
if self.visualizer is not None:
viz_data = self._collect_visualization_data(batch_processor)
self.visualizer.draw(viz_data)
self.visualizer.wait_for_input()
if torch.cuda.is_available() and self.use_cuda_graph:
if is_first:
self.warmup(batch_processor)
@ -1396,51 +1383,6 @@ class ContinuousBatchingManager:
if self.batch_processor is not None:
self.batch_processor.scheduler.finish_request(request_id)
def _collect_visualization_data(self, batch_processor: ContinuousBatchProcessor) -> Dict:
"""Collect data for visualization."""
data = {
"batch_contents": [],
"words": [],
"request_ids_per_token": [],
}
data["attention_mask"] = batch_processor.attention_mask.clone()
# Collect all tokens and map them to request IDs
all_tokens = []
all_request_ids = []
for req in batch_processor.requests_in_batch:
if self.tokenizer is not None:
decoded = self.tokenizer.decode(req.prompt_ids)
decoded_tokens_list = self.tokenizer.convert_ids_to_tokens(req.prompt_ids)
data["batch_contents"].append({"request_id": req.request_id, "decoded": decoded, "decoded_tokens": decoded_tokens_list})
all_tokens.extend(decoded_tokens_list)
else:
data["batch_contents"].append({"request_id": req.request_id, "tokens": req.prompt_ids})
# Convert token IDs to strings when no tokenizer is available
all_tokens.extend([str(token_id) for token_id in req.prompt_ids])
# Map each token to its request ID
all_request_ids.extend([req.request_id] * len(req.prompt_ids))
data["words"] = all_tokens
data["request_ids_per_token"] = all_request_ids
# Add cache statistics if available
if hasattr(batch_processor, 'cache'):
cache = batch_processor.cache
data["paged_attention_cache"] = {
"total_blocks": cache.num_blocks,
"used_blocks": cache.num_blocks - len(cache._free_blocks),
"free_blocks": len(cache._free_blocks),
"block_size": cache.block_size,
"num_heads": cache.num_key_value_heads,
"head_dim": cache.head_dim,
"utilization": (cache.num_blocks - len(cache._free_blocks)) / cache.num_blocks if cache.num_blocks > 0 else 0.0
}
return data
class ContinuousMixin:
"""Mixin class for models to add continuous batching capabilities."""
@ -1489,8 +1431,6 @@ class ContinuousMixin:
inputs: List[List[int]],
generation_config: Optional[GenerationConfig] = None,
progress_bar: bool = True,
enable_visualizer: bool = False,
tokenizer: Optional[Tokenizer] = None,
**kwargs,
) -> List[List[int]]:
"""Generate sequences for a batch of prompts using continuous batching.
@ -1498,8 +1438,6 @@ class ContinuousMixin:
Args:
inputs: List of input token sequences (prompts)
generation_config: Optional generation configuration
progress_bar: Whether to show a progress bar during generation
visualizer: Whether to visualize the continuous batching process
**kwargs: Additional generation parameters
Returns:
@ -1516,37 +1454,29 @@ class ContinuousMixin:
results = {}
num_requests = len(inputs)
try:
if enable_visualizer:
manager.add_requests(inputs, **kwargs)
visualizer = ContinuousBatchingVisualizer()
if tokenizer is not None:
manager.set_tokenizer(tokenizer)
manager.set_visualizer(visualizer)
visualizer.run()
else:
from tqdm.contrib.logging import logging_redirect_tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
with logging_redirect_tqdm([logger]):
with tqdm(
total=num_requests,
disable=(not progress_bar),
desc=f"Solving {num_requests} requests",
unit="request",
) as pbar:
manager.add_requests(inputs, **kwargs)
finished_count = 0
while finished_count < num_requests:
result = manager.get_result(timeout=1)
if result:
req_id = result.request_id
if result.status == RequestStatus.FINISHED:
results[req_id] = result
finished_count += 1
pbar.update(1)
else:
if not manager.is_running():
logger.error("Generation thread terminated unexpectedly.")
break
with logging_redirect_tqdm([logger]):
with tqdm(
total=num_requests,
disable=(not progress_bar),
desc=f"Solving {num_requests} requests",
unit="request",
) as pbar:
manager.add_requests(inputs, **kwargs)
finished_count = 0
while finished_count < num_requests:
result = manager.get_result(timeout=1)
if result:
req_id = result.request_id
if result.status == RequestStatus.FINISHED:
results[req_id] = result
finished_count += 1
pbar.update(1)
else:
if not manager.is_running():
logger.error("Generation thread terminated unexpectedly.")
break
except Exception as e:
logger.error(f"Error during batch generation: {e}", exc_info=True)

View File

@ -15,8 +15,7 @@
import inspect
import math
from collections.abc import Iterable
from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Tuple, Union
import numpy as np
import torch

View File

@ -1976,7 +1976,6 @@ class GenerationMixin(ContinuousMixin):
and "jamba" not in self.__class__.__name__.lower()
and "zamba" not in self.__class__.__name__.lower()
and "bamba" not in self.__class__.__name__.lower()
and "minimax" not in self.__class__.__name__.lower()
)
def _prepare_cache_for_generation(

View File

@ -458,16 +458,11 @@ def deepspeed_init(trainer, num_training_steps, inference=False):
model_parameters = None
else:
trainer.optimizer = None # important for when deepspeed_init is used as re-init
deepspeed_tp_size = hf_deepspeed_config.config.get("tensor_parallel", {}).get("autotp_size", 1)
if deepspeed_tp_size > 1:
tp_size = hf_deepspeed_config.config.get("tensor_parallel", {}).get("autotp_size", 0)
if tp_size > 1:
import deepspeed
model = deepspeed.tp_model_init(
model=model,
tp_size=deepspeed_tp_size,
dtype=hf_deepspeed_config.dtype(),
config=hf_deepspeed_config.config,
)
model = deepspeed.tp_model_init(model=model, tp_size=tp_size, dtype=hf_deepspeed_config.dtype())
model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer, lr_scheduler = deepspeed_optim_sched(
trainer, hf_deepspeed_config, args, num_training_steps, model_parameters

View File

@ -23,7 +23,6 @@ if is_torch_npu_available():
import torch_npu
from einops import rearrange, repeat
from torch_npu import npu_rotary_mul
# FlashAttention2 is supported on Ascend NPU with down-right aligned causal mask by default.
@ -248,19 +247,3 @@ def npu_flash_attn_varlen_func(
)[0]
return output
def npu_apply_rotary_emb(x, cos, sin, **kwargs):
# cos tensor after chunk should be repeated through chunked dimension to original shape on Ascend NPU
if len(cos.shape) == 2 and cos.shape[-1] == x.shape[-1] // 2:
cos = cos.repeat(1, 2)
# cos tensor with [S,D] shape should be unsqueezed to 4-d tensor with shape [1,S,1,D]
cos = cos.unsqueeze(0).unsqueeze(2)
# sin tensor after chunk should be repeated through chunked dimension to original shape on Ascend NPU
if len(sin.shape) == 2 and sin.shape[-1] == x.shape[-1] // 2:
sin = sin.repeat(1, 2)
# sin tensor with [S,D] shape should be unsqueezed to 4-d tensor with shape [1,S,1,D]
sin = sin.unsqueeze(0).unsqueeze(2)
return npu_rotary_mul(x, cos, sin)

View File

@ -53,6 +53,10 @@ def ForCausalLMLoss(
shift_labels: Optional[torch.Tensor] = None,
**kwargs,
) -> torch.Tensor:
if labels is None and shift_labels is None:
return None
# Upcast to float if we need to compute the loss to avoid potential precision issues
logits = logits.float()
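# Toy mirror of the early return shown in this hunk (a sketch, not this module's actual helper,
# which takes more parameters): calling the loss without labels yields None instead of raising.
#
#     import torch
#
#     def toy_causal_lm_loss(logits, labels=None, shift_labels=None):
#         if labels is None and shift_labels is None:
#             return None  # nothing to compute for pure inference calls
#         logits = logits.float()
#         if shift_labels is None:
#             shift_labels = labels[..., 1:].contiguous()
#         shift_logits = logits[..., :-1, :].contiguous()
#         return torch.nn.functional.cross_entropy(
#             shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), ignore_index=-100
#         )
#
#     print(toy_causal_lm_loss(torch.randn(1, 4, 10)))  # -> None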

View File

@ -40,8 +40,9 @@ if is_flash_attn_2_available():
# patch functions in package `flash-attn` when using flash-attention on Ascend NPU.
if is_torch_npu_available():
from torch_npu import npu_rotary_mul as apply_rotary_emb # noqa
from .integrations.npu_flash_attention import index_first_axis, pad_input, unpad_input
from .integrations.npu_flash_attention import npu_apply_rotary_emb as apply_rotary_emb # noqa
from .integrations.npu_flash_attention import npu_flash_attn_func as flash_attn_func
from .integrations.npu_flash_attention import npu_flash_attn_varlen_func as flash_attn_varlen_func

View File

@ -2277,13 +2277,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
if not isinstance(requested_attn_implementation, dict)
else requested_attn_implementation.get(key, None)
)
# For models with a backbone, the sub-config might not be initialized. Set the requested attn
# if the config hasn't got any attn pre-set and the requested attn is not `None` (i.e. not the default attn)
if (
sub_config is not None
and sub_config._attn_implementation_internal is None
and curr_attn_implementation is not None
):
# For models with a backbone, the sub-config might not be initialized
if sub_config is not None:
sub_config._attn_implementation_internal = curr_attn_implementation
if config._attn_implementation == "flash_attention_2":
@ -3755,7 +3750,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
index = None
if state_dict_split.is_sharded:
index = {
"metadata": {"total_parameters": self.num_parameters(), **state_dict_split.metadata},
"metadata": state_dict_split.metadata,
"weight_map": state_dict_split.tensor_to_filename,
}

View File

@ -185,7 +185,6 @@ if TYPE_CHECKING:
from .megatron_gpt2 import *
from .mgp_str import *
from .mimi import *
from .minimax import *
from .mistral import *
from .mistral3 import *
from .mixtral import *

View File

@ -16,7 +16,7 @@
"""ALBERT model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig

View File

@ -18,8 +18,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Iterable
from typing import List, Optional, Tuple, Union
from typing import Iterable, List, Optional, Tuple, Union
import numpy as np

View File

@ -12,8 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Iterable
from typing import Dict, List, Optional, Tuple, Union
from typing import Dict, Iterable, List, Optional, Tuple, Union
import numpy as np

View File

@ -384,8 +384,6 @@ class ASTPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_supports_sdpa = True
_supports_flash_attn_2 = True
_supports_flex_attn = True
_supports_attention_backend = True
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""

View File

@ -442,12 +442,8 @@ class _BaseAutoModelClass:
else:
repo_id = config.name_or_path
model_class = get_class_from_dynamic_module(class_ref, repo_id, **kwargs)
# This block handles the case where the user is loading a model with `trust_remote_code=True`
# but a library model exists with the same name. We don't want to override the autoclass
# mappings in this case, or all future loads of that model will be the remote code model.
if not has_local_code:
cls.register(config.__class__, model_class, exist_ok=True)
model_class.register_for_auto_class(auto_class=cls)
model_class.register_for_auto_class(auto_class=cls)
cls.register(config.__class__, model_class, exist_ok=True)
_ = kwargs.pop("code_revision", None)
model_class = add_generation_mixin_to_remote_model(model_class)
return model_class._from_config(config, **kwargs)
@ -583,12 +579,8 @@ class _BaseAutoModelClass:
class_ref, pretrained_model_name_or_path, code_revision=code_revision, **hub_kwargs, **kwargs
)
_ = hub_kwargs.pop("code_revision", None)
# This block handles the case where the user is loading a model with `trust_remote_code=True`
# but a library model exists with the same name. We don't want to override the autoclass
# mappings in this case, or all future loads of that model will be the remote code model.
if not has_local_code:
cls.register(config.__class__, model_class, exist_ok=True)
model_class.register_for_auto_class(auto_class=cls)
cls.register(config.__class__, model_class, exist_ok=True)
model_class.register_for_auto_class(auto_class=cls)
model_class = add_generation_mixin_to_remote_model(model_class)
return model_class.from_pretrained(
pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs

View File

@ -211,7 +211,6 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
("megatron-bert", "MegatronBertConfig"),
("mgp-str", "MgpstrConfig"),
("mimi", "MimiConfig"),
("minimax", "MiniMaxConfig"),
("mistral", "MistralConfig"),
("mistral3", "Mistral3Config"),
("mixtral", "MixtralConfig"),
@ -587,7 +586,6 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
("megatron_gpt2", "Megatron-GPT2"),
("mgp-str", "MGP-STR"),
("mimi", "Mimi"),
("minimax", "MiniMax"),
("mistral", "Mistral"),
("mistral3", "Mistral3"),
("mixtral", "Mixtral"),

View File

@ -170,7 +170,7 @@ else:
("vitmatte", ("VitMatteImageProcessor", "VitMatteImageProcessorFast")),
("xclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("yolos", ("YolosImageProcessor", "YolosImageProcessorFast")),
("zoedepth", ("ZoeDepthImageProcessor", "ZoeDepthImageProcessorFast")),
("zoedepth", ("ZoeDepthImageProcessor",)),
]
)

View File

@ -201,7 +201,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
("megatron-bert", "MegatronBertModel"),
("mgp-str", "MgpstrForSceneTextRecognition"),
("mimi", "MimiModel"),
("minimax", "MiniMaxModel"),
("mistral", "MistralModel"),
("mistral3", "Mistral3Model"),
("mixtral", "MixtralModel"),
@ -595,7 +594,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("mbart", "MBartForCausalLM"),
("mega", "MegaForCausalLM"),
("megatron-bert", "MegatronBertForCausalLM"),
("minimax", "MiniMaxForCausalLM"),
("mistral", "MistralForCausalLM"),
("mixtral", "MixtralForCausalLM"),
("mllama", "MllamaForCausalLM"),
@ -1108,7 +1106,6 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("mbart", "MBartForSequenceClassification"),
("mega", "MegaForSequenceClassification"),
("megatron-bert", "MegatronBertForSequenceClassification"),
("minimax", "MiniMaxForSequenceClassification"),
("mistral", "MistralForSequenceClassification"),
("mixtral", "MixtralForSequenceClassification"),
("mobilebert", "MobileBertForSequenceClassification"),
@ -1200,7 +1197,6 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
("mbart", "MBartForQuestionAnswering"),
("mega", "MegaForQuestionAnswering"),
("megatron-bert", "MegatronBertForQuestionAnswering"),
("minimax", "MiniMaxForQuestionAnswering"),
("mistral", "MistralForQuestionAnswering"),
("mixtral", "MixtralForQuestionAnswering"),
("mobilebert", "MobileBertForQuestionAnswering"),
@ -1307,7 +1303,6 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("markuplm", "MarkupLMForTokenClassification"),
("mega", "MegaForTokenClassification"),
("megatron-bert", "MegatronBertForTokenClassification"),
("minimax", "MiniMaxForTokenClassification"),
("mistral", "MistralForTokenClassification"),
("mixtral", "MixtralForTokenClassification"),
("mobilebert", "MobileBertForTokenClassification"),

View File

@ -342,13 +342,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
("mega", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
("mgp-str", ("MgpstrTokenizer", None)),
(
"minimax",
(
"GPT2Tokenizer" if is_sentencepiece_available() else None,
"GPT2TokenizerFast" if is_tokenizers_available() else None,
),
),
(
"mistral",
(

View File

@ -97,7 +97,6 @@ class AyaVisionPreTrainedModel(PreTrainedModel):
_supports_sdpa = True
_supports_quantized_cache = False
_supports_static_cache = False
_supports_flex_attn = True
_supports_attention_backend = True
def _init_weights(self, module):

View File

@ -16,8 +16,7 @@
import warnings
from collections import OrderedDict
from collections.abc import Mapping
from typing import Any, Optional
from typing import Any, Mapping, Optional
from ... import PreTrainedTokenizer
from ...configuration_utils import PretrainedConfig

View File

@ -16,7 +16,7 @@
import warnings
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from packaging import version

View File

@ -16,7 +16,7 @@
"""BERT model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig

View File

@ -15,7 +15,7 @@
"""BigBird model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig

View File

@ -15,8 +15,7 @@
"""BigBirdPegasus model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Any, Optional
from typing import Any, Mapping, Optional
from ... import PreTrainedTokenizer
from ...configuration_utils import PretrainedConfig

View File

@ -15,8 +15,7 @@
"""Blenderbot model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Any, Optional
from typing import Any, Mapping, Optional
from ... import PreTrainedTokenizer
from ...configuration_utils import PretrainedConfig

View File

@ -15,8 +15,7 @@
"""BlenderbotSmall model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Any, Optional
from typing import Any, Mapping, Optional
from ... import PreTrainedTokenizer
from ...configuration_utils import PretrainedConfig

View File

@ -1196,8 +1196,6 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
query_length if query_length is not None else query_embeds.shape[1] if query_embeds is not None else 0
)
# `Blip2QFormerModel` is kept as fp32
query_embeds = query_embeds.to(self.layernorm.weight.dtype)
embedding_output = self.layernorm(query_embeds)
embedding_output = self.dropout(embedding_output)
@ -1739,7 +1737,6 @@ class Blip2TextModelWithProjection(Blip2PreTrainedModel):
)
pooled_output = text_outputs[0] if not return_dict else text_outputs.last_hidden_state
pooled_output = pooled_output.to(dtype=self.text_projection.weight.dtype)
text_embeds = self.text_projection(pooled_output)
text_embeds = nn.functional.normalize(text_embeds, dim=-1)
@ -1840,7 +1837,6 @@ class Blip2VisionModelWithProjection(Blip2PreTrainedModel):
)
embeds = query_outputs[0] if not return_dict else query_outputs.last_hidden_state
embeds = embeds.to(dtype=self.vision_projection.weight.dtype)
image_embeds = self.vision_projection(embeds)
image_embeds = nn.functional.normalize(image_embeds, dim=-1)
@ -2399,7 +2395,6 @@ class Blip2ForImageTextRetrieval(Blip2PreTrainedModel):
return_dict=return_dict,
)
text_embeds = text_outputs[0] if not return_dict else text_outputs.last_hidden_state
text_embeds = text_embeds.to(dtype=self.itm_head.weight.dtype)
output = self.itm_head(text_embeds[:, : query_tokens.size(1), :])
logits_per_image = output.mean(dim=1)
@ -2413,7 +2408,6 @@ class Blip2ForImageTextRetrieval(Blip2PreTrainedModel):
return_dict=return_dict,
)
image_embeds = query_outputs[0] if not return_dict else query_outputs.last_hidden_state
image_embeds = image_embeds.to(dtype=self.vision_projection.weight.dtype)
query_embeds = self.embeddings(
input_ids=input_ids,
@ -2425,7 +2419,6 @@ class Blip2ForImageTextRetrieval(Blip2PreTrainedModel):
return_dict=return_dict,
)
question_embeds = text_outputs[0] if not return_dict else text_outputs.last_hidden_state
question_embeds = question_embeds.to(dtype=self.text_projection.weight.dtype)
# normalized features
image_embeds = nn.functional.normalize(self.vision_projection(image_embeds), dim=-1)


@ -15,8 +15,7 @@
"""Bloom configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, List, Optional
from typing import TYPE_CHECKING, Any, List, Mapping, Optional
from packaging import version


@ -14,8 +14,7 @@
# limitations under the License.
"""Image processor class for BridgeTower."""
from collections.abc import Iterable
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np


@ -14,8 +14,7 @@
# limitations under the License.
"""Fast Image processor class for BridgeTower."""
from collections.abc import Iterable
from typing import Dict, Optional, Tuple, Union
from typing import Dict, Iterable, Optional, Tuple, Union
from ...image_processing_utils_fast import (
BaseImageProcessorFast,


@ -16,7 +16,7 @@
"""CamemBERT configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig


@ -830,7 +830,6 @@ class ChameleonPreTrainedModel(PreTrainedModel):
_supports_cache_class = True
_supports_static_cache = True
_supports_param_buffer_assignment = False
_supports_flex_attn = True
_supports_attention_backend = True
def _init_weights(self, module):
@ -1243,6 +1242,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixin):
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
**kwargs: Unpack[KwargsForCausalLM],
) -> Union[Tuple, CausalLMOutputWithPast]:
@ -1276,6 +1276,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixin):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
@ -1288,7 +1289,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixi
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
return_dict=return_dict,
cache_position=cache_position,
**kwargs,
)


@ -15,8 +15,7 @@
"""Chinese-CLIP model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Mapping, Optional
if TYPE_CHECKING:


@ -15,8 +15,7 @@
"""CLIP model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Mapping, Optional
if TYPE_CHECKING:


@ -450,8 +450,6 @@ class CLIPPreTrainedModel(PreTrainedModel):
supports_gradient_checkpointing = True
_supports_sdpa = True
_supports_flash_attn_2 = True
_supports_flex_attn = True
_supports_attention_backend = True
def _init_weights(self, module):
"""Initialize the weights"""


@ -15,8 +15,7 @@
"""CodeGen model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Any, List, Optional
from typing import Any, List, Mapping, Optional
from ... import PreTrainedTokenizer, TensorType, is_torch_available
from ...configuration_utils import PretrainedConfig


@ -162,8 +162,7 @@ class ColPaliForRetrieval(ColPaliPreTrainedModel):
# L2 normalization
embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True) # (batch_size, sequence_length, dim)
if attention_mask is not None:
embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim)
embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim)
return ColPaliForRetrievalOutput(
embeddings=embeddings,


@ -15,7 +15,7 @@
"""Conditional DETR model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from packaging import version

View File

@ -17,8 +17,7 @@
import io
import pathlib
from collections import defaultdict
from collections.abc import Iterable
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
import numpy as np

View File

@ -15,7 +15,7 @@
"""ConvBERT model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig


@ -15,7 +15,7 @@
"""ConvNeXT model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from packaging import version


@ -15,7 +15,7 @@
"""Data2VecText configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig


@ -15,7 +15,7 @@
"""Data2VecVision model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from packaging import version


@ -15,8 +15,7 @@
"""DeBERTa model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, Optional, Union
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig


@ -17,8 +17,7 @@
from __future__ import annotations
import math
from collections.abc import Sequence
from typing import Dict, Optional, Tuple, Union
from typing import Dict, Optional, Sequence, Tuple, Union
import numpy as np
import tensorflow as tf


@ -15,8 +15,7 @@
"""DeBERTa-v2 model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, Optional, Union
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig


@ -17,8 +17,7 @@
import io
import pathlib
from collections import defaultdict
from collections.abc import Iterable
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
import numpy as np


@ -15,7 +15,7 @@
"""DeiT model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from packaging import version


@ -450,8 +450,6 @@ class DeiTPreTrainedModel(PreTrainedModel):
_no_split_modules = ["DeiTLayer"]
_supports_sdpa = True
_supports_flash_attn_2 = True
_supports_flex_attn = True
_supports_attention_backend = True
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""


@ -15,8 +15,7 @@
"""Image processor class for Deformable DETR."""
import pathlib
from collections.abc import Iterable
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np

View File

@ -1,8 +1,7 @@
# Copyright (c) Microsoft Corporation and HuggingFace
# Licensed under the MIT License.
from collections.abc import Mapping
from typing import Any, Dict, List
from typing import Any, Dict, List, Mapping
import numpy as np
import torch


@ -15,8 +15,7 @@
"""PyTorch Graphormer model."""
import math
from collections.abc import Iterable, Iterator
from typing import List, Optional, Tuple, Union
from typing import Iterable, Iterator, List, Optional, Tuple, Union
import torch
import torch.nn as nn


@ -15,7 +15,7 @@
"""MEGA configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from ....configuration_utils import PretrainedConfig
from ....onnx import OnnxConfig


@ -15,7 +15,7 @@
"""DETR model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from packaging import version


@ -17,8 +17,7 @@
import io
import pathlib
from collections import defaultdict
from collections.abc import Iterable
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
import numpy as np


@ -15,7 +15,7 @@
"""DINOv2 model configuration"""
from collections import OrderedDict
from collections.abc import Mapping
from typing import Mapping
from packaging import version


@ -491,11 +491,9 @@ class Dinov2PreTrainedModel(PreTrainedModel):
base_model_prefix = "dinov2"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
_no_split_modules = ["Dinov2Layer"]
_no_split_modules = ["Dinov2SwiGLUFFN"]
_supports_sdpa = True
_supports_flash_attn_2 = True
_supports_flex_attn = True
_supports_attention_backend = True
def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
"""Initialize the weights"""

Some files were not shown because too many files have changed in this diff.