Compare commits


2 Commits

SHA1 Message Date
756c2edae2 run_ci_manually 2024-11-15 22:01:07 +01:00
3441993d3b run_ci_manually 2024-11-15 19:12:25 +01:00
159 changed files with 1977 additions and 7598 deletions

View File

@@ -3,7 +3,7 @@ name: Build docker images (scheduled)
on:
push:
branches:
- test_docker_run_quantization
- build_ci_docker_image*
repository_dispatch:
workflow_call:
inputs:
@@ -18,341 +18,341 @@ concurrency:
cancel-in-progress: false
jobs:
# latest-docker:
# name: "Latest PyTorch + TensorFlow [dev]"
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-all-latest-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
# # Push CI images still need to be re-built daily
# -
# name: Build and push (for Push CI) on a daily basis
# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
# if: inputs.image_postfix != '-push-ci'
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-all-latest-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-all-latest-gpu-push-ci
latest-docker:
name: "Latest PyTorch + TensorFlow [dev]"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-all-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) on a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-all-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-all-latest-gpu-push-ci
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# latest-torch-deepspeed-docker:
# name: "Latest PyTorch + DeepSpeed"
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-deepspeed-latest-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
latest-torch-deepspeed-docker:
name: "Latest PyTorch + DeepSpeed"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
# latest-torch-deepspeed-docker-for-push-ci-daily-build:
# name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# # Push CI images still need to be re-built daily
# -
# name: Build and push (for Push CI) on a daily basis
# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
# if: inputs.image_postfix != '-push-ci'
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-deepspeed-latest-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
# Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
latest-torch-deepspeed-docker-for-push-ci-daily-build:
name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) on a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# doc-builder:
# name: "Doc builder"
# # Push CI doesn't need this image
# if: inputs.image_postfix != '-push-ci'
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-doc-builder
# push: true
# tags: huggingface/transformers-doc-builder
doc-builder:
name: "Doc builder"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-doc-builder
push: true
tags: huggingface/transformers-doc-builder
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the huggingface/transformers-doc-builder docker build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-doc-builder docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# latest-pytorch:
# name: "Latest PyTorch [dev]"
# # Push CI doesn't need this image
# if: inputs.image_postfix != '-push-ci'
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-gpu
latest-pytorch:
name: "Latest PyTorch [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-gpu
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the huggingface/transformers-pytorch-gpu docker build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-gpu docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# latest-pytorch-amd:
# name: "Latest PyTorch (AMD) [dev]"
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-amd-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
# # Push CI images still need to be re-built daily
# -
# name: Build and push (for Push CI) on a daily basis
# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
# if: inputs.image_postfix != '-push-ci'
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-amd-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-amd-gpu-push-ci
latest-pytorch-amd:
name: "Latest PyTorch (AMD) [dev]"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) on a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-amd-gpu-push-ci
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# latest-tensorflow:
# name: "Latest TensorFlow [dev]"
# # Push CI doesn't need this image
# if: inputs.image_postfix != '-push-ci'
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-tensorflow-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-tensorflow-gpu
latest-tensorflow:
name: "Latest TensorFlow [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-tensorflow-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-tensorflow-gpu
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# latest-pytorch-deepspeed-amd:
# name: "PyTorch + DeepSpeed (AMD) [dev]"
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-deepspeed-amd-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
# # Push CI images still need to be re-built daily
# -
# name: Build and push (for Push CI) on a daily basis
# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
# if: inputs.image_postfix != '-push-ci'
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-deepspeed-amd-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
latest-pytorch-deepspeed-amd:
name: "PyTorch + DeepSpeed (AMD) [dev]"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) on a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-quantization-torch-docker:
name: "Latest Pytorch + Quantization [dev]"

View File

@@ -7,7 +7,7 @@ on:
- cron: "17 2 * * *"
push:
branches:
- run_scheduled_ci*
- run_ci_manually
jobs:
model-ci:
@@ -20,59 +20,3 @@ jobs:
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
secrets: inherit
torch-pipeline:
name: Torch pipeline CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_pipelines_torch_gpu
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
runner: daily-ci
docker: huggingface/transformers-pytorch-gpu
ci_event: Daily CI
secrets: inherit
tf-pipeline:
name: TF pipeline CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_pipelines_tf_gpu
slack_report_channel: "#transformers-ci-daily-pipeline-tf"
runner: daily-ci
docker: huggingface/transformers-tensorflow-gpu
ci_event: Daily CI
secrets: inherit
example-ci:
name: Example CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_examples_gpu
slack_report_channel: "#transformers-ci-daily-examples"
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
secrets: inherit
deepspeed-ci:
name: DeepSpeed CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_torch_cuda_extensions_gpu
slack_report_channel: "#transformers-ci-daily-deepspeed"
runner: daily-ci
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
ci_event: Daily CI
working-directory-prefix: /workspace
secrets: inherit
quantization-ci:
name: Quantization CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_quantization_torch_gpu
slack_report_channel: "#transformers-ci-daily-quantization"
runner: daily-ci
docker: huggingface/transformers-quantization-latest-gpu
ci_event: Daily CI
secrets: inherit

View File

@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
# The following `ARG`s are mainly used to specify the versions explicitly & directly in this docker file, and are not meant
# to be used as arguments for docker build (so far).
ARG PYTORCH='2.5.1'
ARG PYTORCH='2.4.1'
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu118'

View File

@@ -218,8 +218,6 @@
title: CPU inference
- local: perf_infer_gpu_one
title: GPU inference
- local: perf_infer_gpu_multi
title: Multi-GPU inference
title: Optimizing inference
- local: big_models
title: Instantiate a big model
@@ -516,8 +514,6 @@
title: Nyströmformer
- local: model_doc/olmo
title: OLMo
- local: model_doc/olmo_1124
title: OLMo November 2024
- local: model_doc/olmoe
title: OLMoE
- local: model_doc/open-llama

View File

@@ -416,6 +416,16 @@ Assisted decoding assumes the main and assistant models have the same tokenizer,
Currently, only greedy search and sampling are supported with assisted decoding, and assisted decoding doesn't support batched inputs.
To learn more about assisted decoding, check [this blog post](https://huggingface.co/blog/assisted-generation).
#### Universal Assisted Decoding
Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers.
To use it, simply pass the tokenizers using the `tokenizer` and `assistant_tokenizer` arguments (see below).
Internally, the main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are
in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above.
The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer.
Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings,
to ensure the new tokens include the correct prompt suffix.
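To make the re-encoding step concrete, here is a rough sketch of the idea (an illustration only, not the actual implementation; the helper names are made up):
```python
from difflib import SequenceMatcher

def reencode(ids, source_tokenizer, target_tokenizer):
    """Re-encode token ids from one tokenizer's vocabulary into another's."""
    text = source_tokenizer.decode(ids, skip_special_tokens=True)
    return target_tokenizer.encode(text, add_special_tokens=False)

def align_on_longest_match(source_ids, target_ids):
    """Keep the target tokens covered by the longest common block, so the
    candidate sequence continues from the correct prompt suffix."""
    match = SequenceMatcher(None, source_ids, target_ids).find_longest_match(
        0, len(source_ids), 0, len(target_ids)
    )
    return target_ids[match.b : match.b + match.size]
```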
To enable assisted decoding, set the `assistant_model` argument with a model.
```python
@@ -435,6 +445,26 @@ To enable assisted decoding, set the `assistant_model` argument with a model.
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
If the main and assistant models have different tokenizers, use Universal Assisted Decoding.
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> prompt = "Alice and Bob"
>>> checkpoint = "google/gemma-2-9b"
>>> assistant_checkpoint = "double7/vicuna-68m"
>>> assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_checkpoint)
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
>>> outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=tokenizer, assistant_tokenizer=assistant_tokenizer)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness,
just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency.
@@ -456,63 +486,9 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t
['Alice and Bob, a couple of friends of mine, who are both in the same office as']
```
#### Universal Assisted Decoding
Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers.
To use it, simply pass the tokenizers using the `tokenizer` and `assistant_tokenizer` arguments (see below).
Internally, the main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are
in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above.
The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer.
Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings,
to ensure the new tokens include the correct prompt suffix.
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> prompt = "Alice and Bob"
>>> checkpoint = "google/gemma-2-9b"
>>> assistant_checkpoint = "double7/vicuna-68m"
>>> assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_checkpoint)
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
>>> outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=tokenizer, assistant_tokenizer=assistant_tokenizer)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
#### Prompt Lookup
Alternatively, you can also set `prompt_lookup_num_tokens` to trigger n-gram-based assisted decoding, as opposed
to model-based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259).
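A minimal usage sketch (the checkpoint is illustrative; any decoder-only model works):
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer

>>> checkpoint = "gpt2"  # illustrative checkpoint
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> inputs = tokenizer("Alice and Bob", return_tensors="pt")
>>> outputs = model.generate(**inputs, prompt_lookup_num_tokens=3, max_new_tokens=20)
```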
#### Self-Speculative Decoding
An LLM can be trained to also use its language modeling head with earlier hidden states as input, effectively
skipping layers to yield a lower-quality output -- a technique called early exiting.
We use the lower-quality early exit output as an assistant output, and apply self-speculation to fix the output using the remaining layers. The final generation of that self-speculative solution is the same (or has the same distribution) as the original model's generation.
If the model you're using was trained to do early exit, you can pass
`assistant_early_exit` (integer). In this case, the assistant model will be the same model but exiting early, hence the
"self-speculative" name. Because the assistant model is a portion of the target model, caches and weights can be shared, which results in lower memory requirements. As in other assisted generation methods, the final generated result has the same quality as if no assistant had been used.
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> prompt = "Alice and Bob"
>>> checkpoint = "facebook/layerskip-llama3.2-1B"
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_new_tokens=20)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
### DoLa Decoding
**D**ecoding by C**o**ntrasting **La**yers (DoLa) is a contrastive decoding strategy to improve the factuality and reduce the

View File

@@ -87,7 +87,6 @@ For now the supported model architectures are the architectures that have been v
- Starcoder2
- T5
- Mamba
- Nemotron
## Example usage

View File

@@ -240,7 +240,6 @@ Flax), PyTorch, and/or TensorFlow.
| [Nougat](model_doc/nougat) | ✅ | ✅ | ✅ |
| [Nyströmformer](model_doc/nystromformer) | ✅ | ❌ | ❌ |
| [OLMo](model_doc/olmo) | ✅ | ❌ | ❌ |
| [OLMo November 2024](model_doc/olmo_1124) | ✅ | ❌ | ❌ |
| [OLMoE](model_doc/olmoe) | ✅ | ❌ | ❌ |
| [OmDet-Turbo](model_doc/omdet-turbo) | ✅ | ❌ | ❌ |
| [OneFormer](model_doc/oneformer) | ✅ | ❌ | ❌ |

View File

@@ -40,10 +40,6 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/5
- BLIP-2 can be used for conditional text generation given an image and an optional text prompt. At inference time, it's recommended to use the [`generate`] method.
- One can use [`Blip2Processor`] to prepare images for the model, and decode the predicted token IDs back to text.
> [!NOTE]
> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expanding the model embeddings layer to add the special `<image>` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `<image>` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config as `model.config.num_query_tokens`, and the model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042).
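A rough sketch of the steps described in the note (illustrative only; the linked gist has the exact procedure, and the checkpoint here is just an example):
```python
from transformers import AutoProcessor, Blip2ForConditionalGeneration

checkpoint = "Salesforce/blip2-opt-2.7b"  # example checkpoint
processor = AutoProcessor.from_pretrained(checkpoint)
model = Blip2ForConditionalGeneration.from_pretrained(checkpoint)

# Attach the attribute the warning asks for, read from the model config
processor.num_query_tokens = model.config.num_query_tokens

# Add the special `<image>` token and resize the embeddings to match
processor.tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
model.resize_token_embeddings(len(processor.tokenizer))
```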
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BLIP-2.

View File

@@ -54,12 +54,6 @@ If you're interested in submitting a resource to be included here, please feel f
- preprocess
- post_process_object_detection
## DeformableDetrImageProcessorFast
[[autodoc]] DeformableDetrImageProcessorFast
- preprocess
- post_process_object_detection
## DeformableDetrFeatureExtractor
[[autodoc]] DeformableDetrFeatureExtractor

View File

@@ -33,10 +33,6 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m
InstructBLIP uses the same architecture as [BLIP-2](blip2) with a tiny but important difference: it also feeds the text prompt (instruction) to the Q-Former.
> [!NOTE]
> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expanding the model embeddings layer to add the special `<image>` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `<image>` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config as `model.config.num_query_tokens`, and the model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042).
## InstructBlipConfig
[[autodoc]] InstructBlipConfig

View File

@@ -35,10 +35,6 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m
- The model was trained by sampling 4 frames per video, so it's recommended to sample 4 frames
> [!NOTE]
> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expanding the model embeddings layer to add the special `<image>` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `<image>` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config as `model.config.num_query_tokens`, and the model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042).
## InstructBlipVideoConfig
[[autodoc]] InstructBlipVideoConfig

View File

@@ -40,13 +40,6 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/
- Note that the model has not been explicitly trained to process multiple images in the same prompt; although this is technically possible, you may experience inaccurate results.
> [!NOTE]
> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
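A minimal sketch of adding those attributes (illustrative only; `num_additional_image_tokens = 1` assumes the vision backbone adds a CLS token):
```python
from transformers import AutoProcessor, LlavaForConditionalGeneration

checkpoint = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(checkpoint)
processor = AutoProcessor.from_pretrained(checkpoint)

# Read the values from the model config, as described above
processor.patch_size = model.config.vision_config.patch_size
processor.vision_feature_select_strategy = model.config.vision_feature_select_strategy
processor.num_additional_image_tokens = 1  # assumption: the vision backbone adds a CLS token
```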
### Single image inference
For best results, we recommend using the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history; passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries for the "text" and "image" modalities, as follows:
@@ -92,10 +85,10 @@ LLaVa also supports batched inference. Here is how you can do it:
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import AutoProcessor, LLavaForConditionalGeneration
# Load the model in half-precision
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto")
model = LLavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
# Get two different images

View File

@@ -53,12 +53,6 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/
</Tip>
> [!NOTE]
> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history; passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries for the "text" and "image" modalities. Below is an example of how to do that and the list of formats accepted by each checkpoint.
We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows:

View File

@@ -50,12 +50,6 @@ The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tre
</Tip>
> [!NOTE]
> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the tokenizer's `apply_chat_template` to format your prompts correctly. Below is an example of how to do that.
We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. Each content field has to be a list of dicts, as follows:

View File

@@ -1,46 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# OLMo November 2024
## Overview
The OLMo November 2024 model is a successor of the OLMo model, which was proposed in
[OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838).
The architectural changes from the original OLMo model to this model (see the sketch after this list) are:
- RMSNorm is used instead of standard layer norm.
- Norm is applied to attention queries and keys.
- Norm is applied after attention/feedforward layers rather than before.
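A minimal PyTorch sketch of a decoder block with these three changes (an illustration of the ideas only, not the official OLMo code; assumes a recent PyTorch with `nn.RMSNorm`):
```python
import torch
from torch import nn

class PostNormBlock(nn.Module):
    def __init__(self, dim: int, num_heads: int):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.q_norm = nn.RMSNorm(dim)     # RMSNorm applied to attention queries
        self.k_norm = nn.RMSNorm(dim)     # RMSNorm applied to attention keys
        self.attn_norm = nn.RMSNorm(dim)  # norm applied *after* attention
        self.ff = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))
        self.ff_norm = nn.RMSNorm(dim)    # norm applied *after* the feedforward

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        q, k = self.q_norm(x), self.k_norm(x)
        attn_out, _ = self.attn(q, k, x, need_weights=False)
        x = x + self.attn_norm(attn_out)  # post-norm residual connection
        return x + self.ff_norm(self.ff(x))
```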
This model was contributed by [shanearora](https://huggingface.co/shanearora).
The original code can be found [here](https://github.com/allenai/OLMo/tree/main/olmo).
## Olmo1124Config
[[autodoc]] Olmo1124Config
## Olmo1124Model
[[autodoc]] Olmo1124Model
- forward
## Olmo1124ForCausalLM
[[autodoc]] Olmo1124ForCausalLM
- forward

View File

@@ -57,7 +57,7 @@ Initially, an image is processed using a pre-trained convolutional neural networ
>>> with torch.no_grad():
... outputs = model(**inputs)
>>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3)
>>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3)
>>> for result in results:
... for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):

View File

@@ -54,12 +54,6 @@ This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanT
The original code can be found [here](https://github.com/PKU-YuanGroup/Video-LLaVA).
> [!NOTE]
> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
## Usage example
### Single Media Mode

View File

@@ -39,12 +39,6 @@ This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada)
- Note that the model has not been explicitly trained to process multiple images in the same prompt; although this is technically possible, you may experience inaccurate results.
> [!NOTE]
> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
- For better results, we recommend using the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history; passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries for the "text" and "image" modalities, as follows:
```python

View File

@@ -1,68 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Multi-GPU inference
Built-in Tensor Parallelism (TP) is now available with certain models using PyTorch. Tensor parallelism shards a model onto multiple GPUs, enabling larger model sizes, and parallelizes computations such as matrix multiplication.
To enable tensor parallelism, pass the argument `tp_plan="auto"` to [`~AutoModelForCausalLM.from_pretrained`]:
```python
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# Initialize distributed
rank = int(os.environ["RANK"])
device = torch.device(f"cuda:{rank}")
torch.distributed.init_process_group("nccl", device_id=device)
# Retrieve tensor parallel model
model = AutoModelForCausalLM.from_pretrained(
model_id,
tp_plan="auto",
)
# Prepare input tokens
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = "Can I help"
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
# Distributed run
outputs = model(inputs)
```
You can use `torchrun` to launch the above script with multiple processes, each mapping to a GPU:
```
torchrun --nproc-per-node 4 demo.py
```
PyTorch tensor parallel is currently supported for the following models:
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
You can request to add tensor parallel support for another model by opening a GitHub Issue or Pull Request.
### Expected speedups
You can benefit from considerable speedups for inference, especially for inputs with large batch size or long sequences.
For a single forward pass on [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) with a sequence length of 512 and various batch sizes, the expected speedup is as follows:
<div style="text-align: center">
<img src="huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Meta-Llama-3-8B-Instruct, seqlen = 512, python, w_ compile.png">
</div>

View File

@@ -77,7 +77,6 @@ FlashAttention-2 is currently supported for the following architectures:
* [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron)
* [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
* [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model)
* [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
* [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel)
* [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)
@@ -261,7 +260,6 @@ For now, Transformers supports SDPA inference and training for the following arc
* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
* [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
* [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model)
* [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
* [OPT](https://huggingface.co/docs/transformers/en/model_doc/opt)
* [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)

View File

@@ -53,7 +53,7 @@ sections we go through the steps to run inference on CPU and single/multi-GPU se
* [Inference on a single CPU](perf_infer_cpu)
* [Inference on a single GPU](perf_infer_gpu_one)
* [Multi-GPU inference](perf_infer_gpu_multi)
* [Multi-GPU inference](perf_infer_gpu_one)
* [XLA Integration for TensorFlow Models](tf_xla)

View File

@@ -45,19 +45,19 @@ In short, supporting a wide range of quantization methods allows you to pick the
Use the table below to help you decide which quantization method to use.
| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | Intel GPU | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library |
|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-----------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------|
| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
| [AWQ](./awq) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🟡 * | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes |
| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | 🔴 | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library |
|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------|
| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
| [AWQ](./awq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes |
| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [Quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/quanto |
| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
<Tip>

View File

@@ -14,21 +14,21 @@ rendered properly in your Markdown viewer.
-->
# Optimum-quanto
# Quanto
<Tip>
Try optimum-quanto + transformers with this [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing)!
Try Quanto + transformers with this [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing)!
</Tip>
The [🤗 optimum-quanto](https://github.com/huggingface/optimum-quanto) library is a versatile PyTorch quantization toolkit. The quantization method used is linear quantization. Quanto provides several unique features such as:
The [🤗 Quanto](https://github.com/huggingface/quanto) library is a versatile PyTorch quantization toolkit. The quantization method used is linear quantization. Quanto provides several unique features such as:
- weights quantization (`float8`,`int8`,`int4`,`int2`)
- activation quantization (`float8`,`int8`)
- modality agnostic (e.g. CV, LLM)
- device agnostic (e.g. CUDA, XPU, MPS, CPU)
- device agnostic (e.g. CUDA, MPS, CPU)
- compatibility with `torch.compile`
- easy to add custom kernel for specific device
- supports quantization aware training
@@ -37,12 +37,12 @@ Try optimum-quanto + transformers with this [notebook](https://colab.research.go
Before you begin, make sure the following libraries are installed:
```bash
pip install optimum-quanto accelerate transformers
pip install quanto accelerate transformers
```
Now you can quantize a model by passing a [`QuantoConfig`] object to the [`~PreTrainedModel.from_pretrained`] method. This works for any model in any modality, as long as it contains `torch.nn.Linear` layers.
The integration with transformers only supports weights quantization. For more complex use cases such as activation quantization, calibration, and quantization-aware training, you should use the [optimum-quanto](https://github.com/huggingface/optimum-quanto) library instead.
The integration with transformers only supports weights quantization. For more complex use cases such as activation quantization, calibration, and quantization-aware training, you should use the [quanto](https://github.com/huggingface/quanto) library instead.
```py
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
@@ -55,7 +55,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cud
Note that serialization is not yet supported with transformers, but it is coming soon! If you want to save the model, you can use the quanto library instead.
The optimum-quanto library uses a linear quantization algorithm. Even though this is a basic quantization technique, we get very good results! Have a look at the following benchmark (llama-2-7b on the perplexity metric). You can find more benchmarks [here](https://github.com/huggingface/optimum-quanto/tree/main/bench/generation)
The Quanto library uses a linear quantization algorithm. Even though this is a basic quantization technique, we get very good results! Have a look at the following benchmark (llama-2-7b on the perplexity metric). You can find more benchmarks [here](https://github.com/huggingface/quanto/tree/main/bench/generation)
<div class="flex gap-4">
<div>

View File

@@ -26,7 +26,7 @@ after a natural disaster, monitoring crop health, or helping screen medical imag
This guide illustrates how to:
1. Fine-tune [ViT](../model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image.
1. Fine-tune [ViT](model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image.
2. Use your fine-tuned model for inference.
<Tip>
View File
@ -120,46 +120,6 @@ print(generated_texts)
## ['User: What do we see in this image? \nAssistant: In this image we can see two cats on the nets. \nUser: And how about this image? \nAssistant: In this image we can see flowers, plants and insect.']
```
## Pipeline
The fastest way to get started is to use the [`Pipeline`] API. Specify the `"image-text-to-text"` task and the model you want to use.
```python
from transformers import pipeline
pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
```
The example below uses chat templates to format the text inputs.
```python
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
},
{"type": "text", "text": "Describe this image."},
],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "There's a pink flower"},
],
},
]
```
Pass the chat template formatted text and image to [`Pipeline`] and set `return_full_text=False` to remove the input from the generated output.
```python
outputs = pipe(text=messages, max_new_tokens=20, return_full_text=False)
outputs[0]["generated_text"]
# with a yellow center in the foreground. The flower is surrounded by red and white flowers with green stems
```
## Streaming
We can use [text streaming](./generation_strategies#streaming) for a better generation experience. Transformers supports streaming with the [`TextStreamer`] or [`TextIteratorStreamer`] classes. We will use the [`TextIteratorStreamer`] with IDEFICS-8B.
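A minimal sketch of the streaming loop, assuming `model`, `processor`, and `inputs` were prepared in the earlier steps of this guide:
```python
from threading import Thread
from transformers import TextIteratorStreamer

# assumed to exist from earlier steps: `model`, `processor`, `inputs`
streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=100)

# generate in a background thread so the main thread can consume tokens as they arrive
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for new_text in streamer:
    print(new_text, end="")
thread.join()
```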
View File
@ -182,7 +182,7 @@ There are three main components to Mask2Former:
The mask predictions are generated by combining the pixel embeddings with the final decoder hidden states. The sigmoid cross-entropy and dice losses are calculated between the logits and the ground truth mask to find the most likely mask.
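Schematically, that combination amounts to a dot product between each query's decoder embedding and every pixel embedding (a hedged sketch with made-up shapes, not the library's actual implementation):
```python
import torch

batch, queries, channels, height, width = 1, 100, 256, 96, 96
decoder_states = torch.randn(batch, queries, channels)      # one embedding per mask query
pixel_embeddings = torch.randn(batch, channels, height, width)

# each query scores every pixel; a sigmoid turns the scores into per-query soft masks
mask_logits = torch.einsum("bqc,bchw->bqhw", decoder_states, pixel_embeddings)
soft_masks = mask_logits.sigmoid()
```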
Ready to try your hand at image segmentation? Check out our complete [image segmentation guide](tasks/semantic_segmentation) to learn how to finetune SegFormer and use it for inference!
Ready to try your hand at object detection? Check out our complete [image segmentation guide](tasks/semantic_segmentation) to learn how to finetune SegFormer and use it for inference!
### Depth estimation
@ -292,4 +292,4 @@ Ready to try your hand at translation? Check out our complete [translation guide
For more information about text generation, check out the [text generation strategies](generation_strategies) guide!
</Tip>
</Tip>
View File
@ -428,7 +428,7 @@ pytest --instafail
### To GPU or not to GPU
On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""` for CUDA GPUs:
On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""`:
```bash
CUDA_VISIBLE_DEVICES="" pytest tests/utils/test_logging.py
@ -441,12 +441,10 @@ second gpu if you have gpus `0` and `1`, you can run:
CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py
```
For Intel GPUs, use `ZE_AFFINITY_MASK` instead of `CUDA_VISIBLE_DEVICES` in the above example.
This is handy when you want to run different tasks on different GPUs.
Some tests must be run on CPU only, others on either CPU, GPU, or TPU, and yet others on multiple GPUs. The following skip
decorators are used to set the requirements of tests CPU/GPU/XPU/TPU-wise:
decorators are used to set the requirements of tests CPU/GPU/TPU-wise:
- `require_torch` - this test will run only under torch
- `require_torch_gpu` - as `require_torch` plus requires at least 1 GPU
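For instance, a test can be gated on hardware with one of these decorators (a short sketch; the test body is illustrative):
```python
import torch
from transformers.testing_utils import require_torch_gpu

@require_torch_gpu
def test_runs_on_gpu():
    # skipped automatically when no GPU is visible, e.g. with CUDA_VISIBLE_DEVICES=""
    assert torch.cuda.is_available()
```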
View File
@ -174,7 +174,7 @@ trainer = Trainer(
processing_class=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
callbacks=[EarlyStoppingCallback()],
callback=[EarlyStoppingCallback()],
)
```
View File
@ -287,10 +287,9 @@ model.fit(tf_dataset)
At this point, you may need to restart your notebook or execute the following code to free some memory:
```py
from accelerate.utils.memory import clear_device_cache
del model
del trainer
clear_device_cache()
torch.cuda.empty_cache()
```
Next, manually postprocess `tokenized_dataset` to prepare it for training.
@ -365,9 +364,8 @@ Lastly, specify `device` to use a GPU if you have access to one. Otherwise, trai
```py
>>> import torch
>>> from accelerate.test_utils.testing import get_backend
>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
>>> model.to(device)
```
View File
@ -86,8 +86,6 @@
title: What 🤗 Transformers can do
- local: tokenizer_summary
title: Summary of the tokenizers
- local: attention
title: Attention mechanisms
title: Conceptual guides
- sections:
- sections:
View File
@ -1,37 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not render properly in your Markdown viewer.
-->
# Attention mechanisms
Most transformer models use full attention, in the sense that the attention matrix is square. This can be a big computational bottleneck when you have long input texts. Longformer and Reformer are models that try to be more efficient and use a sparse version of the attention matrix to speed up training.
## LSH attention
[Reformer](model_doc/reformer) uses LSH (locality-sensitive hashing) attention. In softmax(QK^t), only the largest elements of the matrix QK^t make a useful contribution along the softmax dimension. So for each query q in Q, we only need to consider the keys k in K that are close to q; a hash function is used to determine whether q and k are close. The attention mask is modified to mask the current token (except at the first position), because it would give a query and a key that are equal, hence very similar to each other. Since the hash can be a bit random, several hash functions are used in practice (determined by an n_rounds parameter), and their results are averaged together.
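As an illustration, here is a minimal sketch of the bucketing idea with random rotations (a simplification, not Reformer's actual implementation):
```python
import torch

def lsh_buckets(x, n_buckets=16, n_rounds=4):
    # project onto random hyperplanes; vectors that fall on the same side of
    # every hyperplane share a bucket and are allowed to attend to each other
    projections = torch.randn(n_rounds, x.shape[-1], n_buckets // 2)
    rotated = torch.einsum("ld,rdb->rlb", x, projections)
    return torch.argmax(torch.cat([rotated, -rotated], dim=-1), dim=-1)  # (n_rounds, seq_len)

queries = torch.randn(128, 64)  # 128 tokens with hidden size 64
buckets = lsh_buckets(queries)  # one bucket id per token and hash round
```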
## Local attention
[Longformer](model_doc/longformer) uses local attention: often, the local context (e.g., what are the two tokens to the left and right?) is enough for a given token. Also, by stacking attention layers that have a small window, the last layer will have a receptive field that extends beyond the tokens in its window, which lets the model build a representation of the whole sentence.
Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access all tokens, and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in their local window). This is shown in Figure 2d of the paper; see below for a sample attention mask:
<div class="flex justify-center">
<img scale="50 %" align="center" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/local_attention_mask.png"/>
</div>
Using attention matrices with fewer parameters allows the model to handle longer input sequences.
## Other tricks
### Axial positional encodings
[Reformer](model_doc/reformer) uses axial positional encodings: in traditional transformer models, the positional encoding matrix E has size \\(l\\) by \\(d\\), where \\(l\\) is the sequence length and \\(d\\) is the dimension of the hidden state. For very long texts, this matrix can be huge and take up far too much GPU memory. To alleviate that, axial positional encodings factorize the big matrix E into two smaller matrices E1 and E2, with dimensions \\(l_{1} \times d_{1}\\) and \\(l_{2} \times d_{2}\\), such that \\(l_{1} \times l_{2} = l\\) and \\(d_{1} + d_{2} = d\\); the resulting matrices are much smaller. The embedding for timestep \\(j\\) in E is obtained by concatenating the embedding for timestep \\(j \% l1\\) in E1 and the embedding for timestep \\(j // l1\\) in E2.
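A minimal numeric sketch of the factorization (the sizes are assumptions for illustration):
```python
import torch

l1, d1 = 64, 32    # l = l1 * l2 = 1024, d = d1 + d2 = 128
l2, d2 = 16, 96
E1 = torch.randn(l1, d1)
E2 = torch.randn(l2, d2)

def axial_position_embedding(j):
    # the embedding for timestep j concatenates E1[j % l1] and E2[j // l1]
    return torch.cat([E1[j % l1], E2[j // l1]])

emb = axial_position_embedding(500)  # shape (128,), without storing a full (1024, 128) table
```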
View File
@ -130,16 +130,6 @@ class MyNewModelConfig(PretrainedConfig):
model_type = "my_new_model"
keys_to_ignore_at_inference = ["past_key_values"]
# Default tensor parallel plan for base model `MyNewModelModel`
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,
View File
@ -33,16 +33,6 @@ class MyNewModel2Config(PretrainedConfig):
model_type = "my_new_model2"
keys_to_ignore_at_inference = ["past_key_values"]
# Default tensor parallel plan for base model `MyNewModel2Model`
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,
View File
@ -8,6 +8,7 @@ import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from torch import nn
from ...activations import ACT2FN
@ -149,7 +150,25 @@ class DummyMLP(nn.Module):
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
if self.config.pretraining_tp > 1:
slice = self.intermediate_size // self.config.pretraining_tp
gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
up_proj_slices = self.up_proj.weight.split(slice, dim=0)
down_proj_slices = self.down_proj.weight.split(slice, dim=1)
gate_proj = torch.cat(
[F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
)
up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
down_proj = [
F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
]
down_proj = sum(down_proj)
else:
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return down_proj
@ -245,14 +264,31 @@ class DummyAttention(nn.Module):
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
if self.config.pretraining_tp > 1:
key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
query_slices = self.q_proj.weight.split(
(self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
)
key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
# use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
query_states = torch.cat(query_states, dim=-1)
key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
key_states = torch.cat(key_states, dim=-1)
value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
value_states = torch.cat(value_states, dim=-1)
else:
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@ -294,7 +330,12 @@ class DummyAttention(nn.Module):
attn_output = attn_output.reshape(bsz, q_len, -1)
attn_output = self.o_proj(attn_output)
if self.config.pretraining_tp > 1:
attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
else:
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
@ -467,10 +508,9 @@ class DummySdpaAttention(DummyAttention):
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@ -754,10 +794,7 @@ class DummyModel(DummyPreTrainedModel):
)
self.norm = DummyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = DummyRotaryEmbedding(config=config)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@ -837,7 +874,7 @@ class DummyModel(DummyPreTrainedModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
View File
@ -667,10 +667,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
[MyNewModel2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = MyNewModel2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@ -755,7 +752,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
View File
@ -8,6 +8,7 @@ import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from torch import nn
from ...activations import ACT2FN
@ -149,7 +150,25 @@ class SuperMLP(nn.Module):
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
if self.config.pretraining_tp > 1:
slice = self.intermediate_size // self.config.pretraining_tp
gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
up_proj_slices = self.up_proj.weight.split(slice, dim=0)
down_proj_slices = self.down_proj.weight.split(slice, dim=1)
gate_proj = torch.cat(
[F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
)
up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
down_proj = [
F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
]
down_proj = sum(down_proj)
else:
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return down_proj
@ -245,14 +264,31 @@ class SuperAttention(nn.Module):
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
if self.config.pretraining_tp > 1:
key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
query_slices = self.q_proj.weight.split(
(self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
)
key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
# use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
query_states = torch.cat(query_states, dim=-1)
key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
key_states = torch.cat(key_states, dim=-1)
value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
value_states = torch.cat(value_states, dim=-1)
else:
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@ -294,7 +330,12 @@ class SuperAttention(nn.Module):
attn_output = attn_output.reshape(bsz, q_len, -1)
attn_output = self.o_proj(attn_output)
if self.config.pretraining_tp > 1:
attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
else:
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
@ -467,10 +508,9 @@ class SuperSdpaAttention(SuperAttention):
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@ -754,10 +794,7 @@ class SuperModel(SuperPreTrainedModel):
)
self.norm = SuperRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = SuperRotaryEmbedding(config=config)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
View File
@ -331,7 +331,7 @@ def main():
config = AutoConfig.from_pretrained(
args.model_name_or_path,
num_labels=len(labels),
id2label=id2label,
i2label=id2label,
label2id=label2id,
finetuning_task="image-classification",
trust_remote_code=args.trust_remote_code,
View File
@ -148,7 +148,7 @@ with torch.no_grad():
outputs = model(**inputs)
# Post-process outputs
outputs = image_processor.post_process_instance_segmentation(outputs, target_sizes=[(image.height, image.width)])
outputs = image_processor.post_process_instance_segmentation(outputs, target_sizes=[image.size[::-1]])
print("Mask shape: ", outputs[0]["segmentation"].shape)
print("Mask values: ", outputs[0]["segmentation"].unique())
View File
@ -1,5 +1,5 @@
absl-py==1.0.0
aiohttp==3.10.11
aiohttp==3.10.2
aiosignal==1.2.0
alembic==1.7.7
appdirs==1.4.4
View File
@ -117,7 +117,7 @@ _deps = [
"fugashi>=1.0",
"GitPython<3.1.19",
"hf-doc-builder>=0.3.0",
"huggingface-hub>=0.24.0,<1.0",
"huggingface-hub>=0.23.2,<1.0",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"isort>=5.5.4",
View File
@ -620,7 +620,6 @@ _import_structure = {
"models.nougat": ["NougatProcessor"],
"models.nystromformer": ["NystromformerConfig"],
"models.olmo": ["OlmoConfig"],
"models.olmo_1124": ["Olmo1124Config"],
"models.olmoe": ["OlmoeConfig"],
"models.omdet_turbo": [
"OmDetTurboConfig",
@ -1186,7 +1185,7 @@ else:
)
_import_structure["models.convnext"].extend(["ConvNextFeatureExtractor", "ConvNextImageProcessor"])
_import_structure["models.deformable_detr"].extend(
["DeformableDetrFeatureExtractor", "DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast"]
["DeformableDetrFeatureExtractor", "DeformableDetrImageProcessor"]
)
_import_structure["models.deit"].extend(["DeiTFeatureExtractor", "DeiTImageProcessor"])
_import_structure["models.deprecated.deta"].append("DetaImageProcessor")
@ -2920,13 +2919,6 @@ else:
"OlmoPreTrainedModel",
]
)
_import_structure["models.olmo_1124"].extend(
[
"Olmo1124ForCausalLM",
"Olmo1124Model",
"Olmo1124PreTrainedModel",
]
)
_import_structure["models.olmoe"].extend(
[
"OlmoeForCausalLM",
@ -5514,7 +5506,6 @@ if TYPE_CHECKING:
NystromformerConfig,
)
from .models.olmo import OlmoConfig
from .models.olmo_1124 import Olmo1124Config
from .models.olmoe import OlmoeConfig
from .models.omdet_turbo import (
OmDetTurboConfig,
@ -6100,7 +6091,6 @@ if TYPE_CHECKING:
from .models.deformable_detr import (
DeformableDetrFeatureExtractor,
DeformableDetrImageProcessor,
DeformableDetrImageProcessorFast,
)
from .models.deit import DeiTFeatureExtractor, DeiTImageProcessor
from .models.deprecated.deta import DetaImageProcessor
@ -7533,11 +7523,6 @@ if TYPE_CHECKING:
OlmoModel,
OlmoPreTrainedModel,
)
from .models.olmo_1124 import (
Olmo1124ForCausalLM,
Olmo1124Model,
Olmo1124PreTrainedModel,
)
from .models.olmoe import (
OlmoeForCausalLM,
OlmoeModel,
View File
@ -23,7 +23,6 @@ import json
import os
import tempfile
from functools import lru_cache, wraps
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union
from huggingface_hub import create_repo, get_collection, hf_hub_download, metadata_update, upload_folder
@ -46,7 +45,7 @@ from ..utils import (
is_vision_available,
logging,
)
from .agent_types import ImageType, handle_agent_inputs, handle_agent_outputs
from .agent_types import handle_agent_inputs, handle_agent_outputs
logger = logging.get_logger(__name__)
@ -419,9 +418,7 @@ class Tool:
)
@staticmethod
def from_space(
space_id: str, name: str, description: str, api_name: Optional[str] = None, token: Optional[str] = None
):
def from_space(space_id, name, description):
"""
Creates a [`Tool`] from a Space given its id on the Hub.
@ -432,73 +429,34 @@ class Tool:
The name of the tool.
description (`str`):
The description of the tool.
api_name (`str`, *optional*):
The specific api_name to use, if the space has several tabs. If not specified, it will default to the first available API.
token (`str`, *optional*):
Add your token to access private spaces or increase your GPU quotas.
Returns:
[`Tool`]:
The Space, as a tool.
The created tool.
Examples:
Example:
```
image_generator = Tool.from_space(
space_id="black-forest-labs/FLUX.1-schnell",
name="image-generator",
description="Generate an image from a prompt"
)
image = image_generator("Generate an image of a cool surfer in Tahiti")
```
```
face_swapper = Tool.from_space(
"tuan2308/face-swap",
"face_swapper",
"Tool that puts the face shown on the first image on the second image. You can give it paths to images.",
)
image = face_swapper('./aymeric.jpeg', './ruth.jpg')
tool = Tool.from_space("black-forest-labs/FLUX.1-schnell", "image-generator", "Generate an image from a prompt")
```
"""
from gradio_client import Client, handle_file
from gradio_client.utils import is_http_url_like
from gradio_client import Client
class SpaceToolWrapper(Tool):
def __init__(
self,
space_id: str,
name: str,
description: str,
api_name: Optional[str] = None,
token: Optional[str] = None,
):
self.client = Client(space_id, hf_token=token)
def __init__(self, space_id, name, description):
self.client = Client(space_id)
self.name = name
self.description = description
space_description = self.client.view_api(return_format="dict", print_info=False)["named_endpoints"]
# If api_name is not defined, take the first of the available APIs for this space
if api_name is None:
api_name = list(space_description.keys())[0]
logger.warning(
f"Since `api_name` was not defined, it was automatically set to the first avilable API: `{api_name}`."
)
self.api_name = api_name
try:
space_description_api = space_description[api_name]
except KeyError:
raise KeyError(f"Could not find specified {api_name=} among available api names.")
space_description = self.client.view_api(return_format="dict")["named_endpoints"]
route = list(space_description.keys())[0]
space_description_route = space_description[route]
self.inputs = {}
for parameter in space_description_api["parameters"]:
for parameter in space_description_route["parameters"]:
if not parameter["parameter_has_default"]:
parameter_type = parameter["type"]["type"]
if parameter_type == "object":
parameter_type = "any"
self.inputs[parameter["parameter_name"]] = {
"type": parameter_type,
"type": parameter["type"]["type"],
"description": parameter["python_type"]["description"],
}
output_component = space_description_api["returns"][0]["component"]
output_component = space_description_route["returns"][0]["component"]
if output_component == "Image":
self.output_type = "image"
elif output_component == "Audio":
@ -506,33 +464,10 @@ class Tool:
else:
self.output_type = "any"
def sanitize_argument_for_prediction(self, arg):
if isinstance(arg, ImageType):
temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
arg.save(temp_file.name)
arg = temp_file.name
if (isinstance(arg, (str, Path)) and Path(arg).exists() and Path(arg).is_file()) or is_http_url_like(
arg
):
arg = handle_file(arg)
return arg
def forward(self, *args, **kwargs):
# Preprocess args and kwargs:
args = list(args)
for i, arg in enumerate(args):
args[i] = self.sanitize_argument_for_prediction(arg)
for arg_name, arg in kwargs.items():
kwargs[arg_name] = self.sanitize_argument_for_prediction(arg)
return self.client.predict(*args, **kwargs)[0] # Usually the first output is the result
output = self.client.predict(*args, api_name=self.api_name, **kwargs)
if isinstance(output, tuple) or isinstance(output, list):
return output[
0
] # Sometimes the space also returns the generation seed, in which case the result is at index 0
return output
return SpaceToolWrapper(space_id, name, description, api_name=api_name, token=token)
return SpaceToolWrapper(space_id, name, description)
@staticmethod
def from_gradio(gradio_tool):
View File
@ -433,22 +433,19 @@ class DynamicCache(Cache):
self._seen_tokens += key_states.shape[-2]
# Update the cache
if key_states is not None:
if len(self.key_cache) <= layer_idx:
# There may be skipped layers, fill them with empty lists
for _ in range(len(self.key_cache), layer_idx):
self.key_cache.append([])
self.value_cache.append([])
self.key_cache.append(key_states)
self.value_cache.append(value_states)
elif (
len(self.key_cache[layer_idx]) == 0
): # fills previously skipped layers; checking for tensor causes errors
self.key_cache[layer_idx] = key_states
self.value_cache[layer_idx] = value_states
else:
self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
if len(self.key_cache) <= layer_idx:
# There may be skipped layers, fill them with empty lists
for _ in range(len(self.key_cache), layer_idx):
self.key_cache.append([])
self.value_cache.append([])
self.key_cache.append(key_states)
self.value_cache.append(value_states)
elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors
self.key_cache[layer_idx] = key_states
self.value_cache[layer_idx] = value_states
else:
self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
return self.key_cache[layer_idx], self.value_cache[layer_idx]
@ -528,7 +525,7 @@ class DynamicCache(Cache):
cache = cls()
for idx in range(len(splits[0])):
key_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []]
value_cache = [current.value_cache[idx] for current in splits if current.value_cache[idx] != []]
value_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []]
if key_cache != []:
layer_keys = torch.cat(key_cache, dim=0)
layer_values = torch.cat(value_cache, dim=0)
@ -784,11 +781,6 @@ class QuantoQuantizedCache(QuantizedCache):
super().__init__(cache_config)
if is_optimum_quanto_available():
optimum_quanto_version = version.parse(importlib.metadata.version("optimum-quanto"))
if optimum_quanto_version <= version.parse("0.2.5"):
raise ImportError(
f"You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedCache`. Detected version {optimum_quanto_version}."
)
from optimum.quanto import MaxOptimizer, qint2, qint4
elif is_quanto_available():
logger.warning_once(
@ -821,8 +813,7 @@ class QuantoQuantizedCache(QuantizedCache):
if is_optimum_quanto_available():
from optimum.quanto import quantize_weight
scale, zeropoint = self.optimizer(tensor, self.qtype, axis, self.q_group_size)
qtensor = quantize_weight(tensor, self.qtype, axis, scale, zeropoint, self.q_group_size)
qtensor = quantize_weight(tensor, self.qtype, axis, self.q_group_size)
return qtensor
elif is_quanto_available():
logger.warning_once(
@ -1523,10 +1514,7 @@ class EncoderDecoderCache(Cache):
self.check_dynamic_cache(self.crop.__name__)
self.self_attention_cache.crop(maximum_length)
@deprecate_kwarg("num_hidden_layers", version="4.47.0")
def batch_split(
self, full_batch_size: int, split_size: int, num_hidden_layers: int = None
) -> "List[EncoderDecoderCache]":
def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]":
"""Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
`_split_model_inputs()` in `generation.utils`"""
self.check_dynamic_cache(self.batch_split.__name__)
@ -1539,10 +1527,7 @@ class EncoderDecoderCache(Cache):
return out
@classmethod
@deprecate_kwarg("num_hidden_layers", version="4.47.0")
def from_batch_splits(
cls, splits: List["EncoderDecoderCache"], num_hidden_layers: int = None
) -> "EncoderDecoderCache":
def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache":
"""This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
`generation.utils`"""
self_attention_cache = DynamicCache()
View File
@ -71,8 +71,6 @@ class PretrainedConfig(PushToHubMixin):
outputs of the model during inference.
- **attribute_map** (`Dict[str, str]`) -- A dict that maps model specific attribute names to the standardized
naming of attributes.
- **base_model_tp_plan** (`Dict[str, Any]`) -- A dict that maps sub-module FQNs of a base model to a tensor
parallel plan applied to the sub-module when `model.tensor_parallel` is called.
Common attributes (present in all subclasses):
@ -196,7 +194,6 @@ class PretrainedConfig(PushToHubMixin):
sub_configs: Dict[str, "PretrainedConfig"] = {}
is_composition: bool = False
attribute_map: Dict[str, str] = {}
base_model_tp_plan: Optional[Dict[str, Any]] = None
_auto_class: Optional[str] = None
def __setattr__(self, key, value):
@ -851,9 +848,6 @@ class PretrainedConfig(PushToHubMixin):
if "_attn_implementation_internal" in serializable_config_dict:
del serializable_config_dict["_attn_implementation_internal"]
# Do not serialize `base_model_tp_plan` for now
if "base_model_tp_plan" in serializable_config_dict:
del serializable_config_dict["base_model_tp_plan"]
return serializable_config_dict
@ -873,9 +867,6 @@ class PretrainedConfig(PushToHubMixin):
del output["_commit_hash"]
if "_attn_implementation_internal" in output:
del output["_attn_implementation_internal"]
# Do not serialize `base_model_tp_plan` for now
if "base_model_tp_plan" in output:
del output["base_model_tp_plan"]
# Transformers version when serializing the model
output["transformers_version"] = __version__
View File
@ -24,7 +24,7 @@ deps = {
"fugashi": "fugashi>=1.0",
"GitPython": "GitPython<3.1.19",
"hf-doc-builder": "hf-doc-builder>=0.3.0",
"huggingface-hub": "huggingface-hub>=0.24.0,<1.0",
"huggingface-hub": "huggingface-hub>=0.23.2,<1.0",
"importlib_metadata": "importlib_metadata",
"ipadic": "ipadic>=1.0.0,<2.0",
"isort": "isort>=5.5.4",
View File
@ -49,7 +49,6 @@ else:
_import_structure["candidate_generator"] = [
"AssistedCandidateGenerator",
"CandidateGenerator",
"EarlyExitCandidateGenerator",
"PromptLookupCandidateGenerator",
]
_import_structure["logits_process"] = [
@ -207,12 +206,7 @@ if TYPE_CHECKING:
else:
from .beam_constraints import Constraint, ConstraintListState, DisjunctiveConstraint, PhrasalConstraint
from .beam_search import BeamHypotheses, BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer
from .candidate_generator import (
AssistedCandidateGenerator,
CandidateGenerator,
EarlyExitCandidateGenerator,
PromptLookupCandidateGenerator,
)
from .candidate_generator import AssistedCandidateGenerator, CandidateGenerator, PromptLookupCandidateGenerator
from .logits_process import (
AlternatingCodebooksLogitsProcessor,
ClassifierFreeGuidanceLogitsProcessor,
View File
@ -255,8 +255,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
"heuristic",
"heuristic_transient",
}:
# len(scores[0])-1 is the number of candidates according to the target tokenizer.
if num_matches == len(scores[0]) - 1:
if num_matches == int(self.num_assistant_tokens):
self.num_assistant_tokens += 2.0
else:
self.num_assistant_tokens = max(1.0, self.num_assistant_tokens - 1.0)
@ -671,62 +670,6 @@ class PromptLookupCandidateGenerator(CandidateGenerator):
return
class EarlyExitCandidateGenerator(AssistedCandidateGenerator):
"""
`CandidateGenerator` class to be used for assisted generation and speculative decoding. This class generates
candidates through the use of **the model itself**, exiting early. Can only be used with models that support early
exit, e.g., `facebook/layerskip-llama3.2-1B`.
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
assistant_model (`PreTrainedModel`):
The original model. This model must support early exit (i.e. is trained to compute logits in earlier
layers).
generation_config (`~generation.GenerationConfig`, *optional*):
The generation configuration to be used as base parametrization for the generation call.
logits_processor (`LogitsProcessorList`):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
used to modify the prediction scores of the language modeling head applied at each generation step.
model_kwargs (`Dict`):
The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant
model as well.
inputs_tensor (`torch.Tensor`, *optional*):
The model input tensor. In encoder-decoder models, this is the encoder input.
"""
def __init__(
self,
input_ids: torch.LongTensor,
assistant_model: "PreTrainedModel",
generation_config: "GenerationConfig",
model_kwargs: Dict,
inputs_tensor: Optional[torch.Tensor] = None,
logits_processor: "LogitsProcessorList" = None,
):
super().__init__(
input_ids=input_ids,
assistant_model=assistant_model,
generation_config=generation_config,
model_kwargs=model_kwargs,
inputs_tensor=inputs_tensor,
logits_processor=logits_processor,
)
# We have to move early exit out of the generation config, otherwise the assistant will also call `generate`
# with early exit
self.assistant_early_exit = self.generation_config.assistant_early_exit
self.generation_config.assistant_early_exit = None
def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
# Temporarily sets the number of hidden layers to the early exit value
base_model = getattr(self.assistant_model, self.assistant_model.base_model_prefix)
original_num_hidden_layers = base_model.config.num_hidden_layers
base_model.config.num_hidden_layers = self.assistant_early_exit
candidate_ids, candidate_logits = super().get_candidates(input_ids)
base_model.config.num_hidden_layers = original_num_hidden_layers
return candidate_ids, candidate_logits
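# A hedged usage sketch (not part of this module): early exit is driven from
# `generate` via `assistant_early_exit`; the checkpoint name comes from the
# docstring above, and the generation arguments are assumptions.
#
#     from transformers import AutoModelForCausalLM, AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("facebook/layerskip-llama3.2-1B")
#     model = AutoModelForCausalLM.from_pretrained("facebook/layerskip-llama3.2-1B")
#     inputs = tokenizer("Alice and Bob", return_tensors="pt")
#     # draft candidates by exiting at layer 4, then verify with the full model
#     outputs = model.generate(**inputs, assistant_early_exit=4, max_new_tokens=20)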
def _crop_past_key_values(model, past_key_values, max_length):
"""Crops the past key values up to a certain maximum length."""
new_past = []
View File
@ -353,13 +353,10 @@ class GenerationConfig(PushToHubMixin):
than this threshold, the assistant model stops the current token generation iteration, even if the number of _speculative tokens_
(defined by `num_assistant_tokens`) is not yet reached. It is an unsupervised version of the dynamic speculation lookahead
from Dynamic Speculation Lookahead Accelerates Speculative Decoding of Large Language Models <https://arxiv.org/abs/2405.04304>.
prompt_lookup_num_tokens (`int`, *optional*):
prompt_lookup_num_tokens (`int`, *optional*, default to `None`):
The number of tokens to be output as candidate tokens.
max_matching_ngram_size (`int`, *optional*):
max_matching_ngram_size (`int`, *optional*, default to `None`):
The maximum ngram size to be considered for matching in the prompt. Default to 2 if not provided.
assistant_early_exit(`int`, *optional*):
If set to a positive integer, early exit of the model will be used as an assistant. Can only be used with
models that support early exit (i.e. models where logits from intermediate layers can be interpreted by the LM head).
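For illustration, these parameters are typically passed straight to `generate` (a hedged sketch; the checkpoint name and values are assumptions):
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# prompt lookup decoding needs no separate assistant model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("The quick brown fox", return_tensors="pt")
outputs = model.generate(**inputs, prompt_lookup_num_tokens=10, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```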
> Wild card
@ -457,9 +454,10 @@ class GenerationConfig(PushToHubMixin):
self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", 20)
self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", "constant")
self.assistant_confidence_threshold = kwargs.pop("assistant_confidence_threshold", 0.4)
# Prompt lookup decoding
self.prompt_lookup_num_tokens = kwargs.pop("prompt_lookup_num_tokens", None)
self.max_matching_ngram_size = kwargs.pop("max_matching_ngram_size", None)
self.assistant_early_exit = kwargs.pop("assistant_early_exit", None)
# Wild card
self.generation_kwargs = kwargs.pop("generation_kwargs", {})
@ -536,11 +534,7 @@ class GenerationConfig(PushToHubMixin):
generation_mode = GenerationMode.BEAM_SEARCH
# Assisted generation may extend some generation modes
if (
assistant_model is not None
or self.prompt_lookup_num_tokens is not None
or self.assistant_early_exit is not None
):
if assistant_model is not None or self.prompt_lookup_num_tokens is not None:
if generation_mode in ("greedy_search", "sample"):
generation_mode = GenerationMode.ASSISTED_GENERATION
else:
View File
@ -398,11 +398,7 @@ class FlaxGenerationMixin:
)
generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
else: # by default let's always generate 10 new tokens
if generation_config.max_length == GenerationConfig().max_length:
generation_config.max_length = generation_config.max_length + input_ids_seq_length
max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
if max_position_embeddings is not None:
generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
generation_config.max_length = generation_config.max_length + input_ids_seq_length
if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length:
raise ValueError(
View File
@ -54,7 +54,6 @@ from .candidate_generator import (
AssistedCandidateGenerator,
AssistedCandidateGeneratorDifferentTokenizers,
CandidateGenerator,
EarlyExitCandidateGenerator,
PromptLookupCandidateGenerator,
_crop_past_key_values,
_prepare_attention_mask,
@ -823,16 +822,7 @@ class GenerationMixin:
"""
different_tokenizers = all(v is not None for v in (assistant_model, target_tokenizer, assistant_tokenizer))
if generation_config.assistant_early_exit is not None:
candidate_generator = EarlyExitCandidateGenerator(
input_ids=input_ids,
assistant_model=self,
generation_config=generation_config,
model_kwargs=model_kwargs,
inputs_tensor=inputs_tensor,
logits_processor=logits_processor,
)
elif generation_config.prompt_lookup_num_tokens is not None:
if generation_config.prompt_lookup_num_tokens is not None:
candidate_generator = PromptLookupCandidateGenerator(
eos_token_id=generation_config._eos_token_tensor,
num_output_tokens=generation_config.prompt_lookup_num_tokens,
@ -1462,11 +1452,10 @@ class GenerationMixin:
):
generation_config.max_length -= inputs_tensor.shape[1]
elif has_default_max_length: # by default let's always generate 20 new tokens
if generation_config.max_length == GenerationConfig().max_length:
generation_config.max_length = generation_config.max_length + input_ids_length
max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
if max_position_embeddings is not None:
generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
generation_config.max_length = generation_config.max_length + input_ids_length
max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
if max_position_embeddings is not None:
generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
# same for min length
if generation_config.min_new_tokens is not None:
@ -1646,10 +1635,7 @@ class GenerationMixin:
# This is needed here if we don't want to make changes in accelerate in order to save execution_device
# For offloaded case, we need to get the execution device, not just the device where it is offloaded
if hasattr(self, "hf_device_map"):
if set(self.hf_device_map.values()) == {"cpu"} or set(self.hf_device_map.values()) == {"cpu", "disk"}:
main_device = "cpu"
else:
main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0]
main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0]
execution_device_map = {
name: main_device if device in ["cpu", "disk"] else device
for name, device in self.hf_device_map.items()
@ -3246,7 +3232,7 @@ class GenerationMixin:
# Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
# (the clone itself is always small)
next_token_logits = outputs.logits[:, -1, :].clone().float()
next_token_logits = outputs.logits.clone()[:, -1, :].float()
next_token_logits = next_token_logits.to(input_ids.device)
# pre-process distribution
View File
@ -248,20 +248,6 @@ GGUF_TENSOR_MAPPING = {
"output_norm": "backbone.norm_f",
"output.weight": "lm_head.weight",
},
"nemotron": {
"token_embd": "model.embed_tokens",
"blk": "model.layers",
"ffn_up": "mlp.up_proj",
"ffn_down": "mlp.down_proj",
"ffn_norm": "post_attention_layernorm",
"attn_norm": "input_layernorm",
"attn_q": "self_attn.q_proj",
"attn_v": "self_attn.v_proj",
"attn_k": "self_attn.k_proj",
"attn_output": "self_attn.o_proj",
"output.weight": "lm_head.weight",
"output_norm": "model.norm",
},
}
@ -411,18 +397,6 @@ GGUF_CONFIG_MAPPING = {
"ssm.time_step_rank": "time_step_rank",
"ssm.inner_size": "intermediate_size",
},
"nemotron": {
"context_length": "max_position_embeddings",
"block_count": "num_hidden_layers",
"feed_forward_length": "intermediate_size",
"embedding_length": "hidden_size",
"rope.dimension_count": None,
"rope.freq_base": "rope_theta",
"attention.head_count": "num_attention_heads",
"attention.head_count_kv": "num_key_value_heads",
"attention.layer_norm_rms_epsilon": "norm_eps",
"vocab_size": "vocab_size",
},
}
GGUF_TOKENIZER_MAPPING = {
@ -819,7 +793,6 @@ GGUF_TO_FAST_CONVERTERS = {
"starcoder2": GGUFGPTConverter,
"t5": GGUFT5Converter,
"mamba": GGUFGPTConverter,
"nemotron": GGUFGPTConverter,
}
View File
@ -208,7 +208,7 @@ def hp_params(trial):
if is_optuna_available():
import optuna
if isinstance(trial, optuna.trial.BaseTrial):
if isinstance(trial, optuna.Trial):
return trial.params
if is_ray_tune_available():
if isinstance(trial, dict):
@ -230,7 +230,7 @@ def run_hp_search_optuna(trainer, n_trials: int, direction: str, **kwargs) -> Be
if trainer.args.process_index == 0:
def _objective(trial: optuna.Trial, checkpoint_dir=None):
def _objective(trial, checkpoint_dir=None):
checkpoint = None
if checkpoint_dir:
for subdir in os.listdir(checkpoint_dir):
@ -240,11 +240,10 @@ def run_hp_search_optuna(trainer, n_trials: int, direction: str, **kwargs) -> Be
if trainer.args.world_size > 1:
if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.")
trainer.hp_space(trial)
fixed_trial = optuna.trial.FixedTrial(trial.params, trial.number)
trial_main_rank_list = [fixed_trial]
torch.distributed.broadcast_object_list(trial_main_rank_list, src=0)
trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
trainer._hp_search_setup(trial)
args_main_rank_list = [pickle.dumps(trainer.args)]
torch.distributed.broadcast_object_list(args_main_rank_list, src=0)
trainer.train(resume_from_checkpoint=checkpoint)
else:
trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
# If there hasn't been any evaluation during the training loop.
@ -269,11 +268,15 @@ def run_hp_search_optuna(trainer, n_trials: int, direction: str, **kwargs) -> Be
else:
for i in range(n_trials):
trainer.objective = None
trial_main_rank_list = [None]
args_main_rank_list = [None]
if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.")
torch.distributed.broadcast_object_list(trial_main_rank_list, src=0)
trainer.train(resume_from_checkpoint=None, trial=trial_main_rank_list[0])
torch.distributed.broadcast_object_list(args_main_rank_list, src=0)
args = pickle.loads(bytes(args_main_rank_list[0]))
for key, value in asdict(args).items():
if key != "local_rank":
setattr(trainer.args, key, value)
trainer.train(resume_from_checkpoint=None)
# If there hasn't been any evaluation during the training loop.
if getattr(trainer, "objective", None) is None:
metrics = trainer.evaluate()
@ -915,7 +918,7 @@ class WandbCallback(TrainerCallback):
if self._log_model.is_enabled and self._initialized and state.is_world_process_zero:
from ..trainer import Trainer
fake_trainer = Trainer(args=args, model=model, processing_class=tokenizer, eval_dataset=["fake"])
fake_trainer = Trainer(args=args, model=model, processing_class=tokenizer)
with tempfile.TemporaryDirectory() as temp_dir:
fake_trainer.save_model(temp_dir)
metadata = (
View File
@ -15,7 +15,7 @@
# limitations under the License.
import re
from typing import Dict, NamedTuple, Optional
from typing import Dict, Optional
import numpy as np
from tqdm import tqdm
@ -55,200 +55,6 @@ GGUF_TO_TRANSFORMERS_MAPPING = {
GGUF_SUPPORTED_ARCHITECTURES = list(GGUF_TO_TRANSFORMERS_MAPPING["tensors"].keys())
class GGUFTensor(NamedTuple):
weights: np.ndarray
name: str
metadata: dict
class TensorProcessor:
def __init__(self, config=None):
self.config = config or {}
def process(self, weights, name, **kwargs):
return GGUFTensor(weights, name, {})
class LlamaTensorProcessor(TensorProcessor):
def __init__(self, config=None):
super().__init__(config=config)
def process(self, weights, name, **kwargs):
if ".attn_k." in name or ".attn_q." in name:
num_heads = self.config.get("num_attention_heads")
num_kv_heads = self.config.get("num_key_value_heads")
if None in (num_heads, num_kv_heads):
return GGUFTensor(weights, name, {})
if ".attn_q." in name:
weights = self._reverse_permute_weights(weights, num_heads, num_heads)
elif ".attn_k." in name:
weights = self._reverse_permute_weights(weights, num_heads, num_kv_heads)
return GGUFTensor(weights, name, {})
def _reverse_permute_weights(
self, weights: np.ndarray, n_head: int, num_kv_heads: Optional[int] = None
) -> np.ndarray:
# Original permutation implementation
# https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L1402-L1408
if num_kv_heads is not None and n_head != num_kv_heads:
n_head = num_kv_heads
dim = weights.shape[0] // n_head // 2
w = weights.reshape(n_head, dim, 2, *weights.shape[1:])
return w.swapaxes(2, 1).reshape(weights.shape)
class Qwen2MoeTensorProcessor(TensorProcessor):
def __init__(self, config=None):
super().__init__(config=config)
def process(self, weights, name, **kwargs):
if "_exp" in name:
tensor_key_mapping = kwargs.get("tensor_key_mapping")
parsed_parameters = kwargs.get("parsed_parameters")
if tensor_key_mapping:
self._split_moe_expert_tensor(weights, parsed_parameters, name, tensor_key_mapping)
return GGUFTensor(weights, None, {})
if "ffn_gate_inp_shexp" in name:
# for compatibility tensor shared_expert_gate must be (1, 2048) dim,
# quantized one is (2048)
weights = np.expand_dims(weights, axis=0)
return GGUFTensor(weights, name, {})
def _split_moe_expert_tensor(
self, weights: np.ndarray, parsed_parameters: Dict[str, Dict], name: str, tensor_key_mapping: dict
):
# Original merge implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022
exp_name = ""
if "ffn_gate_exps" in name:
exp_name = "gate_proj"
elif "ffn_down_exps" in name:
exp_name = "down_proj"
elif "ffn_up_exps" in name:
exp_name = "up_proj"
else:
raise ValueError(f"Cannot map expert tensor {name} in Qwen2Moe architecture.")
for tensor_name in tensor_key_mapping:
if tensor_name in name:
name = name.replace(tensor_name, tensor_key_mapping[tensor_name])
w_counter = self.config.get("num_experts", 60)
for i in range(0, w_counter):
temp_name = name.replace(".weight", f".{i}.{exp_name}.weight")
exp_weight = weights[i]
parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight))
class BloomTensorProcessor(TensorProcessor):
def __init__(self, config=None):
super().__init__(config=config)
def process(self, weights, name, **kwargs):
if "attn_qkv" in name:
num_heads = self.config["n_head"]
n_embed = self.config["hidden_size"]
if "weight" in name:
weights = self._reverse_reshape_weights(weights, num_heads, n_embed)
else:
weights = self._reverse_reshape_bias(weights, num_heads, n_embed)
return GGUFTensor(weights, name, {})
def _reverse_reshape_weights(self, weights: np.ndarray, n_head: int, n_embed: int):
# Original reshape implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L972-L985
q, k, v = np.array_split(weights, 3, axis=0)
q = q.reshape(n_head, n_embed // n_head, n_embed)
k = k.reshape(n_head, n_embed // n_head, n_embed)
v = v.reshape(n_head, n_embed // n_head, n_embed)
qkv_weights = np.stack([q, k, v], axis=1)
return qkv_weights.reshape(n_head * 3 * (n_embed // n_head), n_embed)
def _reverse_reshape_bias(self, weights: np.ndarray, n_head: int, n_embed: int):
# Original reshape implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L986-L998
q_bias, k_bias, v_bias = np.array_split(weights, 3)
q_bias = q_bias.reshape(n_head, n_embed // n_head)
k_bias = k_bias.reshape(n_head, n_embed // n_head)
v_bias = v_bias.reshape(n_head, n_embed // n_head)
qkv_bias = np.stack([q_bias, k_bias, v_bias], axis=1).flatten()
return qkv_bias
class T5TensorProcessor(TensorProcessor):
def __init__(self, config=None):
super().__init__(config=config)
def process(self, weights, name, **kwargs):
bid = None
for chunk in name.split("."):
if chunk.isdigit():
bid = int(chunk)
break
return GGUFTensor(weights, name, {"bid": bid})
class GPT2TensorProcessor(TensorProcessor):
def __init__(self, config=None):
super().__init__(config=config)
def process(self, weights, name, **kwargs):
# Original transpose implementation
# https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L2060-L2061
if (
"attn_qkv.weight" in name
or "ffn_down.weight" in name
or "ffn_up.weight" in name
or "attn_output.weight" in name
):
weights = weights.T
# Handle special case for output.weight
if name == "output.weight":
# output.weight has conflicts with attn_output.weight in name checking
# Store the tensor directly and signal to skip further processing
name = "lm_head.weight"
parsed_parameters = kwargs.get("parsed_parameters", {})
parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights))
name = None # Signal to skip further processing
return GGUFTensor(weights, name, {})
class MambaTensorProcessor(TensorProcessor):
def __init__(self, config=None):
super().__init__(config=config)
def process(self, weights, name, **kwargs):
if "ssm_d" in name and "bias" not in name and "weight" not in name:
# ssm_d has conflicts with ssm_dt in name checking
# we have to explicitly check that name is exactly ssm_d
name = name.replace("ssm_d", "mixer.D")
if "ssm_conv1d.weight" in name:
# for compatibility tensor ssm_conv1d must be (5120, 1, 4) dim,
# quantized one is (5120, 4)
weights = np.expand_dims(weights, axis=1)
if "ssm_a" in name:
# Original exponential implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L2975-L2977
weights = np.log(-weights)
return GGUFTensor(weights, name, {})
TENSOR_PROCESSORS = {
"llama": LlamaTensorProcessor,
"qwen2moe": Qwen2MoeTensorProcessor,
"bloom": BloomTensorProcessor,
"t5": T5TensorProcessor,
"t5encoder": T5TensorProcessor,
"gpt2": GPT2TensorProcessor,
"mamba": MambaTensorProcessor,
}
def read_field(reader, field):
value = reader.fields[field]
return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data]
@ -371,28 +177,73 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
if return_tensors:
tensor_key_mapping = GGUF_TO_TRANSFORMERS_MAPPING["tensors"][architecture + model_size]
config = parsed_parameters.get("config", {})
ProcessorClass = TENSOR_PROCESSORS.get(architecture, TensorProcessor)
processor = ProcessorClass(config=config)
for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."):
name = tensor.name
weights = dequantize(tensor.data, tensor.tensor_type)
result = processor.process(
weights=weights,
name=name,
tensor_key_mapping=tensor_key_mapping,
parsed_parameters=parsed_parameters,
)
if architecture == "llama" and (".attn_k." in name or ".attn_q." in name):
num_heads = parsed_parameters["config"]["num_attention_heads"]
num_kv_heads = parsed_parameters["config"]["num_key_value_heads"]
if ".attn_q." in name:
weights = reverse_permute_weights(weights, num_heads, num_heads)
elif ".attn_k." in name:
weights = reverse_permute_weights(weights, num_heads, num_kv_heads)
weights = result.weights
name = result.name
bid = result.metadata.get("bid")
if architecture == "qwen2moe":
if "_exp" in name:
split_moe_expert_tensor(weights, parsed_parameters, name, tensor_key_mapping)
continue
if "ffn_gate_inp_shexp" in name:
# for compatibility tensor shared_expert_gate must be (1, 2048) dim,
# quantized one is (2048)
weights = np.expand_dims(weights, axis=0)
if name is None:
continue
if architecture == "bloom" and "attn_qkv" in name:
num_heads = parsed_parameters["config"]["n_head"]
n_embed = parsed_parameters["config"]["hidden_size"]
if "weight" in name:
weights = reverse_reshape_weights(weights, num_heads, n_embed)
else:
weights = reverse_reshape_bias(weights, num_heads, n_embed)
bid = None
if architecture in ("t5", "t5encoder"):
for chunk in name.split("."):
if chunk.isdigit():
bid = int(chunk)
break
if architecture == "gpt2":
if (
"attn_qkv.weight" in name
or "ffn_down.weight" in name
or "ffn_up.weight" in name
or "attn_output.weight" in name
):
# Original transpose implementation
# https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L2060-L2061
weights = weights.T
if name == "output.weight":
# output.weight has conflicts with attn_output.weight in name checking
# we have to explicitly check that name is exactly output.weight
name = "lm_head.weight"
parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights))
continue
if architecture == "mamba":
if "ssm_d" in name and "bias" not in name and "weight" not in name:
# ssm_d has conflicts with ssm_dt in name checking
# we have to explicitly check that name is exactly ssm_d
name = name.replace("ssm_d", "mixer.D")
if "ssm_conv1d.weight" in name:
# for compatibility tensor ssm_conv1d must be (5120, 1, 4) dim,
# quantized one is (5120, 4)
weights = np.expand_dims(weights, axis=1)
if "ssm_a" in name:
# Original exponential implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L2975-L2977
weights = np.log(-weights)
for tensor_name in tensor_key_mapping:
if tensor_name.format(bid=bid) in name:
@ -405,3 +256,64 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}")
return parsed_parameters
def reverse_permute_weights(weights: np.ndarray, n_head: int, num_kv_heads: Optional[int] = None) -> np.ndarray:
# Original permutation implementation
# https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L1402-L1408
if num_kv_heads is not None and n_head != num_kv_heads:
n_head = num_kv_heads
dim = weights.shape[0] // n_head // 2
w = weights.reshape(n_head, dim, 2, *weights.shape[1:])
return w.swapaxes(2, 1).reshape(weights.shape)
def reverse_reshape_weights(weights: np.ndarray, n_head: int, n_embed: int):
# Original reshape implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L972-L985
q, k, v = np.array_split(weights, 3, axis=0)
q = q.reshape(n_head, n_embed // n_head, n_embed)
k = k.reshape(n_head, n_embed // n_head, n_embed)
v = v.reshape(n_head, n_embed // n_head, n_embed)
qkv_weights = np.stack([q, k, v], axis=1)
return qkv_weights.reshape(n_head * 3 * (n_embed // n_head), n_embed)
def reverse_reshape_bias(weights: np.ndarray, n_head: int, n_embed: int):
# Original reshape implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L986-L998
q_bias, k_bias, v_bias = np.array_split(weights, 3)
q_bias = q_bias.reshape(n_head, n_embed // n_head)
k_bias = k_bias.reshape(n_head, n_embed // n_head)
v_bias = v_bias.reshape(n_head, n_embed // n_head)
qkv_bias = np.stack([q_bias, k_bias, v_bias], axis=1).flatten()
return qkv_bias
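A quick shape check of the two Bloom helpers above with toy sizes (n_head=2, n_embed=8); the outputs use Bloom's fused, head-interleaved Q/K/V layout.

import numpy as np

n_head, n_embed = 2, 8
qkv_weight = np.random.randn(3 * n_embed, n_embed).astype(np.float32)  # [Q; K; V] stacked along axis 0
qkv_bias = np.random.randn(3 * n_embed).astype(np.float32)
print(reverse_reshape_weights(qkv_weight, n_head, n_embed).shape)  # (24, 8)
print(reverse_reshape_bias(qkv_bias, n_head, n_embed).shape)       # (24,)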
def split_moe_expert_tensor(
weights: np.ndarray, parsed_parameters: Dict[str, Dict], name: str, tensor_key_mapping: dict
):
# Original merge implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022
exp_name = ""
if "ffn_gate_exps" in name:
exp_name = "gate_proj"
elif "ffn_down_exps" in name:
exp_name = "down_proj"
elif "ffn_up_exps" in name:
exp_name = "up_proj"
else:
raise ValueError(f"Cannot map expert tensor {name} in Qwen2Moe architecture.")
for tensor_name in tensor_key_mapping:
if tensor_name in name:
name = name.replace(tensor_name, tensor_key_mapping[tensor_name])
w_counter = parsed_parameters["config"].get("num_experts", 60)
for i in range(0, w_counter):
temp_name = name.replace(".weight", f".{i}.{exp_name}.weight")
exp_weight = weights[i]
parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight))

View File

@ -52,11 +52,9 @@ from .pytorch_utils import ( # noqa: F401
find_pruneable_heads_and_indices,
id_tensor_storage,
is_torch_greater_or_equal_than_1_13,
is_torch_greater_or_equal_than_2_4,
prune_conv1d_layer,
prune_layer,
prune_linear_layer,
translate_to_torch_parallel_style,
)
from .quantizers import AutoHfQuantizer, HfQuantizer
from .quantizers.quantizers_utils import get_module_from_name
@ -96,7 +94,7 @@ from .utils import (
replace_return_docstrings,
strtobool,
)
from .utils.hub import create_and_tag_model_card, get_checkpoint_shard_files
from .utils.hub import convert_file_size_to_int, create_and_tag_model_card, get_checkpoint_shard_files
from .utils.import_utils import (
ENV_VARS_TRUE_VALUES,
is_sagemaker_mp_enabled,
@ -139,7 +137,6 @@ logger = logging.get_logger(__name__)
_init_weights = True
_is_quantized = False
_is_ds_init_called = False
def is_fsdp_enabled():
@ -227,19 +224,6 @@ def set_quantized_state():
_is_quantized = False
# Skip recursive calls to deepspeed.zero.Init to avoid pinning errors.
# This issue occurs with ZeRO stage 3 when using NVMe offloading.
# For more details, refer to issue #34429.
@contextmanager
def set_zero3_state():
global _is_ds_init_called
_is_ds_init_called = True
try:
yield
finally:
_is_ds_init_called = False
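The context manager is a plain re-entrancy guard; a toy check of its contract (assuming module-level access to the flag):

with set_zero3_state():
    assert _is_ds_init_called      # nested deepspeed.zero.Init calls are skipped in here
assert not _is_ds_init_called      # the flag is restored even if the body raises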
def get_parameter_device(parameter: Union[nn.Module, "ModuleUtilsMixin"]):
try:
return next(parameter.parameters()).device
@ -375,9 +359,6 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi
Note: We fully disable this if we are using `deepspeed`
"""
if model_to_load.device.type == "meta":
return False
if len([key for key in state_dict if key.startswith(start_prefix)]) == 0:
return False
@ -392,7 +373,7 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi
return False
# If the model does, the incoming `state_dict` and the `model_to_load` must be the same dtype
first_key = next(iter(model_to_load.state_dict().keys()))
first_key = list(model_to_load.state_dict().keys())[0]
if start_prefix + first_key in state_dict:
return state_dict[start_prefix + first_key].dtype == model_to_load.state_dict()[first_key].dtype
@ -400,6 +381,92 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi
return False
def shard_checkpoint(
state_dict: Dict[str, torch.Tensor], max_shard_size: Union[int, str] = "10GB", weights_name: str = WEIGHTS_NAME
):
"""
Splits a model state dictionary into sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
given size.
The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no
optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the
limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB],
[6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB].
<Tip warning={true}>
If one of the model's weights is bigger than `max_shard_size`, it will end up in its own sub-checkpoint, which will
have a size greater than `max_shard_size`.
</Tip>
Args:
state_dict (`Dict[str, torch.Tensor]`): The state dictionary of a model to save.
max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit
(like `"5MB"`).
weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`):
The name of the model save file.
"""
logger.warning(
"Note that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using "
"split_torch_state_dict_into_shards from huggingface_hub library"
)
max_shard_size = convert_file_size_to_int(max_shard_size)
sharded_state_dicts = [{}]
last_block_size = 0
total_size = 0
storage_id_to_block = {}
for key, weight in state_dict.items():
# when bnb serialization is used the weights in the state dict can be strings
# check: https://github.com/huggingface/transformers/pull/24416 for more details
if isinstance(weight, str):
continue
else:
storage_id = id_tensor_storage(weight)
# If a `weight` shares the same underlying storage as another tensor, we put `weight` in the same `block`
if storage_id in storage_id_to_block and weight.device != torch.device("meta"):
block_id = storage_id_to_block[storage_id]
sharded_state_dicts[block_id][key] = weight
continue
weight_size = weight.numel() * dtype_byte_size(weight.dtype)
# If this weight would tip the current shard over the maximum size, we start a new shard, but only
# if the current shard already holds at least one weight.
if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0:
sharded_state_dicts.append({})
last_block_size = 0
sharded_state_dicts[-1][key] = weight
last_block_size += weight_size
total_size += weight_size
storage_id_to_block[storage_id] = len(sharded_state_dicts) - 1
# If we only have one shard, we return it
if len(sharded_state_dicts) == 1:
return {weights_name: sharded_state_dicts[0]}, None
# Otherwise, let's build the index
weight_map = {}
shards = {}
for idx, shard in enumerate(sharded_state_dicts):
shard_file = weights_name.replace(".bin", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.bin")
shard_file = shard_file.replace(
".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors"
)
shards[shard_file] = shard
for key in shard.keys():
weight_map[key] = shard_file
# Add the metadata
metadata = {"total_size": total_size}
index = {"metadata": metadata, "weight_map": weight_map}
return shards, index
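A toy run of `shard_checkpoint`: two fp32 tensors of exactly 1 048 576 bytes each, with a "1MB" (10**6 bytes) limit, land in separate shards; the sizes are illustrative only.

import torch

state_dict = {
    "a.weight": torch.zeros(256, 1024),  # 256 * 1024 * 4 bytes
    "b.weight": torch.zeros(256, 1024),
}
shards, index = shard_checkpoint(state_dict, max_shard_size="1MB")
print(list(shards))  # ['pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin']
print(index["metadata"]["total_size"])  # 2097152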
def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
"""
This is the same as
@ -946,10 +1013,7 @@ def _load_state_dict_into_meta_model(
param_to = "cpu"
if is_fsdp_enabled() and not is_local_dist_rank_0():
param_to = "meta"
val_kwargs = {}
if hasattr(module, "weight") and module.weight.__class__.__name__ == "Int8Params":
val_kwargs["requires_grad"] = False
value = type(value)(value.data.to(param_to), **val_kwargs, **value.__dict__)
value = type(value)(value.data.to(param_to), **value.__dict__)
setattr(module, tensor_name, value)
# TODO: consider removing used param_parts from state_dict before return
@ -1345,12 +1409,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# Has support for a `QuantoQuantizedCache` instance as `past_key_values`
_supports_quantized_cache = False
# A tensor parallel plan to be applied to the model when TP is enabled. For
# top-level models, this attribute is currently defined in respective model
# code. For base models, this attribute comes from
# `config.base_model_tp_plan` during `post_init`.
_tp_plan = None
@property
def dummy_inputs(self) -> Dict[str, torch.Tensor]:
"""
@ -1395,9 +1453,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
"""
self.init_weights()
self._backward_compatibility_gradient_checkpointing()
# If current model is a base model, attach `base_model_tp_plan` from config
if self.base_model is self:
self._tp_plan = self.config.base_model_tp_plan
def dequantize(self):
"""
@ -1487,14 +1542,13 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
torch_dtype=torch_dtype,
)
if is_deepspeed_zero3_enabled() and not _is_quantized and not _is_ds_init_called:
if is_deepspeed_zero3_enabled() and not _is_quantized:
import deepspeed
logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
# this immediately partitions the model across all gpus, to avoid the overhead in time
# and memory copying it on CPU or each GPU first
init_contexts = [deepspeed.zero.Init(config_dict_or_path=deepspeed_config()), set_zero3_state()]
with ContextManagers(init_contexts):
with deepspeed.zero.Init(config_dict_or_path=deepspeed_config()):
model = cls(config, **kwargs)
else:
@ -1537,7 +1591,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
"eager",
"sdpa",
"flash_attention_2",
"flex_attention",
]:
message = f'Specified `attn_implementation="{config._attn_implementation}"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)'
if cls._supports_flash_attn_2:
@ -3429,11 +3482,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# Cache path to the GGUF file
gguf_path = None
tp_plan = kwargs.pop("tp_plan", None)
if tp_plan is not None and tp_plan != "auto":
# TODO: we can relax this check when we support taking tp_plan from a json file, for example.
raise ValueError(f"tp_plan supports 'auto' only for now but got {tp_plan}.")
if is_fsdp_enabled():
low_cpu_mem_usage = True
@ -3617,11 +3665,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
if hf_quantizer is not None:
hf_quantizer.validate_environment(
torch_dtype=torch_dtype,
from_tf=from_tf,
from_flax=from_flax,
device_map=device_map,
weights_only=weights_only,
torch_dtype=torch_dtype, from_tf=from_tf, from_flax=from_flax, device_map=device_map
)
torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype)
device_map = hf_quantizer.update_device_map(device_map)
@ -4039,32 +4083,18 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# Instantiate model.
init_contexts = [no_init_weights(_enable=_fast_init)]
tp_device = None
if is_deepspeed_zero3_enabled() and not is_quantized and not _is_ds_init_called:
if is_deepspeed_zero3_enabled() and not is_quantized:
import deepspeed
logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
init_contexts = [
deepspeed.zero.Init(config_dict_or_path=deepspeed_config()),
set_zero3_state(),
] + init_contexts
init_contexts = [deepspeed.zero.Init(config_dict_or_path=deepspeed_config())] + init_contexts
elif low_cpu_mem_usage:
if not is_accelerate_available():
raise ImportError(
f"Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`"
)
init_contexts.append(init_empty_weights())
elif tp_plan is not None:
if not torch.distributed.is_initialized():
raise ValueError("Tensor Parallel requires torch.distributed to be initialized first.")
# Detect the accelerator on the machine. If no accelerator is available, it returns CPU.
device_type = torch._C._get_accelerator().type
device_module = torch.get_device_module(device_type)
# Get device with index assuming equal number of devices per host
tp_device = torch.device(device_type, torch.distributed.get_rank() % device_module.device_count())
init_contexts.append(tp_device)
if is_deepspeed_zero3_enabled() and is_quantized:
init_contexts.append(set_quantized_state())
@ -4198,38 +4228,32 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
if dtype_orig is not None:
torch.set_default_dtype(dtype_orig)
load_contexts = []
# Make sure we load onto targeted device
if tp_device is not None:
load_contexts.append(tp_device)
with ContextManagers(load_contexts):
(
model,
missing_keys,
unexpected_keys,
mismatched_keys,
offload_index,
error_msgs,
) = cls._load_pretrained_model(
model,
state_dict,
loaded_state_dict_keys, # XXX: rename?
resolved_archive_file,
pretrained_model_name_or_path,
ignore_mismatched_sizes=ignore_mismatched_sizes,
sharded_metadata=sharded_metadata,
_fast_init=_fast_init,
low_cpu_mem_usage=low_cpu_mem_usage,
device_map=device_map,
offload_folder=offload_folder,
offload_state_dict=offload_state_dict,
dtype=torch_dtype,
hf_quantizer=hf_quantizer,
keep_in_fp32_modules=keep_in_fp32_modules,
gguf_path=gguf_path,
weights_only=weights_only,
)
(
model,
missing_keys,
unexpected_keys,
mismatched_keys,
offload_index,
error_msgs,
) = cls._load_pretrained_model(
model,
state_dict,
loaded_state_dict_keys, # XXX: rename?
resolved_archive_file,
pretrained_model_name_or_path,
ignore_mismatched_sizes=ignore_mismatched_sizes,
sharded_metadata=sharded_metadata,
_fast_init=_fast_init,
low_cpu_mem_usage=low_cpu_mem_usage,
device_map=device_map,
offload_folder=offload_folder,
offload_state_dict=offload_state_dict,
dtype=torch_dtype,
hf_quantizer=hf_quantizer,
keep_in_fp32_modules=keep_in_fp32_modules,
gguf_path=gguf_path,
weights_only=weights_only,
)
# make sure token embedding weights are still tied if needed
model.tie_weights()
@ -4313,16 +4337,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
}
return model, loading_info
if tp_plan is not None:
assert tp_device is not None, "tp_device not set!"
if not model.supports_tp_plan:
raise NotImplementedError("This model does not have a tensor parallel plan.")
# Assuming sharding the model onto the world
world_size = torch.distributed.get_world_size()
device_mesh = torch.distributed.init_device_mesh(tp_device.type, (world_size,))
# Apply Tensor Parallelism
model.tensor_parallel(device_mesh)
return model
@classmethod
@ -5012,56 +5026,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
return self.hf_quantizer.is_trainable
@property
def supports_tp_plan(self):
"""
Returns whether the model has a tensor parallelism plan.
"""
if self._tp_plan is not None:
return True
# Check if base model has a TP plan
if getattr(self.base_model, "_tp_plan", None) is not None:
return True
return False
def tensor_parallel(self, device_mesh):
"""
Tensor parallelize the model across the given device mesh.
Args:
device_mesh (`torch.distributed.DeviceMesh`):
The device mesh to use for tensor parallelism.
"""
if not is_torch_greater_or_equal_than_2_4:
raise EnvironmentError("tensor parallel is only supported for `torch>=2.4`.")
# Tensor parallelize a nn.Module based on the `_tp_plan` attribute of the module.
# No op if `_tp_plan` attribute does not exist under the module.
# This is a helper function to be used with `model.apply` to recursively
# parallelize a model.
def tplize(mod: torch.nn.Module) -> None:
tp_plan = getattr(mod, "_tp_plan", None)
if tp_plan is None:
return
logger.debug(f"Applying tensor parallel to {mod.__class__.__name__}: {tp_plan}")
# In model configs, we use a neutral type (string) to specify
# parallel styles, here we translate them into torch TP types.
# Using tree_map because `tp_plan` is a dict.
tp_plan = torch.utils._pytree.tree_map(
translate_to_torch_parallel_style,
tp_plan,
)
# Apply TP to current module.
torch.distributed.tensor.parallel.parallelize_module(
mod,
device_mesh=device_mesh,
parallelize_plan=tp_plan,
)
# `apply` is a native method of `nn.Module` that recursively applies a
# function to every submodule.
self.apply(tplize)
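A hedged end-to-end sketch of the TP path wired up above, assuming `torchrun` with one process per GPU and a checkpoint whose config defines `base_model_tp_plan`; the model id is a placeholder.

# torchrun --nproc-per-node 4 tp_demo.py
import os
import torch
from transformers import AutoModelForCausalLM

torch.distributed.init_process_group("nccl")
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
# tp_plan="auto" routes through the code above: per-rank device init,
# then tensor_parallel(device_mesh) over the whole world.
model = AutoModelForCausalLM.from_pretrained("org/llama-style-model", tp_plan="auto")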
@property
def loss_function(self):
if getattr(self.config, "loss_type", None) is not None:

View File

@ -177,7 +177,6 @@ from . import (
nougat,
nystromformer,
olmo,
olmo_1124,
olmoe,
omdet_turbo,
oneformer,

View File

@ -195,7 +195,6 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("nougat", "VisionEncoderDecoderConfig"),
("nystromformer", "NystromformerConfig"),
("olmo", "OlmoConfig"),
("olmo_1124", "Olmo1124Config"),
("olmoe", "OlmoeConfig"),
("omdet-turbo", "OmDetTurboConfig"),
("oneformer", "OneFormerConfig"),
@ -511,7 +510,6 @@ MODEL_NAMES_MAPPING = OrderedDict(
("nougat", "Nougat"),
("nystromformer", "Nyströmformer"),
("olmo", "OLMo"),
("olmo_1124", "OLMo November 2024"),
("olmoe", "OLMoE"),
("omdet-turbo", "OmDet-Turbo"),
("oneformer", "OneFormer"),

View File

@ -68,7 +68,7 @@ else:
("convnextv2", ("ConvNextImageProcessor",)),
("cvt", ("ConvNextImageProcessor",)),
("data2vec-vision", ("BeitImageProcessor",)),
("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")),
("deformable_detr", ("DeformableDetrImageProcessor",)),
("deit", ("DeiTImageProcessor",)),
("depth_anything", ("DPTImageProcessor",)),
("deta", ("DetaImageProcessor",)),

View File

@ -184,7 +184,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
("nllb-moe", "NllbMoeModel"),
("nystromformer", "NystromformerModel"),
("olmo", "OlmoModel"),
("olmo_1124", "Olmo1124Model"),
("olmoe", "OlmoeModel"),
("omdet-turbo", "OmDetTurboForObjectDetection"),
("oneformer", "OneFormerModel"),
@ -517,7 +516,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("mvp", "MvpForCausalLM"),
("nemotron", "NemotronForCausalLM"),
("olmo", "OlmoForCausalLM"),
("olmo_1124", "Olmo1124ForCausalLM"),
("olmoe", "OlmoeForCausalLM"),
("open-llama", "OpenLlamaForCausalLM"),
("openai-gpt", "OpenAIGPTLMHeadModel"),

View File

@ -348,7 +348,6 @@ else:
),
),
("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
("olmo_1124", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
(
"omdet-turbo",

View File

@ -249,7 +249,7 @@ def convert_blip2_checkpoint(
{"image": original_pixel_values, "text_input": [caption]}, match_head="itm"
)
logits = hf_model(
pixel_values=pixel_values,
pixel_values=original_pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
use_image_text_matching_head=True,
@ -274,7 +274,7 @@ def convert_blip2_checkpoint(
{"image": original_pixel_values, "text_input": [caption]}, match_head="itc"
)
logits = hf_model(
pixel_values=pixel_values,
pixel_values=original_pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
use_image_text_matching_head=False,

View File

@ -2203,7 +2203,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
@ -2326,7 +2326,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(

View File

@ -153,7 +153,7 @@ class Blip2Processor(ProcessorMixin):
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
# cast to desired return tensors type

View File

@ -890,7 +890,7 @@ class CohereModel(CoherePreTrainedModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -1068,7 +1068,7 @@ class CohereModel(CoherePreTrainedModel):
return causal_mask
# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere
# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere
class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

View File

@ -29,7 +29,6 @@ except OptionalDependencyNotAvailable:
else:
_import_structure["feature_extraction_deformable_detr"] = ["DeformableDetrFeatureExtractor"]
_import_structure["image_processing_deformable_detr"] = ["DeformableDetrImageProcessor"]
_import_structure["image_processing_deformable_detr_fast"] = ["DeformableDetrImageProcessorFast"]
try:
if not is_torch_available():
@ -55,7 +54,6 @@ if TYPE_CHECKING:
else:
from .feature_extraction_deformable_detr import DeformableDetrFeatureExtractor
from .image_processing_deformable_detr import DeformableDetrImageProcessor
from .image_processing_deformable_detr_fast import DeformableDetrImageProcessorFast
try:
if not is_torch_available():

View File

@ -224,16 +224,16 @@ class DepthAnythingFeatureFusionStage(nn.Module):
hidden_states = hidden_states[::-1]
fused_hidden_states = []
fused_hidden_state = None
# first layer only uses the last hidden_state
size = hidden_states[1].shape[2:]
fused_hidden_state = self.layers[0](hidden_states[0], size=size)
fused_hidden_states.append(fused_hidden_state)
for idx, (hidden_state, layer) in enumerate(zip(hidden_states, self.layers)):
size = hidden_states[idx + 1].shape[2:] if idx != (len(hidden_states) - 1) else None
# looping from the last layer to the second
for idx, (hidden_state, layer) in enumerate(zip(hidden_states[1:], self.layers[1:])):
size = hidden_states[1:][idx + 1].shape[2:] if idx != (len(hidden_states[1:]) - 1) else None
if fused_hidden_state is None:
# first layer only uses the last hidden_state
fused_hidden_state = layer(hidden_state, size=size)
else:
fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size)
fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size)
fused_hidden_states.append(fused_hidden_state)

View File

@ -416,7 +416,7 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
"""
Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
created using from_dict and kwargs e.g. `DetrImageProcessorFast.from_pretrained(checkpoint, size=600,
created using from_dict and kwargs e.g. `DetrImageProcessor.from_pretrained(checkpoint, size=600,
max_size=800)`
"""
image_processor_dict = image_processor_dict.copy()
@ -863,7 +863,6 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
input_data_format = infer_channel_dimension_format(images[0])
if input_data_format == ChannelDimension.LAST:
images = [image.permute(2, 0, 1).contiguous() for image in images]
input_data_format = ChannelDimension.FIRST
if do_rescale and do_normalize:
# fused rescale and normalize

View File

@ -689,13 +689,12 @@ class DPTFeatureFusionStage(nn.Module):
hidden_states = hidden_states[::-1]
fused_hidden_states = []
fused_hidden_state = None
for hidden_state, layer in zip(hidden_states, self.layers):
if fused_hidden_state is None:
# first layer only uses the last hidden_state
fused_hidden_state = layer(hidden_state)
else:
fused_hidden_state = layer(fused_hidden_state, hidden_state)
# first layer only uses the last hidden_state
fused_hidden_state = self.layers[0](hidden_states[0])
fused_hidden_states.append(fused_hidden_state)
# looping from the last layer to the second
for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]):
fused_hidden_state = layer(fused_hidden_state, hidden_state)
fused_hidden_states.append(fused_hidden_state)
return fused_hidden_states
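The restructured loop makes the fusion order explicit: the deepest feature is refined first, then fused with progressively shallower ones. A minimal stand-in with strings instead of feature maps traces the same data flow:

feats = ["f3", "f2", "f1", "f0"]   # hidden states already reversed: deepest first
fused = f"layer0({feats[0]})"      # first layer sees only the deepest feature
outputs = [fused]
for i, feat in enumerate(feats[1:], start=1):
    fused = f"layer{i}({fused}, {feat})"   # fuse with the next shallower feature
    outputs.append(fused)
print(outputs[-1])  # layer3(layer2(layer1(layer0(f3), f2), f1), f0)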

View File

@ -1277,18 +1277,12 @@ class FalconForCausalLM(FalconPreTrainedModel, GenerationMixin):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
num_logits_to_keep: int = 0,
) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
num_logits_to_keep (`int`, *optional*):
Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only the last token's logits are needed for generation, and computing them
only for that token can save significant memory for long sequences or large vocabulary sizes.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -1308,7 +1302,7 @@ class FalconForCausalLM(FalconPreTrainedModel, GenerationMixin):
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
lm_logits = self.lm_head(hidden_states)
loss = None
if labels is not None:

View File

@ -346,7 +346,7 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
):
# Overwritten -- in specific circumstances we don't want to forward image inputs to the model
if past_key_values is not None:
if past_key_values:
input_ids = input_ids[:, -1:]
position_ids = kwargs.get("position_ids", None)
@ -355,7 +355,7 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -1:]
position_ids = position_ids[:, -1].unsqueeze(-1)
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
@ -377,12 +377,3 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
}
)
return model_inputs
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
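For reference, what the removed `_reorder_cache` did on a toy cached state: rows are gathered according to the beam indices chosen at the current step (values are illustrative).

import torch

past_state = torch.arange(6).reshape(3, 2)   # 3 beams, toy per-layer state
beam_idx = torch.tensor([2, 0, 0])           # beam 2 picked once, beam 0 twice
print(past_state.index_select(0, beam_idx))  # rows reordered to [2, 0, 0]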

View File

@ -720,10 +720,7 @@ class GemmaModel(GemmaPreTrainedModel):
[GemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warning("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@ -808,7 +805,7 @@ class GemmaModel(GemmaPreTrainedModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -985,7 +982,6 @@ class GemmaModel(GemmaPreTrainedModel):
class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
_tp_plan = {"lm_head": "colwise_rep"}
def __init__(self, config):
super().__init__(config)

View File

@ -886,7 +886,7 @@ class GemmaModel(LlamaModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)

View File

@ -41,7 +41,7 @@ from ...utils import (
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal,
is_torch_greater_or_equal,
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
)
@ -51,8 +51,6 @@ from .configuration_gemma2 import Gemma2Config
if is_flash_attn_2_available():
from ...modeling_flash_attention_utils import _flash_attention_forward
if is_torch_greater_or_equal("2.5"):
from torch.nn.attention.flex_attention import flex_attention
logger = logging.get_logger(__name__)
@ -170,127 +168,6 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
def eager_attention_forward(config, query, key, value, mask, **_kwargs):
key_states = repeat_kv(key, config.num_key_value_groups)
value_states = repeat_kv(value, config.num_key_value_groups)
attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * config.scaling
if config.attn_logit_softcapping is not None:
attn_weights = attn_weights / config.attn_logit_softcapping
attn_weights = torch.tanh(attn_weights)
attn_weights = attn_weights * config.attn_logit_softcapping
if mask is not None: # no matter the length, we just slice it
causal_mask = mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=config.attention_dropout, training=config.training)
attn_output = torch.matmul(attn_weights, value_states)
attn_output = attn_output.transpose(1, 2).contiguous()
return attn_output, attn_weights
def flash_attention_forward(config, query, key, value, mask, target_dtype=torch.float16, **_kwargs):
if mask is not None:
seq_len = mask.shape[1]
query = query[:, :, :seq_len]
value = value[:, :, :seq_len]
# TODO: These transposes are quite inefficient, but Flash Attention requires the layout
# [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor rotary embedding
query_states = query.transpose(1, 2)
key_states = key.transpose(1, 2)
value_states = value.transpose(1, 2)
dropout_rate = config.attention_dropout if config.training else 0.0
input_dtype = query_states.dtype
if input_dtype == torch.float32:
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
mask,
seq_len,
dropout=dropout_rate,
softmax_scale=config.scaling,
is_causal=config.is_causal,
sliding_window=config.sliding_window,
use_top_left_mask=config._flash_attn_uses_top_left_mask,
softcap=config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
)
return attn_output, None
def flex_attention_forward(config, query, key, value, mask, output_attentions=False, **_kwargs):
def tanh_softcap(score, b, h, q_idx, kv_idx):
soft_cap = config.attn_logit_softcapping
score = soft_cap * torch.tanh(score / soft_cap)
if mask is not None:
return score + mask[b][0][q_idx][kv_idx]
return score
attn_output = flex_attention(
query,
key,
value,
score_mod=tanh_softcap,
enable_gqa=True,
scale=config.scaling,
return_lse=output_attentions,
)
if not output_attentions:
return attn_output, None
else:
return attn_output[0], attn_output[1]
def sdpa_attention_forward(config, query, key, value, mask, **_kwargs):
key = repeat_kv(key, config.num_key_value_groups)
value = repeat_kv(value, config.num_key_value_groups)
causal_mask = mask
if mask is not None:
causal_mask = causal_mask[:, :, :, : key.shape[-2]]
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if query.device.type == "cuda" and causal_mask is not None:
query = query.contiguous()
key = key.contiguous()
value = value.contiguous()
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
is_causal = True if causal_mask is None and query.shape[1] > 1 else False
attn_output = torch.nn.functional.scaled_dot_product_attention(
query,
key,
value,
attn_mask=causal_mask,
dropout_p=config.attention_dropout if config.training else 0.0,
is_causal=is_causal,
scale=config.scaling,
)
return attn_output, None
GEMMA2_ATTENTION_FUNCTION = {
"flash_attention_2": flash_attention_forward,
"flex_attention": flex_attention_forward,
"eager": eager_attention_forward,
"sdpa": sdpa_attention_forward,
}
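The eager and flex paths above apply Gemma 2's logit softcapping explicitly (the flash path does it inside the kernel via `softcap`); a standalone check of the `cap * tanh(logits / cap)` squashing, with an assumed cap of 50:

import torch

cap = 50.0
logits = torch.tensor([-200.0, -10.0, 0.0, 10.0, 200.0])
print(cap * torch.tanh(logits / cap))
# tensor([-49.9665,  -9.8688,   0.0000,   9.8688,  49.9665])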
class Gemma2Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
@ -298,6 +175,12 @@ class Gemma2Attention(nn.Module):
super().__init__()
self.config = config
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
"lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
self.attention_dropout = config.attention_dropout
self.hidden_size = config.hidden_size
@ -309,8 +192,7 @@ class Gemma2Attention(nn.Module):
self.rope_theta = config.rope_theta
self.is_causal = True
self.scaling = config.query_pre_attn_scalar**-0.5
self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
self.attn_logit_softcapping = config.attn_logit_softcapping
if self.hidden_size % self.num_heads != 0:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
@ -326,6 +208,7 @@ class Gemma2Attention(nn.Module):
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
def forward(
self,
@ -360,14 +243,145 @@ class Gemma2Attention(nn.Module):
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
if output_attentions and self.config._attn_implementation in ["sdpa", "flash_attention_2"]:
logger.warning_once("Setting `attention_type` to `flex_attention` because `output_attentions=True`")
attention_type = "eager"
else:
attention_type = self.config._attn_implementation
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_output, attn_weights = GEMMA2_ATTENTION_FUNCTION[attention_type](
self, query_states, key_states, value_states, attention_mask, output_attentions=output_attentions
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
if self.config.attn_logit_softcapping is not None:
attn_weights = attn_weights / self.config.attn_logit_softcapping
attn_weights = torch.tanh(attn_weights)
attn_weights = attn_weights * self.config.attn_logit_softcapping
if attention_mask is not None: # no matter the length, we just slice it
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.view(bsz, q_len, -1)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class Gemma2FlashAttention2(Gemma2Attention):
"""
Gemma2 flash attention module. This module inherits from `Gemma2Attention`, as the weights of the module stay
untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
output_attentions = False
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# Flash attention requires the input to have the shape
# batch_size x seq_length x num_heads x head_dim
# therefore we just need to keep the original shape
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {
"sin": sin,
"cos": cos,
"sliding_window": self.sliding_window,
"cache_position": cache_position,
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
if attention_mask is not None:
seq_len = attention_mask.shape[1]
key_states = key_states[:, :, :seq_len]
value_states = value_states[:, :, :seq_len]
# TODO: These transposes are quite inefficient, but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
# to be able to avoid many of these transpose/reshape/view.
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
dropout_rate = self.attention_dropout if self.training else 0.0
# In PEFT, usually we cast the layer norms in float32 for training stability reasons,
# therefore the input hidden states get silently cast to float32. Hence, we need to
# cast them back to the correct dtype just to be sure everything works as expected.
# This might slow down training & inference, so it is recommended to not cast the LayerNorms
# in fp32. (Gemma2RMSNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
q_len,
dropout=dropout_rate,
softmax_scale=self.scaling,
is_causal=self.is_causal,
sliding_window=self.sliding_window,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
softcap=self.config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
)
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
@ -379,37 +393,116 @@ class Gemma2Attention(nn.Module):
return attn_output, attn_weights, past_key_value
class Gemma2FlashAttention2(Gemma2Attention):
def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
super().__init__(config, layer_idx)
self.config._attn_implementation = "flash_attention_2"
logger.warning_once(
"The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
"attribute of the `GemmaAttention` class! It will be removed in v4.48"
)
class Gemma2SdpaAttention(Gemma2Attention):
def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
super().__init__(config, layer_idx)
self.config._attn_implementation = "sdpa"
logger.warning_once(
"The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
"attribute of the `GemmaAttention` class! It will be removed in v4.48"
"""
Gemma2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`Gemma2Attention`, as the weights of the module stay untouched. The only changes are in the forward pass, to adapt to
the SDPA API.
"""
# Adapted from Gemma2Attention.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"Gemma2Model is using Gemma2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
)
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {
"sin": sin,
"cos": cos,
"sliding_window": self.sliding_window,
"cache_position": cache_position,
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
causal_mask = attention_mask
if attention_mask is not None:
causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if query_states.device.type == "cuda" and causal_mask is not None:
query_states = query_states.contiguous()
key_states = key_states.contiguous()
value_states = value_states.contiguous()
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
is_causal = True if causal_mask is None and q_len > 1 else False
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=causal_mask,
dropout_p=self.attention_dropout if self.training else 0.0,
is_causal=is_causal,
scale=self.scaling,
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.view(bsz, q_len, -1)
attn_output = self.o_proj(attn_output)
return attn_output, None, past_key_value
GEMMA2_ATTENTION_CLASSES = {
"eager": Gemma2Attention,
"flash_attention_2": Gemma2FlashAttention2,
"sdpa": Gemma2SdpaAttention,
}
class Gemma2DecoderLayer(nn.Module):
def __init__(self, config: Gemma2Config, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
self.config = config
self.is_sliding = not bool(layer_idx % 2)
self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
self.self_attn = GEMMA2_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
self.mlp = Gemma2MLP(config)
self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.config = config
self.is_sliding = not bool(layer_idx % 2)
self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.sliding_window = config.sliding_window
@ -424,6 +517,25 @@ class Gemma2DecoderLayer(nn.Module):
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*):
attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
query_sequence_length, key_sequence_length)` if default attention is used.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence
kwargs (`dict`, *optional*):
Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
into the model
"""
if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding
# Flash-attn is a 2D tensor
if self.config._attn_implementation == "flash_attention_2":
@ -628,10 +740,7 @@ class Gemma2Model(Gemma2PreTrainedModel):
[Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warning("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@ -711,7 +820,7 @@ class Gemma2Model(Gemma2PreTrainedModel):
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -852,7 +961,6 @@ class Gemma2Model(Gemma2PreTrainedModel):
class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
_tp_plan = {"lm_head": "colwise_rep"}
def __init__(self, config):
super().__init__(config)

View File

@ -29,17 +29,18 @@ from ...modeling_outputs import (
from ...utils import (
is_flash_attn_2_available,
is_flash_attn_greater_or_equal,
is_torch_greater_or_equal,
is_flash_attn_greater_or_equal_2_10,
logging,
)
from ..gemma.modeling_gemma import (
GemmaAttention,
GemmaDecoderLayer,
GemmaForCausalLM,
GemmaForSequenceClassification,
GemmaForTokenClassification,
GemmaModel,
GemmaPreTrainedModel,
GemmaRMSNorm,
GemmaRotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
@ -48,9 +49,6 @@ from ..gemma.modeling_gemma import (
if is_flash_attn_2_available():
from ...modeling_flash_attention_utils import _flash_attention_forward
if is_torch_greater_or_equal("2.5"):
from torch.nn.attention.flex_attention import flex_attention
_CHECKPOINT_FOR_DOC = "google/gemma2-7b"
@ -209,166 +207,13 @@ class Gemma2MLP(nn.Module):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
class Gemma2RotaryEmbedding(GemmaRotaryEmbedding):
pass
def eager_attention_forward(config, query, key, value, mask, **_kwargs):
key_states = repeat_kv(key, config.num_key_value_groups)
value_states = repeat_kv(value, config.num_key_value_groups)
attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * config.scaling
if config.attn_logit_softcapping is not None:
attn_weights = attn_weights / config.attn_logit_softcapping
attn_weights = torch.tanh(attn_weights)
attn_weights = attn_weights * config.attn_logit_softcapping
if mask is not None: # no matter the length, we just slice it
causal_mask = mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=config.attention_dropout, training=config.training)
attn_output = torch.matmul(attn_weights, value_states)
attn_output = attn_output.transpose(1, 2).contiguous()
return attn_output, attn_weights
def flash_attention_forward(config, query, key, value, mask, target_dtype=torch.float16, **_kwargs):
if mask is not None:
seq_len = mask.shape[1]
query = query[:, :, :seq_len]
value = value[:, :, :seq_len]
# TODO: These transposes are quite inefficient, but Flash Attention requires the layout
# [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor rotary embedding
query_states = query.transpose(1, 2)
key_states = key.transpose(1, 2)
value_states = value.transpose(1, 2)
dropout_rate = config.attention_dropout if config.training else 0.0
input_dtype = query_states.dtype
if input_dtype == torch.float32:
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
mask,
seq_len,
dropout=dropout_rate,
softmax_scale=config.scaling,
is_causal=config.is_causal,
sliding_window=config.sliding_window,
use_top_left_mask=config._flash_attn_uses_top_left_mask,
softcap=config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
)
return attn_output, None
def flex_attention_forward(config, query, key, value, mask, output_attentions=False, **_kwargs):
def tanh_softcap(score, b, h, q_idx, kv_idx):
soft_cap = config.attn_logit_softcapping
score = soft_cap * torch.tanh(score / soft_cap)
if mask is not None:
return score + mask[b][0][q_idx][kv_idx]
return score
attn_output = flex_attention(
query,
key,
value,
score_mod=tanh_softcap,
enable_gqa=True,
scale=config.scaling,
return_lse=output_attentions,
)
if not output_attentions:
return attn_output, None
else:
return attn_output[0], attn_output[1]
def sdpa_attention_forward(config, query, key, value, mask, **_kwargs):
key = repeat_kv(key, config.num_key_value_groups)
value = repeat_kv(value, config.num_key_value_groups)
causal_mask = mask
if mask is not None:
causal_mask = causal_mask[:, :, :, : key.shape[-2]]
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if query.device.type == "cuda" and causal_mask is not None:
query = query.contiguous()
key = key.contiguous()
value = value.contiguous()
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
is_causal = True if causal_mask is None and query.shape[1] > 1 else False
attn_output = torch.nn.functional.scaled_dot_product_attention(
query,
key,
value,
attn_mask=causal_mask,
dropout_p=config.attention_dropout if config.training else 0.0,
is_causal=is_causal,
scale=config.scaling,
)
return attn_output, None
GEMMA2_ATTENTION_FUNCTION = {
"flash_attention_2": flash_attention_forward,
"flex_attention": flex_attention_forward,
"eager": eager_attention_forward,
"sdpa": sdpa_attention_forward,
}
class Gemma2Attention(nn.Module):
class Gemma2Attention(GemmaAttention):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
super().__init__()
self.config = config
self.layer_idx = layer_idx
self.attention_dropout = config.attention_dropout
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = config.head_dim
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.is_causal = True
super().__init__(config, layer_idx)
self.scaling = config.query_pre_attn_scalar**-0.5
self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
self.attn_logit_softcapping = config.attn_logit_softcapping
if self.hidden_size % self.num_heads != 0:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
self.rotary_emb = Gemma2RotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
def forward(
self,
@ -403,14 +248,145 @@ class Gemma2Attention(nn.Module):
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
if output_attentions and self.config._attn_implementation in ["sdpa", "flash_attention_2"]:
logger.warning_once("Setting `attention_type` to `flex_attention` because `output_attentions=True`")
attention_type = "eager"
else:
attention_type = self.config._attn_implementation
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_output, attn_weights = GEMMA2_ATTENTION_FUNCTION[attention_type](
self, query_states, key_states, value_states, attention_mask, output_attentions=output_attentions
)
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
if self.config.attn_logit_softcapping is not None:
attn_weights = attn_weights / self.config.attn_logit_softcapping
attn_weights = torch.tanh(attn_weights)
attn_weights = attn_weights * self.config.attn_logit_softcapping
if attention_mask is not None: # no matter the length, we just slice it
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.view(bsz, q_len, -1)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
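The softcapping block in the eager path above is a tanh-based soft clamp: dividing by the cap, applying tanh, and rescaling bounds the logits to (-cap, cap) without hard truncation. A standalone sketch of the same transform (values are illustrative):

```python
import torch

def softcap(logits: torch.Tensor, cap: float) -> torch.Tensor:
    # tanh squashes the scaled logits into (-1, 1); multiplying by cap
    # bounds them to (-cap, cap) while staying near-linear around zero
    return cap * torch.tanh(logits / cap)

scores = torch.tensor([-500.0, -5.0, 0.0, 5.0, 500.0])
print(softcap(scores, cap=50.0))  # extreme logits saturate near +/-50; small ones pass through almost unchanged
```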
class Gemma2FlashAttention2(Gemma2Attention):
"""
Gemma2 flash attention module. This module inherits from `Gemma2Attention` as the weights of the module stay
untouched. The only required change is in the forward pass, which needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
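A hypothetical standalone equivalent of the version gate behind `is_flash_attn_greater_or_equal_2_10` (the helper name and fallback behavior here are assumptions, not the library's code): flash-attn switched to bottom-right-aligned causal masks in 2.1, so only older installs need the top-left-mask flag.

```python
from importlib.metadata import PackageNotFoundError, version

from packaging.version import parse

def flash_attn_uses_top_left_mask() -> bool:
    # True only when the installed flash-attn predates the 2.1 mask change
    try:
        return parse(version("flash_attn")) < parse("2.1.0")
    except PackageNotFoundError:
        return False  # flash-attn absent: the flag is never consulted
```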
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
output_attentions = False
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# Flash attention requires the input to have the shape
# batch_size x seq_length x num_heads x head_dim
# therefore we just need to keep the original shape
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {
"sin": sin,
"cos": cos,
"sliding_window": self.sliding_window,
"cache_position": cache_position,
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
if attention_mask is not None:
seq_len = attention_mask.shape[1]
key_states = key_states[:, :, :seq_len]
value_states = value_states[:, :, :seq_len]
# TODO: These transposes are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
# to be able to avoid many of these transpose/reshape/view.
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
dropout_rate = self.attention_dropout if self.training else 0.0
# In PEFT, we usually cast the layer norms to float32 for training stability,
# so the input hidden states get silently cast to float32. Hence, we need to
# cast them back to the correct dtype just to be sure everything works as expected.
# This might slow down training & inference, so it is recommended not to cast the
# LayerNorms to fp32. (Gemma2RMSNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
q_len,
dropout=dropout_rate,
softmax_scale=self.scaling,
is_causal=self.is_causal,
sliding_window=self.sliding_window,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
softcap=self.config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
)
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
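A minimal, self-contained sketch of the dtype-restore logic above (hypothetical tensors and fallback): leave non-fp32 inputs alone, otherwise prefer the active autocast dtype before falling back to a known weight dtype.

```python
import torch

def restore_half_precision(x: torch.Tensor, fallback: torch.dtype) -> torch.Tensor:
    if x.dtype != torch.float32:
        return x  # already half precision, nothing to do
    if torch.is_autocast_enabled():
        return x.to(torch.get_autocast_gpu_dtype())
    return x.to(fallback)

q = torch.randn(2, 16, 8, 64)  # fp32, e.g. silently upcast by fp32 layer norms
q = restore_half_precision(q, fallback=torch.bfloat16)
print(q.dtype)  # torch.bfloat16 when no autocast context is active
```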
@ -422,37 +398,105 @@ class Gemma2Attention(nn.Module):
return attn_output, attn_weights, past_key_value
class Gemma2FlashAttention2(Gemma2Attention):
def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
super().__init__(config, layer_idx)
self.config._attn_implementation = "flash_attention_2"
logger.warning_once(
"The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
"attribute of the `GemmaAttention` class! It will be removed in v4.48"
)
class Gemma2SdpaAttention(Gemma2Attention):
def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
super().__init__(config, layer_idx)
self.config._attn_implementation = "sdpa"
logger.warning_once(
"The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
"attribute of the `GemmaAttention` class! It will be removed in v4.48"
"""
Gemma2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`Gemma2Attention` as the weights of the module stay untouched. The only changes are in the forward pass, to adapt to
the SDPA API.
"""
# Adapted from Gemma2Attention.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"Gemma2Model is using Gemma2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
)
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {
"sin": sin,
"cos": cos,
"sliding_window": self.sliding_window,
"cache_position": cache_position,
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
causal_mask = attention_mask
if attention_mask is not None:
causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if query_states.device.type == "cuda" and causal_mask is not None:
query_states = query_states.contiguous()
key_states = key_states.contiguous()
value_states = value_states.contiguous()
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
is_causal = True if causal_mask is None and q_len > 1 else False
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=causal_mask,
dropout_p=self.attention_dropout if self.training else 0.0,
is_causal=is_causal,
scale=self.scaling,
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.view(bsz, q_len, -1)
class Gemma2DecoderLayer(nn.Module):
attn_output = self.o_proj(attn_output)
return attn_output, None, past_key_value
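Both SDPA paths above rely on `repeat_kv` to expand grouped key/value heads before the matmul. A hedged sketch of that expansion (shapes are illustrative): each KV head is tiled `n_rep` times along the head axis so key/value match the number of query heads.

```python
import torch

def repeat_kv_sketch(hidden: torch.Tensor, n_rep: int) -> torch.Tensor:
    batch, num_kv_heads, seq_len, head_dim = hidden.shape
    if n_rep == 1:
        return hidden
    # insert a repeat axis, broadcast, then fold it into the head dimension
    hidden = hidden[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, seq_len, head_dim)
    return hidden.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)

kv = torch.randn(1, 4, 10, 64)        # 4 KV heads
print(repeat_kv_sketch(kv, 4).shape)  # torch.Size([1, 16, 10, 64]) for 16 query heads
```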
class Gemma2DecoderLayer(GemmaDecoderLayer):
def __init__(self, config: Gemma2Config, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
super().__init__(config, layer_idx)
self.config = config
self.is_sliding = not bool(layer_idx % 2)
self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
self.mlp = Gemma2MLP(config)
self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.sliding_window = config.sliding_window
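The `not bool(layer_idx % 2)` test above alternates attention types by depth. A worked illustration (the window size is hypothetical): even-indexed layers get a sliding window, odd-indexed layers attend globally.

```python
config_sliding_window = 4096
for layer_idx in range(4):
    is_sliding = not bool(layer_idx % 2)
    sliding_window = config_sliding_window if is_sliding else None
    print(layer_idx, sliding_window)  # 0 4096 / 1 None / 2 4096 / 3 None
```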
@ -609,7 +653,7 @@ class Gemma2Model(GemmaModel, Gemma2PreTrainedModel):
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)

View File

@ -708,8 +708,6 @@ class GlmModel(GlmPreTrainedModel):
dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta
)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@ -789,7 +787,7 @@ class GlmModel(GlmPreTrainedModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -969,7 +967,6 @@ class GlmModel(GlmPreTrainedModel):
class GlmForCausalLM(GlmPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
_tp_plan = {"lm_head": "colwise_rep"}
def __init__(self, config: GlmConfig):
super().__init__(config)

View File

@ -1471,7 +1471,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
@ -1610,7 +1610,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(

View File

@ -148,7 +148,7 @@ class InstructBlipProcessor(ProcessorMixin):
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
# cast to desired return tensors type after concatenating

View File

@ -141,16 +141,6 @@ class LlamaConfig(PretrainedConfig):
model_type = "llama"
keys_to_ignore_at_inference = ["past_key_values"]
# Default tensor parallel plan for base model `LlamaModel`
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -21,6 +21,7 @@ import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
@ -239,7 +240,25 @@ class LlamaMLP(nn.Module):
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
if self.config.pretraining_tp > 1:
slice = self.intermediate_size // self.config.pretraining_tp
gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
up_proj_slices = self.up_proj.weight.split(slice, dim=0)
down_proj_slices = self.down_proj.weight.split(slice, dim=1)
gate_proj = torch.cat(
[F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
)
up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
down_proj = [
F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
]
down_proj = sum(down_proj)
else:
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return down_proj
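The restored `pretraining_tp` branch above splits each projection into column shards and concatenates the partial outputs. A small sketch showing why that is safe (random weights; the tolerance accounts for fp accumulation order):

```python
import torch
import torch.nn.functional as F

tp = 2
x = torch.randn(1, 5, 16)
weight = torch.randn(32, 16)  # [out_features, in_features]

full = F.linear(x, weight)
shards = weight.split(weight.shape[0] // tp, dim=0)           # column-parallel split
sharded = torch.cat([F.linear(x, w) for w in shards], dim=-1)
assert torch.allclose(full, sharded, atol=1e-5)
```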
@ -301,14 +320,31 @@ class LlamaAttention(nn.Module):
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
if self.config.pretraining_tp > 1:
key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
query_slices = self.q_proj.weight.split(
(self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
)
key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
# use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
query_states = torch.cat(query_states, dim=-1)
key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
key_states = torch.cat(key_states, dim=-1)
value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
value_states = torch.cat(value_states, dim=-1)
else:
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@ -350,7 +386,12 @@ class LlamaAttention(nn.Module):
attn_output = attn_output.reshape(bsz, q_len, -1)
attn_output = self.o_proj(attn_output)
if self.config.pretraining_tp > 1:
attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
else:
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
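The `o_proj` branch above is the complementary row-parallel case: the input features and the weight's input dimension are split in lockstep, and the partial products are summed. A sketch under the same assumptions as the previous one:

```python
import torch
import torch.nn.functional as F

tp = 2
x = torch.randn(1, 5, 32)
weight = torch.randn(16, 32)  # [out_features, in_features]

full = F.linear(x, weight)
w_slices = weight.split(weight.shape[1] // tp, dim=1)  # split input features
x_slices = x.split(x.shape[-1] // tp, dim=-1)
rowwise = sum(F.linear(xs, ws) for xs, ws in zip(x_slices, w_slices))
assert torch.allclose(full, rowwise, atol=1e-5)
```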
@ -523,10 +564,9 @@ class LlamaSdpaAttention(LlamaAttention):
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@ -810,10 +850,7 @@ class LlamaModel(LlamaPreTrainedModel):
)
self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = LlamaRotaryEmbedding(config=config)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@ -893,7 +930,7 @@ class LlamaModel(LlamaPreTrainedModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -1076,7 +1113,6 @@ class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
_tp_plan = {"lm_head": "colwise_rep"}
def __init__(self, config):
super().__init__(config)
@ -1175,8 +1211,13 @@ class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin):
)
hidden_states = outputs[0]
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
if self.config.pretraining_tp > 1:
lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
logits = torch.cat(logits, dim=-1)
else:
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
loss = None
if labels is not None:

View File

@ -485,7 +485,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
"Expanding inputs for image tokens in LLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
# prefill stage vs decoding stage (legacy behavior copied)
if input_ids.shape[1] != 1:

View File

@ -58,19 +58,10 @@ class LlavaProcessor(ProcessorMixin):
in a chat into a tokenizable string.
image_token (`str`, *optional*, defaults to `"<image>"`):
Special token used to denote image location.
num_additional_image_tokens (`int`, *optional*, defaults to 0):
Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
extra tokens appended, no need to set this arg.
"""
attributes = ["image_processor", "tokenizer"]
valid_kwargs = [
"chat_template",
"patch_size",
"vision_feature_select_strategy",
"image_token",
"num_additional_image_tokens",
]
valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
@ -82,11 +73,9 @@ class LlavaProcessor(ProcessorMixin):
vision_feature_select_strategy=None,
chat_template=None,
image_token="<image>", # set the default and let users change if they have peculiar special tokens in rare cases
num_additional_image_tokens=0,
**kwargs,
):
self.patch_size = patch_size
self.num_additional_image_tokens = num_additional_image_tokens
self.vision_feature_select_strategy = vision_feature_select_strategy
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
super().__init__(image_processor, tokenizer, chat_template=chat_template)
@ -158,11 +147,9 @@ class LlavaProcessor(ProcessorMixin):
# Replace the image token with the expanded image token sequence
pixel_values = image_inputs["pixel_values"]
height, width = get_image_size(to_numpy_array(pixel_values[0]))
num_image_tokens = (height // self.patch_size) * (
width // self.patch_size
) + self.num_additional_image_tokens
num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1
if self.vision_feature_select_strategy == "default":
num_image_tokens -= self.num_additional_image_tokens
num_image_tokens -= 1
prompt_strings = []
for sample in text:
@ -173,7 +160,7 @@ class LlavaProcessor(ProcessorMixin):
"Expanding inputs for image tokens in LLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
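A worked example of the token count this hunk reverts to (image size and patch size are hypothetical): the `+ 1` is the CLS token, removed again under the `"default"` feature-selection strategy.

```python
height = width = 336
patch_size = 14
num_image_tokens = (height // patch_size) * (width // patch_size) + 1  # 24*24 + 1 = 577
vision_feature_select_strategy = "default"
if vision_feature_select_strategy == "default":
    num_image_tokens -= 1  # CLS dropped -> 576 tokens per image
```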

View File

@ -868,7 +868,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
"Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
if input_ids.shape[1] != 1:
inputs_embeds = inputs_embeds.to(image_features.dtype)

View File

@ -61,19 +61,10 @@ class LlavaNextProcessor(ProcessorMixin):
in a chat into a tokenizable string.
image_token (`str`, *optional*, defaults to `"<image>"`):
Special token used to denote image location.
num_additional_image_tokens (`int`, *optional*, defaults to 0):
Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
extra tokens appended, no need to set this arg.
"""
attributes = ["image_processor", "tokenizer"]
valid_kwargs = [
"chat_template",
"patch_size",
"vision_feature_select_strategy",
"image_token",
"num_additional_image_tokens",
]
valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
@ -85,11 +76,9 @@ class LlavaNextProcessor(ProcessorMixin):
vision_feature_select_strategy=None,
chat_template=None,
image_token="<image>", # set the default and let users change if they have peculiar special tokens in rare cases
num_additional_image_tokens=0,
**kwargs,
):
self.patch_size = patch_size
self.num_additional_image_tokens = num_additional_image_tokens
self.vision_feature_select_strategy = vision_feature_select_strategy
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
super().__init__(image_processor, tokenizer, chat_template=chat_template)
@ -154,7 +143,7 @@ class LlavaNextProcessor(ProcessorMixin):
"Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
else:
image_sizes = iter(image_inputs["image_sizes"])
@ -163,12 +152,10 @@ class LlavaNextProcessor(ProcessorMixin):
for sample in text:
while self.image_token in sample:
image_size = next(image_sizes)
if not isinstance(image_size, (list, tuple)):
# cast to list to avoid numerical precision errors when calculating unpadding
orig_height, orig_width = image_size.tolist()
orig_height, orig_width = image_size
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
if self.vision_feature_select_strategy == "default":
num_image_tokens -= self.num_additional_image_tokens
num_image_tokens -= 1
sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
prompt_strings.append(sample)
prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
@ -191,7 +178,7 @@ class LlavaNextProcessor(ProcessorMixin):
orig_height, orig_width, patches_height, patches_width, scale_height, scale_width
)
# The base patch covers the entire image (+1 for the CLS)
base_features = patches_height * patches_width + self.num_additional_image_tokens
base_features = patches_height * patches_width + 1
num_image_tokens = unpadded_features + newline_features + base_features
return num_image_tokens

View File

@ -58,22 +58,12 @@ class LlavaNextVideoProcessor(ProcessorMixin):
Special token used to denote video location.
image_token (`str`, *optional*, defaults to `"<image>"`):
Special token used to denote image location.
num_additional_image_tokens (`int`, *optional*, defaults to 0):
Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
extra tokens appended, no need to set this arg.
"""
# video and image processor share same args, but have different processing logic
# only image processor config is saved in the hub
attributes = ["video_processor", "image_processor", "tokenizer"]
valid_kwargs = [
"chat_template",
"patch_size",
"vision_feature_select_strategy",
"image_token",
"video_token",
"num_additional_image_tokens",
]
valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token", "video_token"]
image_processor_class = "LlavaNextImageProcessor"
video_processor_class = "LlavaNextVideoImageProcessor"
tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
@ -88,11 +78,9 @@ class LlavaNextVideoProcessor(ProcessorMixin):
vision_feature_select_strategy=None,
video_token="<video>",
image_token="<image>",
num_additional_image_tokens=0,
**kwargs,
):
self.patch_size = patch_size
self.num_additional_image_tokens = num_additional_image_tokens
self.vision_feature_select_strategy = vision_feature_select_strategy
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
@ -176,9 +164,8 @@ class LlavaNextVideoProcessor(ProcessorMixin):
if self.patch_size is None or self.vision_feature_select_strategy is None:
logger.warning_once(
"Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. "
"Please add `patch_size`, `num_additional_image_tokens` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` "
"and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
else:
@ -190,12 +177,10 @@ class LlavaNextVideoProcessor(ProcessorMixin):
for sample in text:
while self.image_token in sample:
image_size = next(image_sizes)
if not isinstance(image_size, (list, tuple)):
# cast to list to avoid numerical precision errors when calculating unpadding
orig_height, orig_width = image_size.tolist()
orig_height, orig_width = image_size
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
if self.vision_feature_select_strategy == "default":
num_image_tokens -= self.num_additional_image_tokens
num_image_tokens -= 1
sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
prompt_strings.append(sample)
text = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
@ -205,8 +190,6 @@ class LlavaNextVideoProcessor(ProcessorMixin):
one_video = to_numpy_array(videos_inputs.get("pixel_values_videos")[0])
height, width = get_image_size(one_video[0])
num_frames = one_video.shape[0] # frame dim is always after batch dim
# no `self.num_additional_image_tokens` added because video always has a default feature selection strategy
num_image_tokens = (height // self.patch_size) * (width // self.patch_size)
num_video_tokens = num_image_tokens // 4 * num_frames # divide by 4 needed for avg pooling layer
prompt_strings = []
@ -239,7 +222,7 @@ class LlavaNextVideoProcessor(ProcessorMixin):
orig_height, orig_width, patches_height, patches_width, scale_height, scale_width
)
# The base patch covers the entire image (+1 for the CLS)
base_features = patches_height * patches_width + self.num_additional_image_tokens
base_features = patches_height * patches_width + 1
num_image_tokens = unpadded_features + newline_features + base_features
return num_image_tokens
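A worked example of the video-token arithmetic above (frame size, patch size, and frame count are hypothetical): per-frame patch tokens are divided by 4 for the average-pooling layer, then multiplied by the number of frames.

```python
height = width = 336
patch_size = 14
num_frames = 8
num_image_tokens = (height // patch_size) * (width // patch_size)  # 24*24 = 576
num_video_tokens = num_image_tokens // 4 * num_frames              # 144 * 8 = 1152
```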

View File

@ -188,10 +188,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
for sample in text:
while special_token in sample:
image_size_list = next(image_sizes)
original_size = image_size_list[0] if num_frames != 1 else image_size_list
if not isinstance(original_size, (list, tuple)):
# cast to list to avoid numerical precision errors when calculating unpadding
orig_height, orig_width = original_size.tolist()
orig_height, orig_width = image_size_list[0] if num_frames != 1 else image_size_list
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
if self.vision_feature_select_strategy == "default":
num_image_tokens -= 1

View File

@ -1034,8 +1034,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
) -> List[Dict]:
"""
Converts the output of [`Mask2FormerForUniversalSegmentationOutput`] into instance segmentation predictions.
Only supports PyTorch. If instances could overlap, set either return_coco_annotation or return_binary_maps
to `True` to get the correct segmentation result.
Only supports PyTorch.
Args:
outputs ([`Mask2FormerForUniversalSegmentation`]):
@ -1057,10 +1056,9 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
(one per detected instance).
Returns:
`List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
- **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id`, or
- **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or
`List[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to
`True`, or a tensor of shape `(num_instances, height, width)` if return_binary_maps is set to `True`.
Set to `None` if no mask is found above `threshold`.
`True`. Set to `None` if no mask is found above `threshold`.
- **segments_info** -- A dictionary that contains additional information on each segment.
- **id** -- An integer representing the `segment_id`.
- **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.

View File

@ -926,7 +926,7 @@ class Mask2FormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
encoder_attention_mask=None,
position_embeddings: Optional[torch.Tensor] = None,
reference_points=None,
spatial_shapes_list=None,
spatial_shapes=None,
level_start_index=None,
output_attentions: bool = False,
):
@ -936,8 +936,7 @@ class Mask2FormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
batch_size, num_queries, _ = hidden_states.shape
batch_size, sequence_length, _ = encoder_hidden_states.shape
total_elements = sum(height * width for height, width in spatial_shapes_list)
if total_elements != sequence_length:
if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
raise ValueError(
"Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
)
@ -958,11 +957,7 @@ class Mask2FormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
)
# batch_size, num_queries, n_heads, n_levels, n_points, 2
if reference_points.shape[-1] == 2:
offset_normalizer = torch.tensor(
[[shape[1], shape[0]] for shape in spatial_shapes_list],
dtype=torch.long,
device=reference_points.device,
)
offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
sampling_locations = (
reference_points[:, :, None, :, None, :]
+ sampling_offsets / offset_normalizer[None, None, None, :, None, :]
@ -975,7 +970,7 @@ class Mask2FormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
else:
raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
output = multi_scale_deformable_attention(value, spatial_shapes_list, sampling_locations, attention_weights)
output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
output = self.output_proj(output)
return output, attention_weights
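The offset normalizer above converts per-level pixel offsets into the normalized coordinate space that `reference_points` use. A tiny sketch with made-up feature-map sizes: stacking `(width, height)` per level gives the divisor for each sampling offset.

```python
import torch

spatial_shapes = torch.tensor([[64, 96], [32, 48]])  # (height, width) per level
offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
print(offset_normalizer)  # tensor([[96, 64], [48, 32]]) -> (width, height) per level
```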
@ -1006,7 +1001,7 @@ class Mask2FormerPixelDecoderEncoderLayer(nn.Module):
attention_mask: torch.Tensor,
position_embeddings: torch.Tensor = None,
reference_points=None,
spatial_shapes_list=None,
spatial_shapes=None,
level_start_index=None,
output_attentions: bool = False,
):
@ -1020,8 +1015,8 @@ class Mask2FormerPixelDecoderEncoderLayer(nn.Module):
Position embeddings, to be added to `hidden_states`.
reference_points (`torch.FloatTensor`, *optional*):
Reference points.
spatial_shapes_list (`list` of `tuple`):
Spatial shapes of the backbone feature maps as a list of tuples.
spatial_shapes (`torch.LongTensor`, *optional*):
Spatial shapes of the backbone feature maps.
level_start_index (`torch.LongTensor`, *optional*):
Level start index.
output_attentions (`bool`, *optional*):
@ -1038,7 +1033,7 @@ class Mask2FormerPixelDecoderEncoderLayer(nn.Module):
encoder_attention_mask=attention_mask,
position_embeddings=position_embeddings,
reference_points=reference_points,
spatial_shapes_list=spatial_shapes_list,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
output_attentions=output_attentions,
)
@ -1091,13 +1086,13 @@ class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
)
@staticmethod
def get_reference_points(spatial_shapes_list, valid_ratios, device):
def get_reference_points(spatial_shapes, valid_ratios, device):
"""
Get reference points for each feature map. Used in decoder.
Args:
spatial_shapes_list (`list` of `tuple`):
Spatial shapes of the backbone feature maps as a list of tuples.
spatial_shapes (`torch.LongTensor`):
Spatial shapes of each feature map, has shape of `(num_feature_levels, 2)`.
valid_ratios (`torch.FloatTensor`):
Valid ratios of each feature map, has shape of `(batch_size, num_feature_levels, 2)`.
device (`torch.device`):
@ -1106,7 +1101,7 @@ class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
`torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
"""
reference_points_list = []
for lvl, (height, width) in enumerate(spatial_shapes_list):
for lvl, (height, width) in enumerate(spatial_shapes):
ref_y, ref_x = torch.meshgrid(
torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device),
torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device),
@ -1127,7 +1122,7 @@ class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
inputs_embeds=None,
attention_mask=None,
position_embeddings=None,
spatial_shapes_list=None,
spatial_shapes=None,
level_start_index=None,
valid_ratios=None,
output_attentions=None,
@ -1145,8 +1140,8 @@ class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
[What are attention masks?](../glossary#attention-mask)
position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Position embeddings that are added to the queries and keys in each self-attention layer.
spatial_shapes_list (`list` of `tuple`):
Spatial shapes of each feature map as a list of tuples.
spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
Spatial shapes of each feature map.
level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`):
Starting index of each feature map.
valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
@ -1167,7 +1162,7 @@ class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
hidden_states = inputs_embeds
reference_points = self.get_reference_points(spatial_shapes_list, valid_ratios, device=inputs_embeds.device)
reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device)
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
@ -1181,7 +1176,7 @@ class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
attention_mask,
position_embeddings=position_embeddings,
reference_points=reference_points,
spatial_shapes_list=spatial_shapes_list,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
output_attentions=output_attentions,
)
@ -1307,9 +1302,9 @@ class Mask2FormerPixelDecoder(nn.Module):
]
# Prepare encoder inputs (by flattening)
spatial_shapes_list = [(embed.shape[2], embed.shape[3]) for embed in input_embeds]
spatial_shapes = [(embed.shape[2], embed.shape[3]) for embed in input_embeds]
input_embeds_flat = torch.cat([embed.flatten(2).transpose(1, 2) for embed in input_embeds], 1)
spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=input_embeds_flat.device)
spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=input_embeds_flat.device)
masks_flat = torch.cat([mask.flatten(1) for mask in masks], 1)
position_embeddings = [embed.flatten(2).transpose(1, 2) for embed in position_embeddings]
@ -1325,7 +1320,7 @@ class Mask2FormerPixelDecoder(nn.Module):
inputs_embeds=input_embeds_flat,
attention_mask=masks_flat,
position_embeddings=level_pos_embed_flat,
spatial_shapes_list=spatial_shapes_list,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
valid_ratios=valid_ratios,
output_attentions=output_attentions,
@ -1336,23 +1331,18 @@ class Mask2FormerPixelDecoder(nn.Module):
last_hidden_state = encoder_outputs.last_hidden_state
batch_size = last_hidden_state.shape[0]
# We compute level_start_index_list separately from the tensor version level_start_index
# to avoid iterating over a tensor which breaks torch.compile/export.
level_start_index_list = [0]
for height, width in spatial_shapes_list[:-1]:
level_start_index_list.append(level_start_index_list[-1] + height * width)
split_sizes = [None] * self.num_feature_levels
for i in range(self.num_feature_levels):
if i < self.num_feature_levels - 1:
split_sizes[i] = level_start_index_list[i + 1] - level_start_index_list[i]
split_sizes[i] = level_start_index[i + 1] - level_start_index[i]
else:
split_sizes[i] = last_hidden_state.shape[1] - level_start_index_list[i]
split_sizes[i] = last_hidden_state.shape[1] - level_start_index[i]
encoder_output = torch.split(last_hidden_state, split_sizes, dim=1)
encoder_output = torch.split(last_hidden_state, [size.item() for size in split_sizes], dim=1)
# Compute final features
outputs = [
x.transpose(1, 2).view(batch_size, -1, spatial_shapes_list[i][0], spatial_shapes_list[i][1])
x.transpose(1, 2).view(batch_size, -1, spatial_shapes[i][0], spatial_shapes[i][1])
for i, x in enumerate(encoder_output)
]
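The split logic above derives per-level chunk sizes from cumulative height*width offsets. A standalone sketch with made-up shapes, mirroring the list-based variant that avoids iterating over a tensor:

```python
spatial_shapes_list = [(64, 96), (32, 48), (16, 24)]
seq_len = sum(h * w for h, w in spatial_shapes_list)  # 8064

level_start_index_list = [0]
for height, width in spatial_shapes_list[:-1]:
    level_start_index_list.append(level_start_index_list[-1] + height * width)

split_sizes = [b - a for a, b in zip(level_start_index_list, level_start_index_list[1:])]
split_sizes.append(seq_len - level_start_index_list[-1])
print(level_start_index_list, split_sizes)  # [0, 6144, 7680] [6144, 1536, 384]
```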
@ -1886,9 +1876,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module):
else:
level_index = idx % self.num_feature_levels
where = (attention_mask.sum(-1) != attention_mask.shape[-1]).to(attention_mask.dtype)
# Multiply the attention mask instead of indexing to avoid issue in torch.export.
attention_mask = attention_mask * where.unsqueeze(-1)
attention_mask[torch.where(attention_mask.sum(-1) == attention_mask.shape[-1])] = False
layer_outputs = decoder_layer(
hidden_states,
@ -2428,8 +2416,8 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
>>> masks_queries_logits = outputs.masks_queries_logits
>>> # Perform post-processing to get instance segmentation map
>>> pred_instance_map = image_processor.post_process_instance_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
>>> pred_instance_map = image_processor.post_process_semantic_segmentation(
... outputs, target_sizes=[image.size[::-1]]
... )[0]
>>> print(pred_instance_map.shape)
torch.Size([480, 640])
@ -2462,7 +2450,7 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
>>> # Perform post-processing to get semantic segmentation map
>>> pred_semantic_map = image_processor.post_process_semantic_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
... outputs, target_sizes=[image.size[::-1]]
... )[0]
>>> print(pred_semantic_map.shape)
torch.Size([512, 683])
@ -2496,7 +2484,7 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
>>> # Perform post-processing to get panoptic segmentation map
>>> pred_panoptic_map = image_processor.post_process_panoptic_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
... outputs, target_sizes=[image.size[::-1]]
... )[0]["segmentation"]
>>> print(pred_panoptic_map.shape)
torch.Size([338, 676])

View File

@ -1080,8 +1080,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
) -> List[Dict]:
"""
Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into instance segmentation predictions. Only
supports PyTorch. If instances could overlap, set either return_coco_annotation or return_binary_maps
to `True` to get the correct segmentation result.
supports PyTorch.
Args:
outputs ([`MaskFormerForInstanceSegmentation`]):
@ -1103,10 +1102,9 @@ class MaskFormerImageProcessor(BaseImageProcessor):
(one per detected instance).
Returns:
`List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
- **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id`, or
- **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or
`List[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to
`True`, or a tensor of shape `(num_instances, height, width)` if return_binary_maps is set to `True`.
Set to `None` if no mask is found above `threshold`.
`True`. Set to `None` if no mask is found above `threshold`.
- **segments_info** -- A dictionary that contains additional information on each segment.
- **id** -- An integer representing the `segment_id`.
- **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.

View File

@ -1780,7 +1780,7 @@ class MaskFormerForInstanceSegmentation(MaskFormerPreTrainedModel):
>>> # you can pass them to image_processor for postprocessing
>>> predicted_semantic_map = image_processor.post_process_semantic_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
... outputs, target_sizes=[image.size[::-1]]
... )[0]
>>> # we refer to the demo notebooks for visualization (see "Resources" section in the MaskFormer docs)
@ -1810,7 +1810,7 @@ class MaskFormerForInstanceSegmentation(MaskFormerPreTrainedModel):
>>> masks_queries_logits = outputs.masks_queries_logits
>>> # you can pass them to image_processor for postprocessing
>>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[(image.height, image.width)])[0]
>>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
>>> # we refer to the demo notebooks for visualization (see "Resources" section in the MaskFormer docs)
>>> predicted_panoptic_map = result["segmentation"]

View File

@ -21,11 +21,10 @@ import sys
import types
import torch
from huggingface_hub import split_torch_state_dict_into_shards
from packaging import version
from transformers import AutoTokenizer, GPT2Config
from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME
from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME, shard_checkpoint
def add_checkpointing_args(parser):
@ -572,15 +571,7 @@ def convert_checkpoint_from_megatron_to_transformers(args):
# Store the state_dict to file.
max_shard_size = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size
state_dict_split = split_torch_state_dict_into_shards(output_state_dict, max_shard_size=max_shard_size)
shards = {}
index = None
for filename, tensors in state_dict_split.filename_to_tensors.items():
shards[filename] = {tensor: output_state_dict[tensor] for tensor in tensors}
if state_dict_split.is_sharded:
index = {
"metadata": state_dict_split.metadata,
"weight_map": state_dict_split.tensor_to_filename,
}
shards, index = shard_checkpoint(output_state_dict, max_shard_size=max_shard_size)
# Save the model
for shard_file, shard in shards.items():
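For reference, a hedged sketch of the `huggingface_hub` sharding path this hunk reverts away from (tensor names and shard size are made up): `split_torch_state_dict_into_shards` plans the split, and the shard dict plus optional index are rebuilt from its output.

```python
import torch
from huggingface_hub import split_torch_state_dict_into_shards

state_dict = {f"layer{i}.weight": torch.randn(256, 256) for i in range(4)}
split = split_torch_state_dict_into_shards(state_dict, max_shard_size="500KB")

shards = {
    filename: {key: state_dict[key] for key in keys}
    for filename, keys in split.filename_to_tensors.items()
}
index = (
    {"metadata": split.metadata, "weight_map": split.tensor_to_filename}
    if split.is_sharded
    else None
)
```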

View File

@ -980,7 +980,7 @@ class NemotronModel(NemotronPreTrainedModel):
return causal_mask
# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->NEMOTRON,Llama->Nemotron,llama->nemotron
# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->NEMOTRON,Llama->Nemotron,llama->nemotron
class NemotronForCausalLM(NemotronPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

View File

@ -1020,7 +1020,7 @@ class OlmoModel(OlmoPreTrainedModel):
return causal_mask
# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->OLMO,Llama->Olmo
# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->OLMO,Llama->Olmo
class OlmoForCausalLM(OlmoPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

View File

@ -1,27 +0,0 @@
# Copyright 2024 EleutherAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_olmo_1124 import *
from .modeling_olmo_1124 import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -1,166 +0,0 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/olmo_1124/modular_olmo_1124.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_olmo_1124.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from ...configuration_utils import PretrainedConfig
class Olmo1124Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Olmo1124Model`]. It is used to instantiate an OLMo November 2024
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the [allenai/Olmo1124-7B-hf](https://huggingface.co/allenai/Olmo1124-7B-hf).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50304):
Vocabulary size of the Olmo1124 model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`Olmo1124Model`].
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by mean-pooling all the original heads within that group. For more details, check out [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 1):
Padding token id.
bos_token_id (`int`, *optional*):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 50279):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
experimental feature, subject to breaking API changes in future versions.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the rms normalization layers.
```python
>>> from transformers import Olmo1124Model, Olmo1124Config
>>> # Initializing a Olmo November 2024 7B style configuration
>>> configuration = Olmo1124Config()
>>> # Initializing a model from the Olmo November 2024 7B style configuration
>>> model = Olmo1124Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "olmo_1124"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=50304,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
use_cache=True,
pad_token_id=1,
bos_token_id=None,
eos_token_id=50279,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
rms_norm_eps=1e-5,
**kwargs,
):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self._rope_scaling_validation()
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.rms_norm_eps = rms_norm_eps
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
__all__ = ["Olmo1124Config"]

View File

@ -1,304 +0,0 @@
# Copyright 2024 EleutherAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import gc
import json
import os
import shutil
from pathlib import Path
from typing import Any, Dict
import torch
import yaml
from tokenizers import Tokenizer
from transformers import Olmo1124Config, Olmo1124ForCausalLM
from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
"""
Sample usage:
```
python src/transformers/models/olmo_1124/convert_olmo_1124_weights_to_hf.py \
--input_dir /path/to/downloaded/olmo_1124/weights --model_size 7B --output_dir /output/path
```
Thereafter, models can be loaded via:
```py
from transformers import Olmo1124ForCausalLM, AutoTokenizer
model = Olmo1124ForCausalLM.from_pretrained("/output/path")
tokenizer = AutoTokenizer.from_pretrained("/output/path")
```
Important note: you need to be able to host the whole model in RAM to execute this script (even though the biggest
versions come in several checkpoints, each checkpoint contains a part of every weight of the model, so all of them need to be loaded in RAM).
"""
def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)
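# Worked example with the defaults and a hypothetical n=4096:
# int(1 * int(8 * 4096 / 3)) = 10922, rounded up to the next multiple of 256 -> 11008,
# which matches the 7B `intermediate_size` default.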
def read_json(path):
with open(path, "r") as f:
return json.load(f)
def write_json(text, path):
with open(path, "w") as f:
json.dump(text, f)
def write_model(
model_path,
input_base_path,
include_tokenizer=True,
tokenizer_path=None,
safe_serialization=True,
fix_eos_token_id=True,
tmp_cleanup=True,
):
os.makedirs(model_path, exist_ok=True)
tmp_model_path = os.path.join(model_path, "tmp")
os.makedirs(tmp_model_path, exist_ok=True)
config_path = Path(input_base_path) / "config.yaml"
olmo_1124_config = yaml.safe_load(config_path.read_text())["model"]
if not olmo_1124_config.get("attention_layer_norm", False):
raise RuntimeError("OLMo November 2024 checkpoints must have attention layer norm")
if not olmo_1124_config.get("norm_after", False):
raise RuntimeError("OLMo November 2024 checkpoints must set norm_after to True")
n_layers = olmo_1124_config["n_layers"]
n_heads = olmo_1124_config["n_heads"]
dim = olmo_1124_config["d_model"]
dims_per_head = dim // n_heads
base = olmo_1124_config["rope_theta"]
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
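# Standard RoPE inverse-frequency schedule: inv_freq[j] = base ** (-2j / head_dim) for each rotary pair j.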
max_position_embeddings = olmo_1124_config["max_sequence_length"]
vocab_size = olmo_1124_config.get("embedding_size", olmo_1124_config["vocab_size"])
if olmo_1124_config.get("n_kv_heads", None) is not None:
num_key_value_heads = olmo_1124_config["n_kv_heads"] # for GQA / MQA
elif olmo_1124_config["multi_query_attention"]: # compatibility with other checkpoints
num_key_value_heads = 1
else:
num_key_value_heads = n_heads
print(f"Fetching all parameters from the checkpoint at {input_base_path}.")
# Not sharded
# (The sharded implementation would also work, but this is simpler.)
loaded = torch.load(os.path.join(input_base_path, "model.pt"), map_location="cpu")
param_count = 0
index_dict: Dict[str, Any] = {"weight_map": {}}
for layer_i in range(n_layers):
filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
# Unsharded
# TODO: Layernorm stuff
# TODO: multi query attention
fused_dims = [dim, dims_per_head * num_key_value_heads, dims_per_head * num_key_value_heads]
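# The fused attention projection packs [Q | K | V] along dim 0: Q spans all `n_heads`,
# while K and V each span `num_key_value_heads` heads.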
q_proj_weight, k_proj_weight, v_proj_weight = torch.split(
loaded[f"transformer.blocks.{layer_i}.att_proj.weight"], fused_dims, dim=0
)
up_proj_weight, gate_proj_weight = torch.chunk(
loaded[f"transformer.blocks.{layer_i}.ff_proj.weight"], 2, dim=0
)
state_dict = {
f"model.layers.{layer_i}.self_attn.q_proj.weight": q_proj_weight,
f"model.layers.{layer_i}.self_attn.k_proj.weight": k_proj_weight,
f"model.layers.{layer_i}.self_attn.v_proj.weight": v_proj_weight,
f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"transformer.blocks.{layer_i}.attn_out.weight"],
f"model.layers.{layer_i}.self_attn.q_norm.weight": loaded[f"transformer.blocks.{layer_i}.q_norm.weight"],
f"model.layers.{layer_i}.self_attn.k_norm.weight": loaded[f"transformer.blocks.{layer_i}.k_norm.weight"],
f"model.layers.{layer_i}.mlp.gate_proj.weight": gate_proj_weight,
f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"transformer.blocks.{layer_i}.ff_out.weight"],
f"model.layers.{layer_i}.mlp.up_proj.weight": up_proj_weight,
f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[
f"transformer.blocks.{layer_i}.attn_norm.weight"
],
f"model.layers.{layer_i}.post_feedforward_layernorm.weight": loaded[
f"transformer.blocks.{layer_i}.ff_norm.weight"
],
}
state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
for k, v in state_dict.items():
index_dict["weight_map"][k] = filename
param_count += v.numel()
torch.save(state_dict, os.path.join(tmp_model_path, filename))
filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin"
# Unsharded
# TODO: Deal with weight-tying
state_dict = {
"model.embed_tokens.weight": loaded["transformer.wte.weight"],
"model.norm.weight": loaded["transformer.ln_f.weight"],
"lm_head.weight": loaded["transformer.ff_out.weight"]
if "transformer.ff_out.weight" in loaded
else loaded["transformer.wte.weight"],
}
for k, v in state_dict.items():
index_dict["weight_map"][k] = filename
param_count += v.numel()
torch.save(state_dict, os.path.join(tmp_model_path, filename))
# Write configs
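# NOTE: total_size assumes 2 bytes per parameter, i.e. 16-bit weights.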
index_dict["metadata"] = {"total_size": param_count * 2}
write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
if olmo_1124_config.get("mlp_hidden_size", None) is not None:
intermediate_size = olmo_1124_config["mlp_hidden_size"] // 2
else:
intermediate_size = (dim * olmo_1124_config["mlp_ratio"]) // 2
if fix_eos_token_id and olmo_1124_config["eos_token_id"] == 0:
# Fixing a bug in OLMo where eos token id was incorrectly set
print("Changing eos_token_id from 0 to 50279.")
olmo_1124_config["eos_token_id"] = 50279
config = Olmo1124Config(
vocab_size=vocab_size,
hidden_size=dim,
intermediate_size=intermediate_size,
num_hidden_layers=n_layers,
num_attention_heads=n_heads,
num_key_value_heads=num_key_value_heads,
max_position_embeddings=max_position_embeddings,
pad_token_id=olmo_1124_config["pad_token_id"],
bos_token_id=None,
eos_token_id=olmo_1124_config["eos_token_id"],
tie_word_embeddings=olmo_1124_config["weight_tying"],
rms_norm_eps=olmo_1124_config["layer_norm_eps"],
rope_theta=base,
)
config.save_pretrained(tmp_model_path)
# Make space so we can load the model properly now.
del state_dict
del loaded
gc.collect()
if include_tokenizer:
_write_tokenizer(model_path, config, input_base_path, tokenizer_path)
print("Loading the checkpoint in a OLMo November 2024 model.")
model = Olmo1124ForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True)
# Avoid saving this as part of the config.
del model.config._name_or_path
print("Saving in the Transformers format.")
model.save_pretrained(model_path, safe_serialization=safe_serialization)
if tmp_cleanup:
# Make cleanup optional; attempting to `rmtree` the `tmp_model_path` causes
# errors if using NFS.
shutil.rmtree(tmp_model_path)
def _write_tokenizer(
output_path: Path,
config: Olmo1124Config,
checkpoint_dir: str,
input_tokenizer_path: Path | None,
) -> None:
print(f"Saving a {GPT2TokenizerFast.__name__} to {output_path}.")
if input_tokenizer_path is not None:
base_tokenizer = Tokenizer.from_file(str(input_tokenizer_path))
else:
config_path = Path(checkpoint_dir) / "config.yaml"
tokenizer_config = yaml.safe_load(config_path.read_text())["tokenizer"]
# Initialize tokenizer and validate vocab size.
if Path(tokenizer_config["identifier"]).is_file():
base_tokenizer = Tokenizer.from_file(tokenizer_config["identifier"])
else:
base_tokenizer = Tokenizer.from_pretrained(tokenizer_config["identifier"])
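# Fall back to the last vocab id for EOS when unset, and reuse EOS as the pad token.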
eos_token_id = config.eos_token_id if config.eos_token_id is not None else base_tokenizer.get_vocab_size() - 1
pad_token_id = config.pad_token_id if config.pad_token_id is not None else eos_token_id
tokenizer = GPT2TokenizerFast(
tokenizer_object=base_tokenizer,
eos_token=base_tokenizer.decode([eos_token_id], skip_special_tokens=False),
pad_token=base_tokenizer.decode([pad_token_id], skip_special_tokens=False),
)
tokenizer.save_pretrained(output_path)
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir",
required=True,
help="Location of OLMo November 2024 weights, which contains config.yaml and model.pt.",
)
parser.add_argument(
"--no_tokenizer",
action="store_false",
dest="include_tokenizer",
help="If set, do not convert OLMo tokenizer to HF tokenizer.",
)
parser.add_argument(
"--tokenizer_json_path",
type=Path,
default=None,
help="Location of OLMo November 2024 tokenizer json file. Defaults to what is set in the config file.",
)
parser.add_argument(
"--output_dir",
required=True,
help="Location to write HF model and tokenizer",
)
parser.add_argument(
"--no_fix_eos_token_id",
action="store_false",
dest="fix_eos_token_id",
help="If set, does not change eos token id from 0 to 50279 if it is 0. Changing 0 to 50279 is a bug fix, so use this option with care.",
)
parser.add_argument(
"--no_tmp_cleanup",
action="store_false",
dest="tmp_cleanup",
help="If passed, don't remove temp dir at end of HF conversion.",
)
parser.add_argument(
"--no_safe_serialization",
action="store_false",
dest="safe_serialization",
help="Whether or not to save using `safetensors`.",
)
args = parser.parse_args()
write_model(
model_path=args.output_dir,
input_base_path=args.input_dir,
safe_serialization=args.safe_serialization,
include_tokenizer=args.include_tokenizer,
tokenizer_path=args.tokenizer_json_path,
fix_eos_token_id=args.fix_eos_token_id,
tmp_cleanup=args.tmp_cleanup,
)
if __name__ == "__main__":
main()

File diff suppressed because it is too large

View File

@ -1,489 +0,0 @@
import math
from typing import Optional, Tuple
import torch
from torch import nn
from ...cache_utils import Cache
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging
from ..llama.modeling_llama import LlamaRMSNorm
from ..olmo.configuration_olmo import OlmoConfig
from ..olmo.modeling_olmo import (
OlmoAttention,
OlmoDecoderLayer,
OlmoFlashAttention2,
OlmoForCausalLM,
OlmoModel,
OlmoPreTrainedModel,
OlmoSdpaAttention,
apply_rotary_pos_emb,
repeat_kv,
)
if is_flash_attn_2_available():
from ...modeling_flash_attention_utils import _flash_attention_forward
logger = logging.get_logger(__name__)
class Olmo1124Config(OlmoConfig):
r"""
This is the configuration class to store the configuration of an [`Olmo1124Model`]. It is used to instantiate an OLMo November 2024
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the [allenai/Olmo1124-7B-hf](https://huggingface.co/allenai/Olmo1124-7B-hf).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50304):
Vocabulary size of the Olmo1124 model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`Olmo1124Model`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by mean-pooling all the original heads within that group (see the sketch after this file). For more details,
check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If not specified, it will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/value states (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 1):
Padding token id.
bos_token_id (`int`, *optional*):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 50279):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie the input and output word embeddings.
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
experimental feature, subject to breaking API changes in future versions.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the rms normalization layers.
```python
>>> from transformers import Olmo1124Model, Olmo1124Config
>>> # Initializing an Olmo November 2024 7B style configuration
>>> configuration = Olmo1124Config()
>>> # Initializing a model from the Olmo November 2024 7B style configuration
>>> model = Olmo1124Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "olmo_1124"
def __init__(
self,
vocab_size=50304,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
use_cache=True,
pad_token_id=1,
bos_token_id=None,
eos_token_id=50279,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
rms_norm_eps=1e-5,
**kwargs,
):
super().__init__(
vocab_size=vocab_size,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
hidden_act=hidden_act,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
use_cache=use_cache,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
**kwargs,
)
self.rms_norm_eps = rms_norm_eps
del self.clip_qkv
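# Unlike OLMo, the November 2024 model does not clip QKV activations, so the inherited
# `clip_qkv` attribute is dropped from the config.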
class Olmo1124RMSNorm(LlamaRMSNorm):
pass
ALL_LAYERNORM_LAYERS.append(Olmo1124RMSNorm)
# Olmo1124 attention is identical to OLMo attention except:
# - Norm is applied to attention queries and keys.
# - No qkv clipping.
class Olmo1124Attention(OlmoAttention):
def __init__(self, config: Olmo1124Config, layer_idx: Optional[int] = None):
super().__init__(config, layer_idx=layer_idx)
self.q_norm = Olmo1124RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps)
self.k_norm = Olmo1124RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_norm(self.q_proj(hidden_states))
key_states = self.k_norm(self.k_proj(hidden_states))
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
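# GQA: tile each key/value head across its query group so shapes match the query heads for the matmul.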
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attention_mask is not None: # no matter the length, we just slice it
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class Olmo1124FlashAttention2(OlmoFlashAttention2, Olmo1124Attention):
"""
OLMo November 2024 flash attention module. This module inherits from `Olmo1124Attention` as the weights of the module stay
untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
Olmo1124Attention.__init__(self, *args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which became the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
output_attentions = False
bsz, q_len, _ = hidden_states.size()
query_states = self.q_norm(self.q_proj(hidden_states))
key_states = self.k_norm(self.k_proj(hidden_states))
value_states = self.v_proj(hidden_states)
# Flash attention requires the input to have the shape
# batch_size x seq_length x head_dim x hidden_dim
# therefore we just need to keep the original shape
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
# TODO: These transposes are quite inefficient, but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
# to be able to avoid many of these transpose/reshape/view calls.
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
dropout_rate = self.attention_dropout if self.training else 0.0
# In PEFT, the layer norms are usually cast to float32 for training stability,
# so the input hidden states may get silently cast to float32. Hence, we
# cast them back to the correct dtype just to be sure everything works as expected.
# This might slow down training & inference, so it is recommended not to cast the LayerNorms
# to fp32. (OlmoRMSNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
q_len,
position_ids=position_ids,
dropout=dropout_rate,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
is_causal=self.is_causal,
)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class Olmo1124SdpaAttention(OlmoSdpaAttention, Olmo1124Attention):
# Adapted from Olmo1124Attention.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"Olmo1124Model is using Olmo1124SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
)
bsz, q_len, _ = hidden_states.size()
query_states = self.q_norm(self.q_proj(hidden_states))
key_states = self.k_norm(self.k_proj(hidden_states))
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
causal_mask = attention_mask
if attention_mask is not None:
causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if query_states.device.type == "cuda" and causal_mask is not None:
query_states = query_states.contiguous()
key_states = key_states.contiguous()
value_states = value_states.contiguous()
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
is_causal = True if causal_mask is None and q_len > 1 else False
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=causal_mask,
dropout_p=self.attention_dropout if self.training else 0.0,
is_causal=is_causal,
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.view(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
return attn_output, None, past_key_value
# The OLMo November 2024 layers are identical to those of the OLMo model except:
# - RMSNorm is used instead of standard layer norm.
# - Norm is applied after attention/feedforward rather than before.
class Olmo1124DecoderLayer(OlmoDecoderLayer):
def __init__(self, config: Olmo1124Config, layer_idx: int):
super().__init__(config, layer_idx=layer_idx)
self.post_attention_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
del self.input_layernorm
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states
# Self Attention
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
**kwargs,
)
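# Post-norm ordering: normalize the attention output *before* the residual add.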
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.mlp(hidden_states)
hidden_states = self.post_feedforward_layernorm(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
class Olmo1124PreTrainedModel(OlmoPreTrainedModel):
pass
# The OLMo November 2024 model is identical to the OLMo model, except RMSNorm is used instead of
# standard layer norm for the output norm.
class Olmo1124Model(OlmoModel):
def __init__(self, config: Olmo1124Config):
super().__init__(config)
self.layers = nn.ModuleList(
[Olmo1124DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
# The heads now only need to redefine the inner model to the correct `Olmo1124Model`
class Olmo1124ForCausalLM(OlmoForCausalLM):
def __init__(self, config: Olmo1124Config):
super().__init__(config)
self.model = Olmo1124Model(config)
__all__ = [
"Olmo1124Config",
"Olmo1124ForCausalLM",
"Olmo1124Model",
"Olmo1124PreTrainedModel",
]
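The `num_key_value_heads` docstring above describes converting a multi-head checkpoint to GQA by mean-pooling each group of key/value heads. Below is a minimal sketch of that operation under the `[num_heads * head_dim, hidden_size]` weight layout used in the conversion script above; the sizes and tensor are hypothetical, not taken from any shipped checkpoint:

```python
import torch


def meanpool_kv_heads(weight: torch.Tensor, num_heads: int, num_kv_heads: int) -> torch.Tensor:
    """Mean-pool groups of key/value heads to turn an MHA projection into a GQA one.

    `weight` has shape [num_heads * head_dim, hidden_size]; the result has shape
    [num_kv_heads * head_dim, hidden_size].
    """
    out_dim, hidden_size = weight.shape
    head_dim = out_dim // num_heads
    group_size = num_heads // num_kv_heads
    # Group the heads, then average each group into a single key/value head.
    grouped = weight.view(num_kv_heads, group_size, head_dim, hidden_size)
    return grouped.mean(dim=1).reshape(num_kv_heads * head_dim, hidden_size)


# Hypothetical sizes: pool 32 heads of dim 128 down to 8 KV heads.
k_proj = torch.randn(32 * 128, 4096)
print(meanpool_kv_heads(k_proj, num_heads=32, num_kv_heads=8).shape)  # torch.Size([1024, 4096])
```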

View File

@ -888,7 +888,7 @@ OLMOE_INPUTS_DOCSTRING = r"""
"The bare Olmoe Model outputting raw hidden-states without any specific head on top.",
OLMOE_START_DOCSTRING,
)
# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Olmoe
# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Olmoe
class OlmoeModel(OlmoePreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OlmoeDecoderLayer`]
@ -995,7 +995,7 @@ class OlmoeModel(OlmoePreTrainedModel):
all_router_logits = () if output_router_logits else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)

View File

@ -3161,7 +3161,7 @@ class OneFormerForUniversalSegmentation(OneFormerPreTrainedModel):
>>> # you can pass them to processor for semantic postprocessing
>>> predicted_semantic_map = processor.post_process_semantic_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
... outputs, target_sizes=[image.size[::-1]]
... )[0]
>>> f"👉 Semantic Predictions Shape: {list(predicted_semantic_map.shape)}"
'👉 Semantic Predictions Shape: [512, 683]'
@ -3178,7 +3178,7 @@ class OneFormerForUniversalSegmentation(OneFormerPreTrainedModel):
>>> # you can pass them to processor for instance postprocessing
>>> predicted_instance_map = processor.post_process_instance_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
... outputs, target_sizes=[image.size[::-1]]
... )[0]["segmentation"]
>>> f"👉 Instance Predictions Shape: {list(predicted_instance_map.shape)}"
'👉 Instance Predictions Shape: [512, 683]'
@ -3195,7 +3195,7 @@ class OneFormerForUniversalSegmentation(OneFormerPreTrainedModel):
>>> # you can pass them to processor for panoptic postprocessing
>>> predicted_panoptic_map = processor.post_process_panoptic_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
... outputs, target_sizes=[image.size[::-1]]
... )[0]["segmentation"]
>>> f"👉 Panoptic Predictions Shape: {list(predicted_panoptic_map.shape)}"
'👉 Panoptic Predictions Shape: [512, 683]'

View File

@ -91,10 +91,6 @@ class Pix2StructTextConfig(PretrainedConfig):
"hidden_size": "hidden_size",
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
"decoder_attention_heads": "num_heads",
"encoder_attention_heads": "num_heads",
"encoder_layers": "num_layers",
"decoder_layers": "num_layers",
}
def __init__(
@ -358,8 +354,6 @@ class Pix2StructConfig(PretrainedConfig):
vision_config = {}
logger.info("vision_config is None. Initializing the Pix2StructVisionConfig with default values.")
text_config["is_encoder_decoder"] = is_encoder_decoder
text_config["tie_word_embeddings"] = tie_word_embeddings
self.text_config = Pix2StructTextConfig(**text_config)
self.vision_config = Pix2StructVisionConfig(**vision_config)

Some files were not shown because too many files have changed in this diff