mirror of
https://github.com/huggingface/transformers.git
synced 2025-10-22 18:34:37 +08:00
Compare commits
37 Commits
remove-scr
...
submodels-
Author | SHA1 | Date | |
---|---|---|---|
94f54f608a | |||
420f2dc5bb | |||
6bdd4ec952 | |||
08bf7f1afe | |||
be10d4df60 | |||
e1e11b0299 | |||
bdf5fb70aa | |||
719058c625 | |||
9f42c1f192 | |||
1636a7bcb9 | |||
71de20b818 | |||
23c89a6732 | |||
4f650040a6 | |||
d3d835d4fc | |||
2e4c045540 | |||
21cb353b7b | |||
f9be71b34d | |||
9eac19eb59 | |||
2ce02b98bf | |||
b6b4d43d6d | |||
0c98f24889 | |||
d29482cc91 | |||
1a96127e46 | |||
84d19be41e | |||
07aab1af1e | |||
74f5e4a1fa | |||
334bf913dc | |||
c184550daf | |||
984ff89e73 | |||
2166b6b4ff | |||
166e823f77 | |||
3d34b92116 | |||
b8059e1f8f | |||
5ee60f970a | |||
8ac2d75353 | |||
9120567b02 | |||
ff95974bc6 |
6
.github/workflows/model_jobs.yml
vendored
6
.github/workflows/model_jobs.yml
vendored
@ -12,8 +12,8 @@ on:
|
||||
slice_id:
|
||||
required: true
|
||||
type: number
|
||||
runner:
|
||||
required: true
|
||||
runner_map:
|
||||
required: false
|
||||
type: string
|
||||
docker:
|
||||
required: true
|
||||
@ -45,7 +45,7 @@ jobs:
|
||||
matrix:
|
||||
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
|
||||
runs-on:
|
||||
group: '${{ inputs.machine_type }}'
|
||||
group: ${{ fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type] }}
|
||||
container:
|
||||
image: ${{ inputs.docker }}
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
|
128
.github/workflows/model_jobs_amd.yml
vendored
128
.github/workflows/model_jobs_amd.yml
vendored
@ -1,128 +0,0 @@
|
||||
name: model jobs
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
folder_slices:
|
||||
required: true
|
||||
type: string
|
||||
machine_type:
|
||||
required: true
|
||||
type: string
|
||||
slice_id:
|
||||
required: true
|
||||
type: number
|
||||
runner:
|
||||
required: true
|
||||
type: string
|
||||
docker:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
RUN_SLOW: yes
|
||||
# For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
|
||||
# This token is created under the bot `hf-transformers-bot`.
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
|
||||
jobs:
|
||||
run_models_gpu:
|
||||
name: " "
|
||||
strategy:
|
||||
max-parallel: 1 # For now, not to parallelize. Can change later if it works well.
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
|
||||
runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
|
||||
container:
|
||||
image: ${{ inputs.docker }}
|
||||
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Echo input and matrix info
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ inputs.folder_slices }}"
|
||||
echo "${{ matrix.folders }}"
|
||||
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
|
||||
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: Update / Install some packages (for Past CI)
|
||||
if: ${{ contains(inputs.docker, '-past-') }}
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pip install -U datasets
|
||||
|
||||
- name: Update / Install some packages (for Past CI)
|
||||
if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
|
||||
- name: ROCM-SMI
|
||||
run: |
|
||||
rocm-smi
|
||||
|
||||
- name: ROCM-INFO
|
||||
run: |
|
||||
rocminfo | grep "Agent" -A 14
|
||||
|
||||
- name: Show ROCR environment
|
||||
run: |
|
||||
echo "ROCR: $ROCR_VISIBLE_DEVICES"
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
|
||||
- name: Run test
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
||||
echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
|
||||
echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
121
.github/workflows/model_jobs_intel_gaudi.yml
vendored
Normal file
121
.github/workflows/model_jobs_intel_gaudi.yml
vendored
Normal file
@ -0,0 +1,121 @@
|
||||
name: model jobs
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
folder_slices:
|
||||
required: true
|
||||
type: string
|
||||
slice_id:
|
||||
required: true
|
||||
type: number
|
||||
runner:
|
||||
required: true
|
||||
type: string
|
||||
machine_type:
|
||||
required: true
|
||||
type: string
|
||||
report_name_prefix:
|
||||
required: false
|
||||
default: run_models_gpu
|
||||
type: string
|
||||
|
||||
env:
|
||||
RUN_SLOW: yes
|
||||
PT_HPU_LAZY_MODE: 0
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
PT_ENABLE_INT64_SUPPORT: 1
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
HF_HOME: /mnt/cache/.cache/huggingface
|
||||
|
||||
jobs:
|
||||
run_models_gpu:
|
||||
name: " "
|
||||
strategy:
|
||||
max-parallel: 8
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
|
||||
runs-on:
|
||||
group: ${{ inputs.runner }}
|
||||
container:
|
||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
options: --runtime=habana
|
||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
||||
--env HABANA_VISIBLE_DEVICES
|
||||
--env HABANA_VISIBLE_MODULES
|
||||
--cap-add=sys_nice
|
||||
--shm-size=64G
|
||||
steps:
|
||||
- name: Echo input and matrix info
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ inputs.folder_slices }}"
|
||||
echo "${{ matrix.folders }}"
|
||||
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
|
||||
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn
|
||||
|
||||
- name: HL-SMI
|
||||
run: |
|
||||
hl-smi
|
||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
||||
|
||||
- name: Environment
|
||||
run: python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ inputs.machine_type }}" = "1gaudi" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ inputs.machine_type }}" = "2gaudi" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ inputs.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run all tests on Gaudi
|
||||
run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
|
||||
- name: Run test
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
|
||||
echo "hello" > reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
|
||||
echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
|
||||
path: reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
|
8
.github/workflows/self-scheduled-caller.yml
vendored
8
.github/workflows/self-scheduled-caller.yml
vendored
@ -22,7 +22,7 @@ on:
|
||||
default: ""
|
||||
|
||||
|
||||
# Used for `push` to easily modiffy the target workflow runs to compare against
|
||||
# Used for `push` to easily modify the target workflow runs to compare against
|
||||
env:
|
||||
prev_workflow_run_id: ""
|
||||
other_workflow_run_id: ""
|
||||
@ -51,7 +51,6 @@ jobs:
|
||||
with:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-models"
|
||||
runner: daily-ci
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
@ -63,7 +62,6 @@ jobs:
|
||||
with:
|
||||
job: run_pipelines_torch_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
|
||||
runner: daily-ci
|
||||
docker: huggingface/transformers-pytorch-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
@ -75,7 +73,6 @@ jobs:
|
||||
with:
|
||||
job: run_examples_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-examples"
|
||||
runner: daily-ci
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
@ -87,7 +84,6 @@ jobs:
|
||||
with:
|
||||
job: run_trainer_and_fsdp_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-training"
|
||||
runner: daily-ci
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
@ -99,7 +95,6 @@ jobs:
|
||||
with:
|
||||
job: run_torch_cuda_extensions_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-training"
|
||||
runner: daily-ci
|
||||
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||
ci_event: Daily CI
|
||||
working-directory-prefix: /workspace
|
||||
@ -112,7 +107,6 @@ jobs:
|
||||
with:
|
||||
job: run_quantization_torch_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-quantization"
|
||||
runner: daily-ci
|
||||
docker: huggingface/transformers-quantization-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
|
345
.github/workflows/self-scheduled-intel-gaudi.yml
vendored
Normal file
345
.github/workflows/self-scheduled-intel-gaudi.yml
vendored
Normal file
@ -0,0 +1,345 @@
|
||||
name: Self-hosted runner (scheduled-intel-gaudi)
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
job:
|
||||
required: true
|
||||
type: string
|
||||
slack_report_channel:
|
||||
required: true
|
||||
type: string
|
||||
runner_scale_set:
|
||||
required: true
|
||||
type: string
|
||||
ci_event:
|
||||
required: true
|
||||
type: string
|
||||
report_repo_id:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
env:
|
||||
NUM_SLICES: 2
|
||||
RUN_SLOW: yes
|
||||
PT_HPU_LAZY_MODE: 0
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
PT_ENABLE_INT64_SUPPORT: 1
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
HF_HOME: /mnt/cache/.cache/huggingface
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
|
||||
name: Setup
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
||||
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
||||
quantization_matrix: ${{ steps.set-matrix.outputs.quantization_matrix }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
|
||||
- id: set-matrix
|
||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
|
||||
name: Identify models to test
|
||||
working-directory: tests
|
||||
run: |
|
||||
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
||||
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- id: set-matrix-quantization
|
||||
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
|
||||
name: Identify quantization method to test
|
||||
working-directory: tests
|
||||
run: |
|
||||
echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
|
||||
|
||||
run_models_gpu:
|
||||
if: ${{ inputs.job == 'run_models_gpu' }}
|
||||
name: " "
|
||||
needs: setup
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi, 2gaudi]
|
||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||
uses: ./.github/workflows/model_jobs_intel_gaudi.yml
|
||||
with:
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
machine_type: ${{ matrix.machine_type }}
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
report_name_prefix: run_models_gpu
|
||||
|
||||
secrets: inherit
|
||||
|
||||
run_trainer_and_fsdp_gpu:
|
||||
if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
|
||||
name: " "
|
||||
needs: setup
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi, 2gaudi]
|
||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||
uses: ./.github/workflows/model_jobs_intel_gaudi.yml
|
||||
with:
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
machine_type: ${{ matrix.machine_type }}
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
report_name_prefix: run_trainer_and_fsdp_gpu
|
||||
|
||||
secrets: inherit
|
||||
|
||||
run_pipelines_gpu:
|
||||
if: ${{ inputs.job == 'run_pipelines_gpu' }}
|
||||
name: Pipelines
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi, 2gaudi]
|
||||
runs-on:
|
||||
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
container:
|
||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
options: --runtime=habana
|
||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
||||
--env HABANA_VISIBLE_DEVICES
|
||||
--env HABANA_VISIBLE_MODULES
|
||||
--cap-add=sys_nice
|
||||
--shm-size=64G
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
|
||||
|
||||
- name: HL-SMI
|
||||
run: |
|
||||
hl-smi
|
||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
||||
|
||||
- name: Environment
|
||||
run: python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run all pipeline tests on Intel Gaudi
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports
|
||||
|
||||
run_examples_gpu:
|
||||
if: ${{ inputs.job == 'run_examples_gpu' }}
|
||||
name: Examples directory
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi]
|
||||
runs-on:
|
||||
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
container:
|
||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
options: --runtime=habana
|
||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
||||
--env HABANA_VISIBLE_DEVICES
|
||||
--env HABANA_VISIBLE_MODULES
|
||||
--cap-add=sys_nice
|
||||
--shm-size=64G
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
|
||||
|
||||
- name: HL-SMI
|
||||
run: |
|
||||
hl-smi
|
||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: |
|
||||
pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run examples tests on Intel Gaudi
|
||||
run: |
|
||||
pip install -r examples/pytorch/_tests_requirements.txt
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cat reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_examples_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
|
||||
|
||||
run_deepspeed_gpu:
|
||||
if: ${{ inputs.job == 'run_deepspeed_gpu' }}
|
||||
name: Intel Gaudi deepspeed tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi, 2gaudi]
|
||||
runs-on:
|
||||
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
container:
|
||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
options: --runtime=habana
|
||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
||||
--env HABANA_VISIBLE_DEVICES
|
||||
--env HABANA_VISIBLE_MODULES
|
||||
--cap-add=sys_nice
|
||||
--shm-size=64G
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
|
||||
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
|
||||
|
||||
- name: HL-SMI
|
||||
run: |
|
||||
hl-smi
|
||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: |
|
||||
pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run all deepspeed tests on intel Gaudi
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports
|
||||
|
||||
send_results:
|
||||
name: Slack Report
|
||||
needs:
|
||||
[
|
||||
setup,
|
||||
run_models_gpu,
|
||||
run_examples_gpu,
|
||||
run_pipelines_gpu,
|
||||
run_deepspeed_gpu,
|
||||
run_trainer_and_fsdp_gpu,
|
||||
]
|
||||
if: ${{ always() }}
|
||||
uses: ./.github/workflows/slack-report.yml
|
||||
with:
|
||||
job: ${{ inputs.job }}
|
||||
setup_status: ${{ needs.setup.result }}
|
||||
slack_report_channel: ${{ inputs.slack_report_channel }}
|
||||
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
report_repo_id: ${{ inputs.report_repo_id }}
|
||||
ci_event: ${{ inputs.ci_event }}
|
||||
|
||||
secrets: inherit
|
67
.github/workflows/self-scheduled-intel-gaudi3-caller.yml
vendored
Normal file
67
.github/workflows/self-scheduled-intel-gaudi3-caller.yml
vendored
Normal file
@ -0,0 +1,67 @@
|
||||
name: Self-hosted runner (Intel Gaudi3 scheduled CI caller)
|
||||
|
||||
on:
|
||||
repository_dispatch:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "17 2 * * *"
|
||||
|
||||
jobs:
|
||||
model-ci:
|
||||
name: Model CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_models_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
|
||||
secrets: inherit
|
||||
|
||||
pipeline-ci:
|
||||
name: Pipeline CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_pipelines_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
|
||||
secrets: inherit
|
||||
|
||||
example-ci:
|
||||
name: Example CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_examples_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
|
||||
secrets: inherit
|
||||
|
||||
deepspeed-ci:
|
||||
name: DeepSpeed CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_deepspeed_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
|
||||
secrets: inherit
|
||||
|
||||
trainer-fsdp-ci:
|
||||
name: Trainer/FSDP CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_trainer_and_fsdp_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
secrets: inherit
|
10
.github/workflows/self-scheduled.yml
vendored
10
.github/workflows/self-scheduled.yml
vendored
@ -15,9 +15,6 @@ on:
|
||||
slack_report_channel:
|
||||
required: true
|
||||
type: string
|
||||
runner:
|
||||
required: true
|
||||
type: string
|
||||
docker:
|
||||
required: true
|
||||
type: string
|
||||
@ -62,6 +59,7 @@ jobs:
|
||||
outputs:
|
||||
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
||||
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
||||
runner_map: ${{ steps.set-matrix.outputs.runner_map }}
|
||||
quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
|
||||
steps:
|
||||
- name: Update clone
|
||||
@ -88,6 +86,7 @@ jobs:
|
||||
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||
echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT
|
||||
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
||||
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
||||
@ -111,14 +110,14 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||
uses: ./.github/workflows/model_jobs.yml
|
||||
with:
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
machine_type: ${{ matrix.machine_type }}
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
runner: ${{ inputs.runner }}
|
||||
runner_map: ${{ needs.setup.outputs.runner_map }}
|
||||
docker: ${{ inputs.docker }}
|
||||
secrets: inherit
|
||||
|
||||
@ -136,7 +135,6 @@ jobs:
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
machine_type: ${{ matrix.machine_type }}
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
runner: ${{ inputs.runner }}
|
||||
docker: ${{ inputs.docker }}
|
||||
report_name_prefix: run_trainer_and_fsdp_gpu
|
||||
secrets: inherit
|
||||
|
@ -3,6 +3,9 @@ LABEL maintainer="Hugging Face"
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
ARG TORCH_VISION='0.21.0'
|
||||
ARG TORCH_AUDIO='2.6.0'
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip python3-dev ffmpeg git-lfs && \
|
||||
apt clean && \
|
||||
@ -20,6 +23,7 @@ WORKDIR /
|
||||
ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
|
||||
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
|
||||
|
||||
RUN python3 -m pip uninstall -y tensorflow flax
|
||||
|
@ -363,6 +363,8 @@
|
||||
- sections:
|
||||
- local: model_doc/albert
|
||||
title: ALBERT
|
||||
- local: model_doc/arcee
|
||||
title: Arcee
|
||||
- local: model_doc/bamba
|
||||
title: Bamba
|
||||
- local: model_doc/bart
|
||||
@ -841,6 +843,8 @@
|
||||
title: GraniteSpeech
|
||||
- local: model_doc/hubert
|
||||
title: Hubert
|
||||
- local: model_doc/stt
|
||||
title: Kyutai Speech-To-Text
|
||||
- local: model_doc/mctct
|
||||
title: MCTCT
|
||||
- local: model_doc/mimi
|
||||
|
@ -468,9 +468,17 @@ def generate(model, input_ids, generation_config=None, left_padding=None, **kwar
|
||||
Follow the recommended practices below to ensure your custom decoding method works as expected.
|
||||
- Feel free to reuse the logic for validation and input preparation in the original [`~GenerationMixin.generate`].
|
||||
- Pin the `transformers` version in the requirements if you use any private method/attribute in `model`.
|
||||
- You can add other files in the `custom_generate` folder, and use relative imports.
|
||||
- Consider adding model validation, input validation, or even a separate test file to help users sanity-check your code in their environment.
|
||||
|
||||
Your custom `generate` method can relative import code from the `custom_generate` folder. For example, if you have a `utils.py` file, you can import it like this:
|
||||
|
||||
```py
|
||||
from .utils import some_function
|
||||
```
|
||||
|
||||
Only relative imports from the same-level `custom_generate` folder are supported. Parent/sibling folder imports are not valid. The `custom_generate` argument also works locally with any directory that contains a `custom_generate` structure. This is the recommended workflow for developing your custom decoding method.
|
||||
|
||||
|
||||
#### requirements.txt
|
||||
|
||||
You can optionally specify additional Python requirements in a `requirements.txt` file inside the `custom_generate` folder. These are checked at runtime and an exception will be thrown if they're missing, nudging users to update their environment accordingly.
|
||||
|
104
docs/source/en/model_doc/arcee.md
Normal file
104
docs/source/en/model_doc/arcee.md
Normal file
@ -0,0 +1,104 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
|
||||
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# Arcee
|
||||
|
||||
Arcee is a decoder-only transformer model based on the Llama architecture with a key modification: it uses ReLU² (ReLU-squared) activation in the MLP blocks instead of SiLU, following recent research showing improved training efficiency with squared activations. This architecture is designed for efficient training and inference while maintaining the proven stability of the Llama design.
|
||||
|
||||
The Arcee model is architecturally similar to Llama but uses `x * relu(x)` in MLP layers for improved gradient flow and is optimized for efficiency in both training and inference scenarios.
|
||||
|
||||
> [!TIP]
|
||||
> The Arcee model supports extended context with RoPE scaling and all standard transformers features including Flash Attention 2, SDPA, gradient checkpointing, and quantization support.
|
||||
|
||||
The example below demonstrates how to generate text with Arcee using [`Pipeline`] or the [`AutoModel`].
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(
|
||||
task="text-generation",
|
||||
model="arcee-ai/AFM-4.5B",
|
||||
torch_dtype=torch.float16,
|
||||
device=0
|
||||
)
|
||||
|
||||
output = pipeline("The key innovation in Arcee is")
|
||||
print(output[0]["generated_text"])
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoTokenizer, ArceeForCausalLM
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("arcee-ai/AFM-4.5B")
|
||||
model = ArceeForCausalLM.from_pretrained(
|
||||
"arcee-ai/AFM-4.5B",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
inputs = tokenizer("The key innovation in Arcee is", return_tensors="pt")
|
||||
with torch.no_grad():
|
||||
outputs = model.generate(**inputs, max_new_tokens=50)
|
||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## ArceeConfig
|
||||
|
||||
[[autodoc]] ArceeConfig
|
||||
|
||||
## ArceeModel
|
||||
|
||||
[[autodoc]] ArceeModel
|
||||
- forward
|
||||
|
||||
## ArceeForCausalLM
|
||||
|
||||
[[autodoc]] ArceeForCausalLM
|
||||
- forward
|
||||
|
||||
## ArceeForSequenceClassification
|
||||
|
||||
[[autodoc]] ArceeForSequenceClassification
|
||||
- forward
|
||||
|
||||
## ArceeForQuestionAnswering
|
||||
|
||||
[[autodoc]] ArceeForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## ArceeForTokenClassification
|
||||
|
||||
[[autodoc]] ArceeForTokenClassification
|
||||
- forward
|
@ -14,35 +14,76 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# BLIP
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Overview
|
||||
# BLIP
|
||||
|
||||
The BLIP model was proposed in [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://huggingface.co/papers/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
|
||||
[BLIP](https://huggingface.co/papers/2201.12086) (Bootstrapped Language-Image Pretraining) is a vision-language pretraining (VLP) framework designed for *both* understanding and generation tasks. Most existing pretrained models are only good at one or the other. It uses a captioner to generate captions and a filter to remove the noisy captions. This increases training data quality and more effectively uses the messy web data.
|
||||
|
||||
BLIP is a model that is able to perform various multi-modal tasks including:
|
||||
- Visual Question Answering
|
||||
- Image-Text retrieval (Image-text matching)
|
||||
- Image Captioning
|
||||
|
||||
The abstract from the paper is the following:
|
||||
You can find all the original BLIP checkpoints under the [BLIP](https://huggingface.co/collections/Salesforce/blip-models-65242f40f1491fbf6a9e9472) collection.
|
||||
|
||||
*Vision-Language Pre-training (VLP) has advanced the performance for many vision-language tasks.
|
||||
However, most existing pre-trained models only excel in either understanding-based tasks or generation-based tasks. Furthermore, performance improvement has been largely achieved by scaling up the dataset with noisy image-text pairs collected from the web, which is a suboptimal source of supervision. In this paper, we propose BLIP, a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks. BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). BLIP also demonstrates strong generalization ability when directly transferred to videolanguage tasks in a zero-shot manner. Code, models, and datasets are released.*
|
||||
> [!TIP]
|
||||
> This model was contributed by [ybelkada](https://huggingface.co/ybelkada).
|
||||
>
|
||||
> Click on the BLIP models in the right sidebar for more examples of how to apply BLIP to different vision language tasks.
|
||||
|
||||

|
||||
The example below demonstrates how to visual question answering with [`Pipeline`] or the [`AutoModel`] class.
|
||||
|
||||
This model was contributed by [ybelkada](https://huggingface.co/ybelkada).
|
||||
The original code can be found [here](https://github.com/salesforce/BLIP).
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(
|
||||
task="visual-question-answering",
|
||||
model="Salesforce/blip-vqa-base",
|
||||
torch_dtype=torch.float16,
|
||||
device=0
|
||||
)
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
||||
pipeline(question="What is the weather in this image?", image=url)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
```python
|
||||
import requests
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
|
||||
|
||||
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
|
||||
model = AutoModelForVisualQuestionAnswering.from_pretrained(
|
||||
"Salesforce/blip-vqa-base",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
||||
image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
question = "What is the weather in this image?"
|
||||
inputs = processor(images=image, text=question, return_tensors="pt").to("cuda", torch.float16)
|
||||
|
||||
output = model.generate(**inputs)
|
||||
processor.batch_decode(output, skip_special_tokens=True)[0]
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Resources
|
||||
|
||||
- [Jupyter notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) on how to fine-tune BLIP for image captioning on a custom dataset
|
||||
Refer to this [notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) to learn how to fine-tune BLIP for image captioning on a custom dataset.
|
||||
|
||||
## BlipConfig
|
||||
|
||||
|
@ -162,7 +162,7 @@ To load and run a model using Flash Attention-2, simply change the code snippet
|
||||
```diff
|
||||
model = Idefics2ForConditionalGeneration.from_pretrained(
|
||||
"HuggingFaceM4/idefics2-8b",
|
||||
+ torch_dtype=torch.float16,
|
||||
+ torch_dtype=torch.float16,
|
||||
+ attn_implementation="flash_attention_2",
|
||||
).to(device)
|
||||
```
|
||||
@ -184,7 +184,7 @@ Quantizing a model is as simple as passing a `quantization_config` to the model.
|
||||
+ )
|
||||
model = Idefics2ForConditionalGeneration.from_pretrained(
|
||||
"HuggingFaceM4/idefics2-8b",
|
||||
+ torch_dtype=torch.float16,
|
||||
+ torch_dtype=torch.float16,
|
||||
+ quantization_config=quantization_config,
|
||||
).to(device)
|
||||
```
|
||||
@ -218,7 +218,10 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
|
||||
[[autodoc]] Idefics2ImageProcessor
|
||||
- preprocess
|
||||
|
||||
## Idefics2ImageProcessorFast
|
||||
[[autodoc]] Idefics2ImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## Idefics2Processor
|
||||
[[autodoc]] Idefics2Processor
|
||||
- __call__
|
||||
- __call__
|
||||
|
@ -80,6 +80,9 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts)
|
||||
[[autodoc]] Idefics3ImageProcessor
|
||||
- preprocess
|
||||
|
||||
## Idefics3ImageProcessorFast
|
||||
[[autodoc]] Idefics3ImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## Idefics3Processor
|
||||
[[autodoc]] Idefics3Processor
|
||||
|
@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:
|
||||
```python
|
||||
>>> # let's load an audio sample from an Arabic speech corpus
|
||||
>>> from datasets import load_dataset
|
||||
>>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
|
||||
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
|
||||
>>> audio_sample = next(iter(dataset))["audio"]
|
||||
|
||||
>>> # now, process it
|
||||
|
@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:
|
||||
```python
|
||||
>>> # let's load an audio sample from an Arabic speech corpus
|
||||
>>> from datasets import load_dataset
|
||||
>>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
|
||||
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
|
||||
>>> audio_sample = next(iter(dataset))["audio"]
|
||||
|
||||
>>> # now, process it
|
||||
|
@ -32,7 +32,7 @@ SmolVLM2 is an adaptation of the Idefics3 model with two main differences:
|
||||
|
||||
Input images are processed either by upsampling (if resizing is enabled) or at their original resolution. The resizing behavior depends on two parameters: do_resize and size.
|
||||
|
||||
Videos should not be upsampled.
|
||||
Videos should not be upsampled.
|
||||
|
||||
If `do_resize` is set to `True`, the model resizes images so that the longest edge is 4*512 pixels by default.
|
||||
The default resizing behavior can be customized by passing a dictionary to the `size` parameter. For example, `{"longest_edge": 4 * 512}` is the default, but you can change it to a different value if needed.
|
||||
@ -192,11 +192,14 @@ print(generated_texts[0])
|
||||
[[autodoc]] SmolVLMForConditionalGeneration
|
||||
- forward
|
||||
|
||||
|
||||
## SmolVLMImageProcessor
|
||||
[[autodoc]] SmolVLMImageProcessor
|
||||
- preprocess
|
||||
|
||||
## SmolVLMImageProcessorFast
|
||||
[[autodoc]] SmolVLMImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## SmolVLMVideoProcessor
|
||||
[[autodoc]] SmolVLMVideoProcessor
|
||||
- preprocess
|
||||
|
122
docs/source/en/model_doc/stt.md
Normal file
122
docs/source/en/model_doc/stt.md
Normal file
@ -0,0 +1,122 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Kyutai Speech-To-Text
|
||||
## Overview
|
||||
|
||||
Kyutai STT is a speech-to-text model architecture based on the [Mimi codec](https://huggingface.co/docs/transformers/en/model_doc/mimi), which encodes audio into discrete tokens in a streaming fashion, and a [Moshi-like](https://huggingface.co/docs/transformers/en/model_doc/moshi) autoregressive decoder. Kyutai’s lab has released two model checkpoints:
|
||||
- [kyutai/stt-1b-en_fr](https://huggingface.co/kyutai/stt-1b-en_fr): a 1B-parameter model capable of transcribing both English and French
|
||||
- [kyutai/stt-2.6b-en](https://huggingface.co/kyutai/stt-2.6b-en): a 2.6B-parameter model focused solely on English, optimized for maximum transcription accuracy
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/eustlb/documentation-images/resolve/main/kyutai_stt.png"/>
|
||||
</div>
|
||||
|
||||
## Usage Tips
|
||||
|
||||
### Inference
|
||||
|
||||
```python
|
||||
import torch
|
||||
from datasets import load_dataset, Audio
|
||||
from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
|
||||
|
||||
# 1. load the model and the processor
|
||||
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
model_id = "kyutai/stt-2.6b-en"
|
||||
|
||||
processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
|
||||
model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
|
||||
|
||||
# 2. load audio samples
|
||||
ds = load_dataset(
|
||||
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
|
||||
)
|
||||
ds = ds.cast_column("audio", Audio(sampling_rate=24000))
|
||||
|
||||
# 3. prepare the model inputs
|
||||
inputs = processor(
|
||||
ds[0]["audio"]["array"],
|
||||
)
|
||||
inputs.to(torch_device)
|
||||
|
||||
# 4. infer the model
|
||||
output_tokens = model.generate(**inputs)
|
||||
|
||||
# 5. decode the generated tokens
|
||||
print(processor.batch_decode(output_tokens, skip_special_tokens=True))
|
||||
```
|
||||
|
||||
### Batched Inference
|
||||
|
||||
```python
|
||||
import torch
|
||||
from datasets import load_dataset, Audio
|
||||
from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
|
||||
|
||||
# 1. load the model and the processor
|
||||
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
model_id = "kyutai/stt-2.6b-en"
|
||||
|
||||
processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
|
||||
model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
|
||||
|
||||
# 2. load audio samples
|
||||
ds = load_dataset(
|
||||
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
|
||||
)
|
||||
ds = ds.cast_column("audio", Audio(sampling_rate=24000))
|
||||
|
||||
# 3. prepare the model inputs
|
||||
audio_arrays = [ds[i]["audio"]["array"] for i in range(4)]
|
||||
inputs = processor(audio_arrays, return_tensors="pt", padding=True)
|
||||
inputs = inputs.to(torch_device)
|
||||
|
||||
# 4. infer the model
|
||||
output_tokens = model.generate(**inputs)
|
||||
|
||||
# 5. decode the generated tokens
|
||||
decoded_outputs = processor.batch_decode(output_tokens, skip_special_tokens=True)
|
||||
for output in decoded_outputs:
|
||||
print(output)
|
||||
```
|
||||
|
||||
This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb).
|
||||
The original code can be found [here](https://github.com/kyutai-labs/moshi).
|
||||
|
||||
|
||||
## KyutaiSpeechToTextConfig
|
||||
|
||||
[[autodoc]] KyutaiSpeechToTextConfig
|
||||
|
||||
## KyutaiSpeechToTextProcessor
|
||||
|
||||
[[autodoc]] KyutaiSpeechToTextProcessor
|
||||
- __call__
|
||||
|
||||
## KyutaiSpeechToTextFeatureExtractor
|
||||
|
||||
[[autodoc]] KyutaiSpeechToTextFeatureExtractor
|
||||
|
||||
## KyutaiSpeechToTextForConditionalGeneration
|
||||
|
||||
[[autodoc]] KyutaiSpeechToTextForConditionalGeneration
|
||||
- forward
|
||||
- generate
|
||||
|
||||
## KyutaiSpeechToTextModel
|
||||
|
||||
[[autodoc]] KyutaiSpeechToTextModel
|
@ -24,7 +24,7 @@ A linter "unravels" the modular file into a `modeling.py` file to preserve the s
|
||||
Run the command below to automatically generate a `modeling.py` file from a modular file.
|
||||
|
||||
```bash
|
||||
python utils/modular_model_converter.py --files_to_parse src/transformers/models/<your_model>/modular_<your_model>.py
|
||||
python utils/modular_model_converter.py --files-to-parse src/transformers/models/<your_model>/modular_<your_model>.py
|
||||
```
|
||||
|
||||
For example:
|
||||
|
@ -31,7 +31,7 @@ Refer to the table below to quickly help you identify the features relevant to y
|
||||
| data preloading | yes | no |
|
||||
| torch_empty_cache_steps | no | yes |
|
||||
| torch.compile | yes | no |
|
||||
| PEFT | no | yes |
|
||||
| scaled dot production attention (SDPA) | yes | yes |
|
||||
|
||||
## Trainer
|
||||
|
||||
@ -128,7 +128,7 @@ fp16 isn't memory-optimized because the gradients that are computed in fp16 are
|
||||
|
||||
[bf16](https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus) trades off some precision for a much larger dynamic range, which is helpful for avoiding overflow and underflow errors. You can use bf16 without adding any loss scaling methods like you would with fp16. bf16 is supported by NVIDIAs Ampere architecture or newer.
|
||||
|
||||
Configure [`~TrainingArguments.fp16`] in [`TrainingArguments`] to enable mixed precision training with the bf16 data type.
|
||||
Configure [`~TrainingArguments.bf16`] in [`TrainingArguments`] to enable mixed precision training with the bf16 data type.
|
||||
|
||||
```py
|
||||
from transformers import TrainingArguments
|
||||
|
@ -264,6 +264,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--dataset_config clean
|
||||
--train_split_name validation
|
||||
--eval_split_name validation
|
||||
--trust_remote_code
|
||||
--output_dir {tmp_dir}
|
||||
--overwrite_output_dir
|
||||
--num_train_epochs=2
|
||||
|
@ -14,6 +14,7 @@ class MyNewModelConfig(PretrainedConfig):
|
||||
This is the configuration class to store the configuration of a [`MyNewModelModel`]. It is used to instantiate an MyNewModel
|
||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
||||
defaults will yield a similar configuration to that of the MyNewModel-7B.
|
||||
e.g. [meta-my_new_model/MyNewModel-2-7b-hf](https://huggingface.co/meta-my_new_model/MyNewModel-2-7b-hf)
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
@ -4,37 +4,25 @@
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_dummy.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
from typing import Callable, Optional, Union
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache, DynamicCache, StaticCache
|
||||
from ...cache_utils import Cache, DynamicCache
|
||||
from ...integrations import use_kernel_forward_from_hub
|
||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||
from ...masking_utils import create_causal_mask
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPast
|
||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import (
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_model_forward,
|
||||
can_return_tuple,
|
||||
is_torch_flex_attn_available,
|
||||
logging,
|
||||
)
|
||||
from ...utils import auto_docstring, can_return_tuple, logging
|
||||
from .configuration_dummy import DummyConfig
|
||||
|
||||
|
||||
if is_torch_flex_attn_available():
|
||||
from torch.nn.attention.flex_attention import BlockMask
|
||||
|
||||
from ...integrations.flex_attention import make_flex_block_causal_mask
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@ -232,15 +220,8 @@ class DummyAttention(nn.Module):
|
||||
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
||||
|
||||
attention_interface: Callable = eager_attention_forward
|
||||
|
||||
if self.config._attn_implementation != "eager":
|
||||
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
|
||||
logger.warning_once(
|
||||
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
|
||||
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
|
||||
)
|
||||
else:
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
|
||||
attn_output, attn_weights = attention_interface(
|
||||
self,
|
||||
@ -311,27 +292,7 @@ class DummyDecoderLayer(GradientCheckpointingLayer):
|
||||
return outputs
|
||||
|
||||
|
||||
DUMMY_START_DOCSTRING = r"""
|
||||
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
||||
etc.)
|
||||
|
||||
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
||||
and behavior.
|
||||
|
||||
Parameters:
|
||||
config ([`DummyConfig`]):
|
||||
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
||||
load the weights associated with the model, only the configuration. Check out the
|
||||
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare Dummy Model outputting raw hidden-states without any specific head on top.",
|
||||
DUMMY_START_DOCSTRING,
|
||||
)
|
||||
@auto_docstring
|
||||
class DummyPreTrainedModel(PreTrainedModel):
|
||||
config_class = DummyConfig
|
||||
base_model_prefix = "model"
|
||||
@ -360,88 +321,8 @@ class DummyPreTrainedModel(PreTrainedModel):
|
||||
module.weight.data.fill_(1.0)
|
||||
|
||||
|
||||
DUMMY_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
||||
it.
|
||||
|
||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
[What are input IDs?](../glossary#input-ids)
|
||||
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length) or `BlockMask`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
|
||||
but you can also pass a `BlockMask` object directly here.
|
||||
|
||||
[What are attention masks?](../glossary#attention-mask)
|
||||
|
||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
|
||||
`past_key_values`).
|
||||
|
||||
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
|
||||
and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
|
||||
information on the default strategy.
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||
config.n_positions - 1]`.
|
||||
|
||||
[What are position IDs?](../glossary#position-ids)
|
||||
past_key_values (`Cache`, *optional*):
|
||||
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
||||
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
|
||||
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
|
||||
|
||||
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
|
||||
|
||||
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
|
||||
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
|
||||
of shape `(batch_size, sequence_length)`.
|
||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
||||
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
|
||||
model's internal embedding lookup matrix.
|
||||
use_cache (`bool`, *optional*):
|
||||
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
||||
`past_key_values`).
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
|
||||
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
|
||||
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
|
||||
the complete sequence length.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare Dummy Model outputting raw hidden-states without any specific head on top.",
|
||||
DUMMY_START_DOCSTRING,
|
||||
)
|
||||
@auto_docstring
|
||||
class DummyModel(DummyPreTrainedModel):
|
||||
"""
|
||||
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DummyDecoderLayer`]
|
||||
|
||||
Args:
|
||||
config: DummyConfig
|
||||
"""
|
||||
|
||||
def __init__(self, config: DummyConfig):
|
||||
super().__init__(config)
|
||||
self.padding_idx = config.pad_token_id
|
||||
@ -465,7 +346,7 @@ class DummyModel(DummyPreTrainedModel):
|
||||
self.embed_tokens = value
|
||||
|
||||
@can_return_tuple
|
||||
@add_start_docstrings_to_model_forward(DUMMY_INPUTS_DOCSTRING)
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
@ -513,8 +394,12 @@ class DummyModel(DummyPreTrainedModel):
|
||||
if position_ids is None:
|
||||
position_ids = cache_position.unsqueeze(0)
|
||||
|
||||
causal_mask = self._update_causal_mask(
|
||||
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
|
||||
causal_mask = create_causal_mask(
|
||||
config=self.config,
|
||||
input_embeds=inputs_embeds,
|
||||
attention_mask=attention_mask,
|
||||
cache_position=cache_position,
|
||||
past_key_values=past_key_values,
|
||||
)
|
||||
|
||||
hidden_states = inputs_embeds
|
||||
@ -559,126 +444,3 @@ class DummyModel(DummyPreTrainedModel):
|
||||
hidden_states=all_hidden_states,
|
||||
attentions=all_self_attns,
|
||||
)
|
||||
|
||||
def _update_causal_mask(
|
||||
self,
|
||||
attention_mask: Union[torch.Tensor, "BlockMask"],
|
||||
input_tensor: torch.Tensor,
|
||||
cache_position: torch.Tensor,
|
||||
past_key_values: Cache,
|
||||
output_attentions: bool = False,
|
||||
):
|
||||
if self.config._attn_implementation == "flash_attention_2":
|
||||
if attention_mask is not None and (attention_mask == 0.0).any():
|
||||
return attention_mask
|
||||
return None
|
||||
if self.config._attn_implementation == "flex_attention":
|
||||
if isinstance(attention_mask, torch.Tensor):
|
||||
attention_mask = make_flex_block_causal_mask(attention_mask)
|
||||
return attention_mask
|
||||
|
||||
# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
|
||||
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
|
||||
# to infer the attention mask.
|
||||
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
||||
using_static_cache = isinstance(past_key_values, StaticCache)
|
||||
|
||||
# When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
|
||||
if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
|
||||
if AttentionMaskConverter._ignore_causal_mask_sdpa(
|
||||
attention_mask,
|
||||
inputs_embeds=input_tensor,
|
||||
past_key_values_length=past_seen_tokens,
|
||||
is_training=self.training,
|
||||
):
|
||||
return None
|
||||
|
||||
dtype = input_tensor.dtype
|
||||
sequence_length = input_tensor.shape[1]
|
||||
if using_static_cache:
|
||||
target_length = past_key_values.get_max_cache_shape()
|
||||
else:
|
||||
target_length = (
|
||||
attention_mask.shape[-1]
|
||||
if isinstance(attention_mask, torch.Tensor)
|
||||
else past_seen_tokens + sequence_length + 1
|
||||
)
|
||||
|
||||
# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
|
||||
causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask,
|
||||
sequence_length=sequence_length,
|
||||
target_length=target_length,
|
||||
dtype=dtype,
|
||||
cache_position=cache_position,
|
||||
batch_size=input_tensor.shape[0],
|
||||
)
|
||||
|
||||
if (
|
||||
self.config._attn_implementation == "sdpa"
|
||||
and attention_mask is not None
|
||||
and attention_mask.device.type in ["cuda", "xpu", "npu"]
|
||||
and not output_attentions
|
||||
):
|
||||
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
|
||||
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
|
||||
# Details: https://github.com/pytorch/pytorch/issues/110213
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
|
||||
|
||||
return causal_mask
|
||||
|
||||
@staticmethod
|
||||
def _prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask: torch.Tensor,
|
||||
sequence_length: int,
|
||||
target_length: int,
|
||||
dtype: torch.dtype,
|
||||
cache_position: torch.Tensor,
|
||||
batch_size: int,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
|
||||
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
|
||||
|
||||
Args:
|
||||
attention_mask (`torch.Tensor`):
|
||||
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
|
||||
`(batch_size, 1, query_length, key_value_length)`.
|
||||
sequence_length (`int`):
|
||||
The sequence length being processed.
|
||||
target_length (`int`):
|
||||
The target length: when generating with static cache, the mask should be as long as the static cache,
|
||||
to account for the 0 padding, the part of the cache that is not filled yet.
|
||||
dtype (`torch.dtype`):
|
||||
The dtype to use for the 4D attention mask.
|
||||
cache_position (`torch.Tensor`):
|
||||
Indices depicting the position of the input sequence tokens in the sequence.
|
||||
batch_size (`torch.Tensor`):
|
||||
Batch size.
|
||||
"""
|
||||
if attention_mask is not None and attention_mask.dim() == 4:
|
||||
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
|
||||
causal_mask = attention_mask
|
||||
else:
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = torch.full(
|
||||
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
|
||||
)
|
||||
if sequence_length != 1:
|
||||
causal_mask = torch.triu(causal_mask, diagonal=1)
|
||||
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
|
||||
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
|
||||
causal_mask.device
|
||||
)
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
)
|
||||
|
||||
return causal_mask
|
||||
|
@ -14,24 +14,16 @@ from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
|
||||
from ...utils import (
|
||||
add_code_sample_docstrings,
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_model_forward,
|
||||
get_torch_version,
|
||||
logging,
|
||||
)
|
||||
from ...utils import auto_docstring, get_torch_version, logging
|
||||
from .configuration_dummy_bert import DummyBertConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
_CHECKPOINT_FOR_DOC = "google-dummy_bert/dummy_bert-base-uncased"
|
||||
_CONFIG_FOR_DOC = "DummyBertConfig"
|
||||
|
||||
|
||||
class DummyBertEmbeddings(nn.Module):
|
||||
"""Construct the embeddings from word, position and token_type embeddings."""
|
||||
@ -432,7 +424,7 @@ class DummyBertOutput(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class DummyBertLayer(nn.Module):
|
||||
class DummyBertLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
||||
@ -557,27 +549,15 @@ class DummyBertEncoder(nn.Module):
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[i] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
@ -739,12 +719,8 @@ def load_tf_weights_in_dummy_bert(model, config, tf_checkpoint_path):
|
||||
return model
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class DummyBertPreTrainedModel(PreTrainedModel):
|
||||
"""
|
||||
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
||||
models.
|
||||
"""
|
||||
|
||||
config_class = DummyBertConfig
|
||||
load_tf_weights = load_tf_weights_in_dummy_bert
|
||||
base_model_prefix = "dummy_bert"
|
||||
@ -770,79 +746,8 @@ class DummyBertPreTrainedModel(PreTrainedModel):
|
||||
module.bias.data.zero_()
|
||||
|
||||
|
||||
DUMMY_BERT_START_DOCSTRING = r"""
|
||||
|
||||
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
||||
etc.)
|
||||
|
||||
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
||||
and behavior.
|
||||
|
||||
Parameters:
|
||||
config ([`DummyBertConfig`]): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
||||
"""
|
||||
|
||||
DUMMY_BERT_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `({0})`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
[What are input IDs?](../glossary#input-ids)
|
||||
attention_mask (`torch.FloatTensor` of shape `({0})`or `(batch_size, sequence_length, target_length)`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
[What are attention masks?](../glossary#attention-mask)
|
||||
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
|
||||
1]`:
|
||||
|
||||
- 0 corresponds to a *sentence A* token,
|
||||
- 1 corresponds to a *sentence B* token.
|
||||
|
||||
[What are token type IDs?](../glossary#token-type-ids)
|
||||
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||
config.max_position_embeddings - 1]`.
|
||||
|
||||
[What are position IDs?](../glossary#position-ids)
|
||||
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
|
||||
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
||||
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
|
||||
model's internal embedding lookup matrix.
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare DummyBert Model transformer outputting raw hidden-states without any specific head on top.",
|
||||
DUMMY_BERT_START_DOCSTRING,
|
||||
)
|
||||
class DummyBertModel(DummyBertPreTrainedModel):
|
||||
"""
|
||||
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
|
||||
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
|
||||
all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
|
||||
@ -852,10 +757,15 @@ class DummyBertModel(DummyBertPreTrainedModel):
|
||||
to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
|
||||
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
|
||||
"""
|
||||
|
||||
)
|
||||
class DummyBertModel(DummyBertPreTrainedModel):
|
||||
_no_split_modules = ["DummyBertEmbeddings", "DummyBertLayer"]
|
||||
|
||||
def __init__(self, config, add_pooling_layer=True):
|
||||
r"""
|
||||
add_pooling_layer (bool, *optional*, defaults to `True`):
|
||||
Whether to add a pooling layer
|
||||
"""
|
||||
super().__init__(config)
|
||||
self.config = config
|
||||
|
||||
@ -884,12 +794,7 @@ class DummyBertModel(DummyBertPreTrainedModel):
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
||||
@add_start_docstrings_to_model_forward(DUMMY_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor] = None,
|
||||
@ -906,26 +811,6 @@ class DummyBertModel(DummyBertPreTrainedModel):
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
|
||||
r"""
|
||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
|
||||
the model is configured as a decoder.
|
||||
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*):
|
||||
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
|
||||
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
|
||||
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
|
||||
|
||||
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
|
||||
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
|
||||
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
||||
use_cache (`bool`, *optional*):
|
||||
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
||||
`past_key_values`).
|
||||
"""
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
|
@ -10,6 +10,7 @@ import torch
|
||||
from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
from ...utils import logging
|
||||
from .configuration_from_uppercase_model import FromUppercaseModelTextConfig, FromUppercaseModelVisionConfig
|
||||
@ -138,7 +139,7 @@ class FromUppercaseModelMLP(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class FromUppercaseModelEncoderLayer(nn.Module):
|
||||
class FromUppercaseModelEncoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: Union[FromUppercaseModelVisionConfig, FromUppercaseModelTextConfig]):
|
||||
super().__init__()
|
||||
self.embed_dim = config.hidden_size
|
||||
|
@ -4,37 +4,25 @@
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_multimodal1.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
from typing import Callable, Optional, Union
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache, DynamicCache, StaticCache
|
||||
from ...cache_utils import Cache, DynamicCache
|
||||
from ...integrations import use_kernel_forward_from_hub
|
||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||
from ...masking_utils import create_causal_mask
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPast
|
||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import (
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_model_forward,
|
||||
can_return_tuple,
|
||||
is_torch_flex_attn_available,
|
||||
logging,
|
||||
)
|
||||
from ...utils import auto_docstring, can_return_tuple, logging
|
||||
from .configuration_multimodal1 import Multimodal1TextConfig
|
||||
|
||||
|
||||
if is_torch_flex_attn_available():
|
||||
from torch.nn.attention.flex_attention import BlockMask
|
||||
|
||||
from ...integrations.flex_attention import make_flex_block_causal_mask
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@ -232,15 +220,8 @@ class Multimodal1TextAttention(nn.Module):
|
||||
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
||||
|
||||
attention_interface: Callable = eager_attention_forward
|
||||
|
||||
if self.config._attn_implementation != "eager":
|
||||
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
|
||||
logger.warning_once(
|
||||
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
|
||||
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
|
||||
)
|
||||
else:
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
|
||||
attn_output, attn_weights = attention_interface(
|
||||
self,
|
||||
@ -311,27 +292,7 @@ class Multimodal1TextDecoderLayer(GradientCheckpointingLayer):
|
||||
return outputs
|
||||
|
||||
|
||||
MULTIMODAL1_TEXT_START_DOCSTRING = r"""
|
||||
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
||||
etc.)
|
||||
|
||||
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
||||
and behavior.
|
||||
|
||||
Parameters:
|
||||
config ([`Multimodal1TextConfig`]):
|
||||
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
||||
load the weights associated with the model, only the configuration. Check out the
|
||||
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare Multimodal1Text Model outputting raw hidden-states without any specific head on top.",
|
||||
MULTIMODAL1_TEXT_START_DOCSTRING,
|
||||
)
|
||||
@auto_docstring
|
||||
class Multimodal1TextPreTrainedModel(PreTrainedModel):
|
||||
config_class = Multimodal1TextConfig
|
||||
base_model_prefix = "model"
|
||||
@ -360,88 +321,8 @@ class Multimodal1TextPreTrainedModel(PreTrainedModel):
|
||||
module.weight.data.fill_(1.0)
|
||||
|
||||
|
||||
MULTIMODAL1_TEXT_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
||||
it.
|
||||
|
||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
[What are input IDs?](../glossary#input-ids)
|
||||
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length) or `BlockMask`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
|
||||
but you can also pass a `BlockMask` object directly here.
|
||||
|
||||
[What are attention masks?](../glossary#attention-mask)
|
||||
|
||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
|
||||
`past_key_values`).
|
||||
|
||||
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
|
||||
and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
|
||||
information on the default strategy.
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||
config.n_positions - 1]`.
|
||||
|
||||
[What are position IDs?](../glossary#position-ids)
|
||||
past_key_values (`Cache`, *optional*):
|
||||
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
||||
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
|
||||
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
|
||||
|
||||
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
|
||||
|
||||
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
|
||||
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
|
||||
of shape `(batch_size, sequence_length)`.
|
||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
||||
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
|
||||
model's internal embedding lookup matrix.
|
||||
use_cache (`bool`, *optional*):
|
||||
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
||||
`past_key_values`).
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
|
||||
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
|
||||
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
|
||||
the complete sequence length.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare Multimodal1Text Model outputting raw hidden-states without any specific head on top.",
|
||||
MULTIMODAL1_TEXT_START_DOCSTRING,
|
||||
)
|
||||
@auto_docstring
|
||||
class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
|
||||
"""
|
||||
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Multimodal1TextDecoderLayer`]
|
||||
|
||||
Args:
|
||||
config: Multimodal1TextConfig
|
||||
"""
|
||||
|
||||
def __init__(self, config: Multimodal1TextConfig):
|
||||
super().__init__(config)
|
||||
self.padding_idx = config.pad_token_id
|
||||
@ -465,7 +346,7 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
|
||||
self.embed_tokens = value
|
||||
|
||||
@can_return_tuple
|
||||
@add_start_docstrings_to_model_forward(MULTIMODAL1_TEXT_INPUTS_DOCSTRING)
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
@ -513,8 +394,12 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
|
||||
if position_ids is None:
|
||||
position_ids = cache_position.unsqueeze(0)
|
||||
|
||||
causal_mask = self._update_causal_mask(
|
||||
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
|
||||
causal_mask = create_causal_mask(
|
||||
config=self.config,
|
||||
input_embeds=inputs_embeds,
|
||||
attention_mask=attention_mask,
|
||||
cache_position=cache_position,
|
||||
past_key_values=past_key_values,
|
||||
)
|
||||
|
||||
hidden_states = inputs_embeds
|
||||
@ -559,126 +444,3 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
|
||||
hidden_states=all_hidden_states,
|
||||
attentions=all_self_attns,
|
||||
)
|
||||
|
||||
def _update_causal_mask(
|
||||
self,
|
||||
attention_mask: Union[torch.Tensor, "BlockMask"],
|
||||
input_tensor: torch.Tensor,
|
||||
cache_position: torch.Tensor,
|
||||
past_key_values: Cache,
|
||||
output_attentions: bool = False,
|
||||
):
|
||||
if self.config._attn_implementation == "flash_attention_2":
|
||||
if attention_mask is not None and (attention_mask == 0.0).any():
|
||||
return attention_mask
|
||||
return None
|
||||
if self.config._attn_implementation == "flex_attention":
|
||||
if isinstance(attention_mask, torch.Tensor):
|
||||
attention_mask = make_flex_block_causal_mask(attention_mask)
|
||||
return attention_mask
|
||||
|
||||
# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
|
||||
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
|
||||
# to infer the attention mask.
|
||||
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
||||
using_static_cache = isinstance(past_key_values, StaticCache)
|
||||
|
||||
# When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
|
||||
if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
|
||||
if AttentionMaskConverter._ignore_causal_mask_sdpa(
|
||||
attention_mask,
|
||||
inputs_embeds=input_tensor,
|
||||
past_key_values_length=past_seen_tokens,
|
||||
is_training=self.training,
|
||||
):
|
||||
return None
|
||||
|
||||
dtype = input_tensor.dtype
|
||||
sequence_length = input_tensor.shape[1]
|
||||
if using_static_cache:
|
||||
target_length = past_key_values.get_max_cache_shape()
|
||||
else:
|
||||
target_length = (
|
||||
attention_mask.shape[-1]
|
||||
if isinstance(attention_mask, torch.Tensor)
|
||||
else past_seen_tokens + sequence_length + 1
|
||||
)
|
||||
|
||||
# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
|
||||
causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask,
|
||||
sequence_length=sequence_length,
|
||||
target_length=target_length,
|
||||
dtype=dtype,
|
||||
cache_position=cache_position,
|
||||
batch_size=input_tensor.shape[0],
|
||||
)
|
||||
|
||||
if (
|
||||
self.config._attn_implementation == "sdpa"
|
||||
and attention_mask is not None
|
||||
and attention_mask.device.type in ["cuda", "xpu", "npu"]
|
||||
and not output_attentions
|
||||
):
|
||||
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
|
||||
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
|
||||
# Details: https://github.com/pytorch/pytorch/issues/110213
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
|
||||
|
||||
return causal_mask
|
||||
|
||||
@staticmethod
|
||||
def _prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask: torch.Tensor,
|
||||
sequence_length: int,
|
||||
target_length: int,
|
||||
dtype: torch.dtype,
|
||||
cache_position: torch.Tensor,
|
||||
batch_size: int,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
|
||||
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
|
||||
|
||||
Args:
|
||||
attention_mask (`torch.Tensor`):
|
||||
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
|
||||
`(batch_size, 1, query_length, key_value_length)`.
|
||||
sequence_length (`int`):
|
||||
The sequence length being processed.
|
||||
target_length (`int`):
|
||||
The target length: when generating with static cache, the mask should be as long as the static cache,
|
||||
to account for the 0 padding, the part of the cache that is not filled yet.
|
||||
dtype (`torch.dtype`):
|
||||
The dtype to use for the 4D attention mask.
|
||||
cache_position (`torch.Tensor`):
|
||||
Indices depicting the position of the input sequence tokens in the sequence.
|
||||
batch_size (`torch.Tensor`):
|
||||
Batch size.
|
||||
"""
|
||||
if attention_mask is not None and attention_mask.dim() == 4:
|
||||
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
|
||||
causal_mask = attention_mask
|
||||
else:
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = torch.full(
|
||||
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
|
||||
)
|
||||
if sequence_length != 1:
|
||||
causal_mask = torch.triu(causal_mask, diagonal=1)
|
||||
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
|
||||
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
|
||||
causal_mask.device
|
||||
)
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
)
|
||||
|
||||
return causal_mask
|
||||
|
@ -13,15 +13,10 @@ from torch import nn
|
||||
from transformers.utils import add_start_docstrings
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...utils import (
|
||||
add_start_docstrings_to_model_forward,
|
||||
can_return_tuple,
|
||||
logging,
|
||||
replace_return_docstrings,
|
||||
torch_int,
|
||||
)
|
||||
from ...utils import auto_docstring, can_return_tuple, logging, torch_int
|
||||
from .configuration_multimodal2 import Multimodal2Config, Multimodal2TextConfig, Multimodal2VisionConfig
|
||||
|
||||
|
||||
@ -229,7 +224,7 @@ class Multimodal2Attention(nn.Module):
|
||||
return attn_output, attn_weights
|
||||
|
||||
|
||||
class Multimodal2VisionEncoderLayer(nn.Module):
|
||||
class Multimodal2VisionEncoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.embed_dim = config.hidden_size
|
||||
@ -344,21 +339,12 @@ class Multimodal2VisionEncoder(nn.Module):
|
||||
for idx, encoder_layer in enumerate(self.layers):
|
||||
if output_hidden_states:
|
||||
encoder_states = encoder_states + (hidden_states,)
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
encoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
@ -458,24 +444,6 @@ class Multimodal2VisionEmbeddings(nn.Module):
|
||||
return embeddings
|
||||
|
||||
|
||||
MULTIMODAL2_VISION_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
|
||||
[`AutoImageProcessor`]. See [`Multimodal2ImageProcessor.__call__`] for details.
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
interpolate_pos_encoding (`bool`, *optional*, defaults `False`):
|
||||
Whether to interpolate the pre-trained position encodings.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
class Multimodal2VisionTransformer(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
@ -488,8 +456,7 @@ class Multimodal2VisionTransformer(nn.Module):
|
||||
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
||||
|
||||
@can_return_tuple
|
||||
@add_start_docstrings_to_model_forward(MULTIMODAL2_VISION_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Multimodal2VisionConfig)
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
pixel_values: Optional[torch.FloatTensor] = None,
|
||||
@ -497,10 +464,6 @@ class Multimodal2VisionTransformer(nn.Module):
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
interpolate_pos_encoding: Optional[bool] = False,
|
||||
) -> BaseModelOutputWithPooling:
|
||||
r"""
|
||||
Returns:
|
||||
|
||||
"""
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
@ -530,17 +493,15 @@ class Multimodal2VisionTransformer(nn.Module):
|
||||
)
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class Multimodal2VisionPreTrainedModel(PreTrainedModel):
|
||||
"""
|
||||
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
||||
models.
|
||||
"""
|
||||
|
||||
config_class = Multimodal2Config
|
||||
base_model_prefix = "multimodal2_vision"
|
||||
supports_gradient_checkpointing = True
|
||||
_supports_sdpa = True
|
||||
_supports_flash_attn_2 = True
|
||||
_supports_flex_attn = True
|
||||
_supports_attention_backend = True
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights"""
|
||||
@ -567,8 +528,7 @@ class Multimodal2VisionModel(Multimodal2VisionPreTrainedModel):
|
||||
return self.vision_model.embeddings.patch_embedding
|
||||
|
||||
@can_return_tuple
|
||||
@add_start_docstrings_to_model_forward(MULTIMODAL2_VISION_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Multimodal2VisionConfig)
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
pixel_values: Optional[torch.FloatTensor] = None,
|
||||
@ -577,9 +537,7 @@ class Multimodal2VisionModel(Multimodal2VisionPreTrainedModel):
|
||||
interpolate_pos_encoding: bool = False,
|
||||
) -> BaseModelOutputWithPooling:
|
||||
r"""
|
||||
Returns:
|
||||
|
||||
Examples:
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from PIL import Image
|
||||
|
@ -4,36 +4,24 @@
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_my_new_model2.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
from typing import Callable, Optional, Union
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache, DynamicCache, StaticCache
|
||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||
from ...cache_utils import Cache, DynamicCache
|
||||
from ...masking_utils import create_causal_mask
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPast, SequenceClassifierOutputWithPast
|
||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import (
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_model_forward,
|
||||
can_return_tuple,
|
||||
is_torch_flex_attn_available,
|
||||
logging,
|
||||
)
|
||||
from ...utils import auto_docstring, can_return_tuple, logging
|
||||
from .configuration_my_new_model2 import MyNewModel2Config
|
||||
|
||||
|
||||
if is_torch_flex_attn_available():
|
||||
from torch.nn.attention.flex_attention import BlockMask
|
||||
|
||||
from ...integrations.flex_attention import make_flex_block_causal_mask
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@ -230,15 +218,8 @@ class MyNewModel2Attention(nn.Module):
|
||||
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
||||
|
||||
attention_interface: Callable = eager_attention_forward
|
||||
|
||||
if self.config._attn_implementation != "eager":
|
||||
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
|
||||
logger.warning_once(
|
||||
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
|
||||
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
|
||||
)
|
||||
else:
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
|
||||
attn_output, attn_weights = attention_interface(
|
||||
self,
|
||||
@ -309,27 +290,7 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
|
||||
return outputs
|
||||
|
||||
|
||||
MY_NEW_MODEL2_START_DOCSTRING = r"""
|
||||
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
||||
etc.)
|
||||
|
||||
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
||||
and behavior.
|
||||
|
||||
Parameters:
|
||||
config ([`MyNewModel2Config`]):
|
||||
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
||||
load the weights associated with the model, only the configuration. Check out the
|
||||
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare MyNewModel2 Model outputting raw hidden-states without any specific head on top.",
|
||||
MY_NEW_MODEL2_START_DOCSTRING,
|
||||
)
|
||||
@auto_docstring
|
||||
class MyNewModel2PreTrainedModel(PreTrainedModel):
|
||||
config_class = MyNewModel2Config
|
||||
base_model_prefix = "model"
|
||||
@ -358,88 +319,8 @@ class MyNewModel2PreTrainedModel(PreTrainedModel):
|
||||
module.weight.data.fill_(1.0)
|
||||
|
||||
|
||||
MY_NEW_MODEL2_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
||||
it.
|
||||
|
||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
[What are input IDs?](../glossary#input-ids)
|
||||
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length) or `BlockMask`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
|
||||
but you can also pass a `BlockMask` object directly here.
|
||||
|
||||
[What are attention masks?](../glossary#attention-mask)
|
||||
|
||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
|
||||
`past_key_values`).
|
||||
|
||||
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
|
||||
and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
|
||||
information on the default strategy.
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||
config.n_positions - 1]`.
|
||||
|
||||
[What are position IDs?](../glossary#position-ids)
|
||||
past_key_values (`Cache`, *optional*):
|
||||
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
||||
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
|
||||
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
|
||||
|
||||
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
|
||||
|
||||
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
|
||||
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
|
||||
of shape `(batch_size, sequence_length)`.
|
||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
||||
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
|
||||
model's internal embedding lookup matrix.
|
||||
use_cache (`bool`, *optional*):
|
||||
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
||||
`past_key_values`).
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
|
||||
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
|
||||
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
|
||||
the complete sequence length.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare MyNewModel2 Model outputting raw hidden-states without any specific head on top.",
|
||||
MY_NEW_MODEL2_START_DOCSTRING,
|
||||
)
|
||||
@auto_docstring
|
||||
class MyNewModel2Model(MyNewModel2PreTrainedModel):
|
||||
"""
|
||||
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MyNewModel2DecoderLayer`]
|
||||
|
||||
Args:
|
||||
config: MyNewModel2Config
|
||||
"""
|
||||
|
||||
def __init__(self, config: MyNewModel2Config):
|
||||
super().__init__(config)
|
||||
self.padding_idx = config.pad_token_id
|
||||
@ -463,19 +344,19 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
|
||||
self.embed_tokens = value
|
||||
|
||||
@can_return_tuple
|
||||
@add_start_docstrings_to_model_forward(MY_NEW_MODEL2_INPUTS_DOCSTRING)
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs, # NOOP kwarg for now
|
||||
**kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> BaseModelOutputWithPast:
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
@ -507,8 +388,12 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
|
||||
if position_ids is None:
|
||||
position_ids = cache_position.unsqueeze(0)
|
||||
|
||||
causal_mask = self._update_causal_mask(
|
||||
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
|
||||
causal_mask = create_causal_mask(
|
||||
config=self.config,
|
||||
input_embeds=inputs_embeds,
|
||||
attention_mask=attention_mask,
|
||||
cache_position=cache_position,
|
||||
past_key_values=past_key_values,
|
||||
)
|
||||
|
||||
# embed positions
|
||||
@ -540,6 +425,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
position_embeddings=position_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
@ -560,132 +446,9 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
|
||||
attentions=all_self_attns,
|
||||
)
|
||||
|
||||
def _update_causal_mask(
|
||||
self,
|
||||
attention_mask: Union[torch.Tensor, "BlockMask"],
|
||||
input_tensor: torch.Tensor,
|
||||
cache_position: torch.Tensor,
|
||||
past_key_values: Cache,
|
||||
output_attentions: bool = False,
|
||||
):
|
||||
if self.config._attn_implementation == "flash_attention_2":
|
||||
if attention_mask is not None and (attention_mask == 0.0).any():
|
||||
return attention_mask
|
||||
return None
|
||||
if self.config._attn_implementation == "flex_attention":
|
||||
if isinstance(attention_mask, torch.Tensor):
|
||||
attention_mask = make_flex_block_causal_mask(attention_mask)
|
||||
return attention_mask
|
||||
|
||||
# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
|
||||
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
|
||||
# to infer the attention mask.
|
||||
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
||||
using_static_cache = isinstance(past_key_values, StaticCache)
|
||||
|
||||
# When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
|
||||
if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
|
||||
if AttentionMaskConverter._ignore_causal_mask_sdpa(
|
||||
attention_mask,
|
||||
inputs_embeds=input_tensor,
|
||||
past_key_values_length=past_seen_tokens,
|
||||
is_training=self.training,
|
||||
):
|
||||
return None
|
||||
|
||||
dtype = input_tensor.dtype
|
||||
sequence_length = input_tensor.shape[1]
|
||||
if using_static_cache:
|
||||
target_length = past_key_values.get_max_cache_shape()
|
||||
else:
|
||||
target_length = (
|
||||
attention_mask.shape[-1]
|
||||
if isinstance(attention_mask, torch.Tensor)
|
||||
else past_seen_tokens + sequence_length + 1
|
||||
)
|
||||
|
||||
# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
|
||||
causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask,
|
||||
sequence_length=sequence_length,
|
||||
target_length=target_length,
|
||||
dtype=dtype,
|
||||
cache_position=cache_position,
|
||||
batch_size=input_tensor.shape[0],
|
||||
)
|
||||
|
||||
if (
|
||||
self.config._attn_implementation == "sdpa"
|
||||
and attention_mask is not None
|
||||
and attention_mask.device.type in ["cuda", "xpu", "npu"]
|
||||
and not output_attentions
|
||||
):
|
||||
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
|
||||
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
|
||||
# Details: https://github.com/pytorch/pytorch/issues/110213
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
|
||||
|
||||
return causal_mask
|
||||
|
||||
@staticmethod
|
||||
def _prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask: torch.Tensor,
|
||||
sequence_length: int,
|
||||
target_length: int,
|
||||
dtype: torch.dtype,
|
||||
cache_position: torch.Tensor,
|
||||
batch_size: int,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
|
||||
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
|
||||
|
||||
Args:
|
||||
attention_mask (`torch.Tensor`):
|
||||
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
|
||||
`(batch_size, 1, query_length, key_value_length)`.
|
||||
sequence_length (`int`):
|
||||
The sequence length being processed.
|
||||
target_length (`int`):
|
||||
The target length: when generating with static cache, the mask should be as long as the static cache,
|
||||
to account for the 0 padding, the part of the cache that is not filled yet.
|
||||
dtype (`torch.dtype`):
|
||||
The dtype to use for the 4D attention mask.
|
||||
cache_position (`torch.Tensor`):
|
||||
Indices depicting the position of the input sequence tokens in the sequence.
|
||||
batch_size (`torch.Tensor`):
|
||||
Batch size.
|
||||
"""
|
||||
if attention_mask is not None and attention_mask.dim() == 4:
|
||||
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
|
||||
causal_mask = attention_mask
|
||||
else:
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = torch.full(
|
||||
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
|
||||
)
|
||||
if sequence_length != 1:
|
||||
causal_mask = torch.triu(causal_mask, diagonal=1)
|
||||
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
|
||||
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
|
||||
causal_mask.device
|
||||
)
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
)
|
||||
|
||||
return causal_mask
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
The MyNewModel2 Model transformer with a sequence classification head on top (linear layer).
|
||||
|
||||
[`MyNewModel2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
|
||||
@ -696,8 +459,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
|
||||
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
|
||||
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
|
||||
each row of the batch).
|
||||
""",
|
||||
MY_NEW_MODEL2_START_DOCSTRING,
|
||||
"""
|
||||
)
|
||||
class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
|
||||
def __init__(self, config):
|
||||
@ -716,7 +478,7 @@ class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
|
||||
self.model.embed_tokens = value
|
||||
|
||||
@can_return_tuple
|
||||
@add_start_docstrings_to_model_forward(MY_NEW_MODEL2_INPUTS_DOCSTRING)
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
|
@ -22,68 +22,48 @@ from .configuration_new_task_model import NewTaskModelConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class NewTaskModelModelOutputWithPast(BaseModelOutputWithPast):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for NewTaskModel outputs, with hidden states and attentions.
|
||||
"""
|
||||
)
|
||||
class NewTaskModelModelOutputWithPast(BaseModelOutputWithPast):
|
||||
r"""
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
"""
|
||||
|
||||
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class NewTaskModelCausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for NewTaskModel causal language model (or autoregressive) outputs.
|
||||
"""
|
||||
)
|
||||
class NewTaskModelCausalLMOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -157,6 +137,12 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def _update_causal_mask(
|
||||
self,
|
||||
attention_mask,
|
||||
@ -406,10 +392,13 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder()
|
||||
|
||||
def get_image_features(self, pixel_values):
|
||||
return self.model.get_image_features(pixel_values)
|
||||
|
||||
# Make modules available throught conditional class for BC
|
||||
@property
|
||||
|
@ -14,24 +14,16 @@ from packaging import version
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
|
||||
from ...utils import (
|
||||
add_code_sample_docstrings,
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_model_forward,
|
||||
get_torch_version,
|
||||
logging,
|
||||
)
|
||||
from ...utils import auto_docstring, get_torch_version, logging
|
||||
from .configuration_roberta import RobertaConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
_CHECKPOINT_FOR_DOC = "google-roberta/roberta-base-uncased"
|
||||
_CONFIG_FOR_DOC = "RobertaConfig"
|
||||
|
||||
|
||||
class RobertaEmbeddings(nn.Module):
|
||||
"""Construct the embeddings from word, position and token_type embeddings."""
|
||||
@ -435,7 +427,7 @@ class RobertaOutput(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class RobertaLayer(nn.Module):
|
||||
class RobertaLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
||||
@ -560,27 +552,15 @@ class RobertaEncoder(nn.Module):
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[i] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
@ -742,12 +722,8 @@ def load_tf_weights_in_roberta(model, config, tf_checkpoint_path):
|
||||
return model
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class RobertaPreTrainedModel(PreTrainedModel):
|
||||
"""
|
||||
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
||||
models.
|
||||
"""
|
||||
|
||||
config_class = RobertaConfig
|
||||
load_tf_weights = load_tf_weights_in_roberta
|
||||
base_model_prefix = "roberta"
|
||||
@ -773,79 +749,8 @@ class RobertaPreTrainedModel(PreTrainedModel):
|
||||
module.bias.data.zero_()
|
||||
|
||||
|
||||
ROBERTA_START_DOCSTRING = r"""
|
||||
|
||||
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
||||
etc.)
|
||||
|
||||
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
||||
and behavior.
|
||||
|
||||
Parameters:
|
||||
config ([`RobertaConfig`]): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
||||
"""
|
||||
|
||||
ROBERTA_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `({0})`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
[What are input IDs?](../glossary#input-ids)
|
||||
attention_mask (`torch.FloatTensor` of shape `({0})`or `(batch_size, sequence_length, target_length)`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
[What are attention masks?](../glossary#attention-mask)
|
||||
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
|
||||
1]`:
|
||||
|
||||
- 0 corresponds to a *sentence A* token,
|
||||
- 1 corresponds to a *sentence B* token.
|
||||
|
||||
[What are token type IDs?](../glossary#token-type-ids)
|
||||
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||
config.max_position_embeddings - 1]`.
|
||||
|
||||
[What are position IDs?](../glossary#position-ids)
|
||||
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
|
||||
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
||||
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
|
||||
model's internal embedding lookup matrix.
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare Roberta Model transformer outputting raw hidden-states without any specific head on top.",
|
||||
ROBERTA_START_DOCSTRING,
|
||||
)
|
||||
class RobertaModel(RobertaPreTrainedModel):
|
||||
"""
|
||||
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
|
||||
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
|
||||
all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
|
||||
@ -855,10 +760,15 @@ class RobertaModel(RobertaPreTrainedModel):
|
||||
to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
|
||||
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
|
||||
"""
|
||||
|
||||
)
|
||||
class RobertaModel(RobertaPreTrainedModel):
|
||||
_no_split_modules = ["RobertaEmbeddings", "RobertaLayer"]
|
||||
|
||||
def __init__(self, config, add_pooling_layer=True):
|
||||
r"""
|
||||
add_pooling_layer (bool, *optional*, defaults to `True`):
|
||||
Whether to add a pooling layer
|
||||
"""
|
||||
super().__init__(config)
|
||||
self.config = config
|
||||
|
||||
@ -887,12 +797,7 @@ class RobertaModel(RobertaPreTrainedModel):
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
||||
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
@add_code_sample_docstrings(
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor] = None,
|
||||
@ -909,26 +814,6 @@ class RobertaModel(RobertaPreTrainedModel):
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
|
||||
r"""
|
||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
|
||||
the model is configured as a decoder.
|
||||
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*):
|
||||
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
|
||||
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
|
||||
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
|
||||
|
||||
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
|
||||
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
|
||||
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
||||
use_cache (`bool`, *optional*):
|
||||
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
||||
`past_key_values`).
|
||||
"""
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
|
@ -12,33 +12,17 @@ from torch import nn
|
||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache, StaticCache
|
||||
from ...cache_utils import Cache
|
||||
from ...integrations import use_kernel_forward_from_hub
|
||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import (
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_model_forward,
|
||||
can_return_tuple,
|
||||
is_torch_flex_attn_available,
|
||||
logging,
|
||||
)
|
||||
from ...utils import auto_docstring, can_return_tuple
|
||||
from .configuration_super import SuperConfig
|
||||
|
||||
|
||||
if is_torch_flex_attn_available():
|
||||
from torch.nn.attention.flex_attention import BlockMask
|
||||
|
||||
from ...integrations.flex_attention import make_flex_block_causal_mask
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@use_kernel_forward_from_hub("RMSNorm")
|
||||
class SuperRMSNorm(nn.Module):
|
||||
def __init__(self, hidden_size, eps=1e-6):
|
||||
@ -233,15 +217,8 @@ class SuperAttention(nn.Module):
|
||||
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
||||
|
||||
attention_interface: Callable = eager_attention_forward
|
||||
|
||||
if self.config._attn_implementation != "eager":
|
||||
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
|
||||
logger.warning_once(
|
||||
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
|
||||
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
|
||||
)
|
||||
else:
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
|
||||
attn_output, attn_weights = attention_interface(
|
||||
self,
|
||||
@ -312,27 +289,7 @@ class SuperDecoderLayer(GradientCheckpointingLayer):
|
||||
return outputs
|
||||
|
||||
|
||||
SUPER_START_DOCSTRING = r"""
|
||||
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
||||
etc.)
|
||||
|
||||
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
||||
and behavior.
|
||||
|
||||
Parameters:
|
||||
config ([`SuperConfig`]):
|
||||
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
||||
load the weights associated with the model, only the configuration. Check out the
|
||||
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare Super Model outputting raw hidden-states without any specific head on top.",
|
||||
SUPER_START_DOCSTRING,
|
||||
)
|
||||
@auto_docstring
|
||||
class SuperPreTrainedModel(PreTrainedModel):
|
||||
config_class = SuperConfig
|
||||
base_model_prefix = "model"
|
||||
@ -361,88 +318,8 @@ class SuperPreTrainedModel(PreTrainedModel):
|
||||
module.weight.data.fill_(1.0)
|
||||
|
||||
|
||||
SUPER_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
||||
it.
|
||||
|
||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
[What are input IDs?](../glossary#input-ids)
|
||||
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length) or `BlockMask`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask,
|
||||
but you can also pass a `BlockMask` object directly here.
|
||||
|
||||
[What are attention masks?](../glossary#attention-mask)
|
||||
|
||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
|
||||
`past_key_values`).
|
||||
|
||||
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
|
||||
and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
|
||||
information on the default strategy.
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||
config.n_positions - 1]`.
|
||||
|
||||
[What are position IDs?](../glossary#position-ids)
|
||||
past_key_values (`Cache`, *optional*):
|
||||
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
||||
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
|
||||
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
|
||||
|
||||
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
|
||||
|
||||
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
|
||||
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
|
||||
of shape `(batch_size, sequence_length)`.
|
||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
||||
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
|
||||
model's internal embedding lookup matrix.
|
||||
use_cache (`bool`, *optional*):
|
||||
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
||||
`past_key_values`).
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
|
||||
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
|
||||
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
|
||||
the complete sequence length.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare Super Model outputting raw hidden-states without any specific head on top.",
|
||||
SUPER_START_DOCSTRING,
|
||||
)
|
||||
@auto_docstring
|
||||
class SuperModel(SuperPreTrainedModel):
|
||||
"""
|
||||
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SuperDecoderLayer`]
|
||||
|
||||
Args:
|
||||
config: SuperConfig
|
||||
"""
|
||||
|
||||
def __init__(self, config: SuperConfig):
|
||||
super().__init__(config)
|
||||
self.padding_idx = config.pad_token_id
|
||||
@ -466,7 +343,7 @@ class SuperModel(SuperPreTrainedModel):
|
||||
self.embed_tokens = value
|
||||
|
||||
@can_return_tuple
|
||||
@add_start_docstrings_to_model_forward(SUPER_INPUTS_DOCSTRING)
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.LongTensor = None,
|
||||
@ -494,126 +371,3 @@ class SuperModel(SuperPreTrainedModel):
|
||||
)
|
||||
out.logits *= 2**4
|
||||
return out
|
||||
|
||||
def _update_causal_mask(
|
||||
self,
|
||||
attention_mask: Union[torch.Tensor, "BlockMask"],
|
||||
input_tensor: torch.Tensor,
|
||||
cache_position: torch.Tensor,
|
||||
past_key_values: Cache,
|
||||
output_attentions: bool = False,
|
||||
):
|
||||
if self.config._attn_implementation == "flash_attention_2":
|
||||
if attention_mask is not None and (attention_mask == 0.0).any():
|
||||
return attention_mask
|
||||
return None
|
||||
if self.config._attn_implementation == "flex_attention":
|
||||
if isinstance(attention_mask, torch.Tensor):
|
||||
attention_mask = make_flex_block_causal_mask(attention_mask)
|
||||
return attention_mask
|
||||
|
||||
# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
|
||||
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
|
||||
# to infer the attention mask.
|
||||
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
||||
using_static_cache = isinstance(past_key_values, StaticCache)
|
||||
|
||||
# When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
|
||||
if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
|
||||
if AttentionMaskConverter._ignore_causal_mask_sdpa(
|
||||
attention_mask,
|
||||
inputs_embeds=input_tensor,
|
||||
past_key_values_length=past_seen_tokens,
|
||||
is_training=self.training,
|
||||
):
|
||||
return None
|
||||
|
||||
dtype = input_tensor.dtype
|
||||
sequence_length = input_tensor.shape[1]
|
||||
if using_static_cache:
|
||||
target_length = past_key_values.get_max_cache_shape()
|
||||
else:
|
||||
target_length = (
|
||||
attention_mask.shape[-1]
|
||||
if isinstance(attention_mask, torch.Tensor)
|
||||
else past_seen_tokens + sequence_length + 1
|
||||
)
|
||||
|
||||
# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
|
||||
causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask,
|
||||
sequence_length=sequence_length,
|
||||
target_length=target_length,
|
||||
dtype=dtype,
|
||||
cache_position=cache_position,
|
||||
batch_size=input_tensor.shape[0],
|
||||
)
|
||||
|
||||
if (
|
||||
self.config._attn_implementation == "sdpa"
|
||||
and attention_mask is not None
|
||||
and attention_mask.device.type in ["cuda", "xpu", "npu"]
|
||||
and not output_attentions
|
||||
):
|
||||
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
|
||||
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
|
||||
# Details: https://github.com/pytorch/pytorch/issues/110213
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
|
||||
|
||||
return causal_mask
|
||||
|
||||
@staticmethod
|
||||
def _prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask: torch.Tensor,
|
||||
sequence_length: int,
|
||||
target_length: int,
|
||||
dtype: torch.dtype,
|
||||
cache_position: torch.Tensor,
|
||||
batch_size: int,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
|
||||
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
|
||||
|
||||
Args:
|
||||
attention_mask (`torch.Tensor`):
|
||||
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
|
||||
`(batch_size, 1, query_length, key_value_length)`.
|
||||
sequence_length (`int`):
|
||||
The sequence length being processed.
|
||||
target_length (`int`):
|
||||
The target length: when generating with static cache, the mask should be as long as the static cache,
|
||||
to account for the 0 padding, the part of the cache that is not filled yet.
|
||||
dtype (`torch.dtype`):
|
||||
The dtype to use for the 4D attention mask.
|
||||
cache_position (`torch.Tensor`):
|
||||
Indices depicting the position of the input sequence tokens in the sequence.
|
||||
batch_size (`torch.Tensor`):
|
||||
Batch size.
|
||||
"""
|
||||
if attention_mask is not None and attention_mask.dim() == 4:
|
||||
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
|
||||
causal_mask = attention_mask
|
||||
else:
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
causal_mask = torch.full(
|
||||
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
|
||||
)
|
||||
if sequence_length != 1:
|
||||
causal_mask = torch.triu(causal_mask, diagonal=1)
|
||||
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
|
||||
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
|
||||
causal_mask.device
|
||||
)
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
)
|
||||
|
||||
return causal_mask
|
||||
|
@ -14,13 +14,9 @@ from ...cache_utils import Cache
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import logging
|
||||
from .configuration_switch_function import SwitchFunctionConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
def rotate_half(x):
|
||||
# Split and rotate. Note that this function is different from e.g. Llama.
|
||||
x1 = x[..., ::2]
|
||||
@ -145,15 +141,8 @@ class SwitchFunctionAttention(nn.Module):
|
||||
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
||||
|
||||
attention_interface: Callable = eager_attention_forward
|
||||
|
||||
if self.config._attn_implementation != "eager":
|
||||
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
|
||||
logger.warning_once(
|
||||
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
|
||||
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
|
||||
)
|
||||
else:
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
|
||||
attn_output, attn_weights = attention_interface(
|
||||
self,
|
||||
|
@ -16,17 +16,11 @@ from torch import Tensor, nn
|
||||
from ...activations import ACT2FN
|
||||
from ...integrations import use_kernel_forward_from_hub
|
||||
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutput
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import meshgrid
|
||||
from ...utils import (
|
||||
ModelOutput,
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_model_forward,
|
||||
is_timm_available,
|
||||
replace_return_docstrings,
|
||||
requires_backends,
|
||||
)
|
||||
from ...utils import ModelOutput, auto_docstring, is_timm_available, requires_backends
|
||||
from ...utils.backbone_utils import load_backbone
|
||||
from .configuration_test_detr import TestDetrConfig
|
||||
|
||||
@ -34,8 +28,6 @@ from .configuration_test_detr import TestDetrConfig
|
||||
if is_timm_available():
|
||||
from timm import create_model
|
||||
|
||||
_CONFIG_FOR_DOC = "TestDetrConfig"
|
||||
|
||||
|
||||
@use_kernel_forward_from_hub("MultiScaleDeformableAttention")
|
||||
class MultiScaleDeformableAttention(nn.Module):
|
||||
@ -93,32 +85,24 @@ class MultiScaleDeformableAttention(nn.Module):
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestDetrDecoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the TestDetrDecoder. This class adds two attributes to
|
||||
BaseModelOutputWithCrossAttentions, namely:
|
||||
- a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
|
||||
- a stacked tensor of intermediate reference points.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
|
||||
Stacked intermediate hidden states (output of each layer of the decoder).
|
||||
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
|
||||
Stacked intermediate reference points (reference points of each layer of the decoder).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
|
||||
used to compute the weighted average in the cross-attention heads.
|
||||
"""
|
||||
)
|
||||
class TestDetrDecoderOutput(ModelOutput):
|
||||
r"""
|
||||
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
|
||||
Stacked intermediate hidden states (output of each layer of the decoder).
|
||||
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
|
||||
Stacked intermediate reference points (reference points of each layer of the decoder).
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
|
||||
used to compute the weighted average in the cross-attention heads.
|
||||
"""
|
||||
|
||||
last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
@ -130,47 +114,27 @@ class TestDetrDecoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestDetrModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of the Deformable DETR encoder-decoder model.
|
||||
|
||||
Args:
|
||||
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
|
||||
Initial reference points sent through the Transformer decoder.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
|
||||
Stacked intermediate hidden states (output of each layer of the decoder).
|
||||
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
|
||||
Stacked intermediate reference points (reference points of each layer of the decoder).
|
||||
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
|
||||
plus the initial embedding outputs.
|
||||
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
|
||||
num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
|
||||
average in the self-attention heads.
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
|
||||
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
|
||||
weighted average in the cross-attention heads.
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
||||
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
|
||||
layer plus the initial embedding outputs.
|
||||
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
|
||||
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
|
||||
self-attention heads.
|
||||
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
|
||||
Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
|
||||
picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
|
||||
foreground and background).
|
||||
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
|
||||
Logits of predicted bounding boxes coordinates in the first stage.
|
||||
"""
|
||||
)
|
||||
class TestDetrModelOutput(ModelOutput):
|
||||
r"""
|
||||
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
|
||||
Initial reference points sent through the Transformer decoder.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
|
||||
Stacked intermediate hidden states (output of each layer of the decoder).
|
||||
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
|
||||
Stacked intermediate reference points (reference points of each layer of the decoder).
|
||||
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
|
||||
Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
|
||||
picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
|
||||
foreground and background).
|
||||
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
|
||||
Logits of predicted bounding boxes coordinates in the first stage.
|
||||
"""
|
||||
|
||||
init_reference_points: Optional[torch.FloatTensor] = None
|
||||
@ -635,7 +599,7 @@ class TestDetrMultiheadAttention(nn.Module):
|
||||
return attn_output, attn_weights_reshaped
|
||||
|
||||
|
||||
class TestDetrEncoderLayer(nn.Module):
|
||||
class TestDetrEncoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: TestDetrConfig):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
@ -724,7 +688,7 @@ class TestDetrEncoderLayer(nn.Module):
|
||||
return outputs
|
||||
|
||||
|
||||
class TestDetrDecoderLayer(nn.Module):
|
||||
class TestDetrDecoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: TestDetrConfig):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
@ -837,6 +801,7 @@ class TestDetrDecoderLayer(nn.Module):
|
||||
return outputs
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class TestDetrPreTrainedModel(PreTrainedModel):
|
||||
config_class = TestDetrConfig
|
||||
base_model_prefix = "model"
|
||||
@ -1001,29 +966,16 @@ class TestDetrEncoder(TestDetrPreTrainedModel):
|
||||
for i, encoder_layer in enumerate(self.layers):
|
||||
if output_hidden_states:
|
||||
encoder_states = encoder_states + (hidden_states,)
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
encoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
position_embeddings,
|
||||
reference_points,
|
||||
spatial_shapes,
|
||||
spatial_shapes_list,
|
||||
level_start_index,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
position_embeddings=position_embeddings,
|
||||
reference_points=reference_points,
|
||||
spatial_shapes=spatial_shapes,
|
||||
spatial_shapes_list=spatial_shapes_list,
|
||||
level_start_index=level_start_index,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
position_embeddings=position_embeddings,
|
||||
reference_points=reference_points,
|
||||
spatial_shapes=spatial_shapes,
|
||||
spatial_shapes_list=spatial_shapes_list,
|
||||
level_start_index=level_start_index,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
@ -1155,31 +1107,17 @@ class TestDetrDecoder(TestDetrPreTrainedModel):
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
decoder_layer.__call__,
|
||||
hidden_states,
|
||||
position_embeddings,
|
||||
reference_points_input,
|
||||
spatial_shapes,
|
||||
spatial_shapes_list,
|
||||
level_start_index,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
position_embeddings=position_embeddings,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
reference_points=reference_points_input,
|
||||
spatial_shapes=spatial_shapes,
|
||||
spatial_shapes_list=spatial_shapes_list,
|
||||
level_start_index=level_start_index,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
position_embeddings,
|
||||
reference_points_input,
|
||||
spatial_shapes,
|
||||
spatial_shapes_list,
|
||||
level_start_index,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask,
|
||||
output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
@ -1253,67 +1191,11 @@ def build_position_encoding(config):
|
||||
return position_embedding
|
||||
|
||||
|
||||
TEST_DETR_START_DOCSTRING = r"""
|
||||
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
||||
etc.)
|
||||
|
||||
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
||||
and behavior.
|
||||
|
||||
Parameters:
|
||||
config ([`TestDetrConfig`]):
|
||||
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
||||
load the weights associated with the model, only the configuration. Check out the
|
||||
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
||||
"""
|
||||
|
||||
TEST_DETR_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Pixel values. Padding will be ignored by default should you provide it.
|
||||
|
||||
Pixel values can be obtained using [`AutoImageProcessor`]. See [`TestDetrImageProcessor.__call__`]
|
||||
for details.
|
||||
|
||||
pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
|
||||
Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for pixels that are real (i.e. **not masked**),
|
||||
- 0 for pixels that are padding (i.e. **masked**).
|
||||
|
||||
[What are attention masks?](../glossary#attention-mask)
|
||||
|
||||
decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
|
||||
Not used by default. Can be used to mask object queries.
|
||||
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
|
||||
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
|
||||
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
|
||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
|
||||
can choose to directly pass a flattened representation of an image.
|
||||
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
|
||||
Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
|
||||
embedded representation.
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
|
||||
hidden-states without any specific head on top.
|
||||
""",
|
||||
TEST_DETR_START_DOCSTRING,
|
||||
"""
|
||||
)
|
||||
class TestDetrModel(TestDetrPreTrainedModel):
|
||||
def __init__(self, config: TestDetrConfig):
|
||||
@ -1486,8 +1368,7 @@ class TestDetrModel(TestDetrPreTrainedModel):
|
||||
object_query = self.enc_output_norm(self.enc_output(object_query))
|
||||
return object_query, output_proposals
|
||||
|
||||
@add_start_docstrings_to_model_forward(TEST_DETR_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(output_type=TestDetrModelOutput, config_class=_CONFIG_FOR_DOC)
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -1501,7 +1382,14 @@ class TestDetrModel(TestDetrPreTrainedModel):
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[tuple[torch.FloatTensor], TestDetrModelOutput]:
|
||||
r"""
|
||||
Returns:
|
||||
decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
|
||||
Not used by default. Can be used to mask object queries.
|
||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
|
||||
can choose to directly pass a flattened representation of an image.
|
||||
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
|
||||
Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
|
||||
embedded representation.
|
||||
|
||||
Examples:
|
||||
|
||||
|
@ -2,4 +2,5 @@ accelerate>=0.12.0
|
||||
torch>=1.5.0
|
||||
torchvision>=0.6.0
|
||||
datasets>=2.14.0
|
||||
evaluate
|
||||
evaluate
|
||||
scikit-learn
|
@ -129,7 +129,7 @@ To pre-train `"large-sized"` Wav2Vec2 model, *e.g.* [facebook/wav2vec2-large-lv6
|
||||
on [librispeech_asr](https://huggingface.co/datasets/librispeech_asr), the following command can be run:
|
||||
|
||||
```bash
|
||||
accelerate launch run_wav2vec2_pretraining_no_trainer.py \
|
||||
accelerate launch run_wav2vec2_pretraining_no_trainer.py \
|
||||
--dataset_name=librispeech_asr \
|
||||
--dataset_config_names clean clean other \
|
||||
--dataset_split_names train.100 train.360 train.500 \
|
||||
@ -141,7 +141,7 @@ accelerate launch run_wav2vec2_pretraining_no_trainer.py \
|
||||
--weight_decay=0.01 \
|
||||
--max_duration_in_seconds=20.0 \
|
||||
--min_duration_in_seconds=2.0 \
|
||||
--model_name_or_path=./
|
||||
--model_name_or_path=./ \
|
||||
--logging_steps=1 \
|
||||
--saving_steps=10000 \
|
||||
--per_device_train_batch_size=2 \
|
||||
|
@ -312,6 +312,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
|
||||
{self.examples_dir}/pytorch/image-classification/run_image_classification_no_trainer.py
|
||||
--model_name_or_path google/vit-base-patch16-224-in21k
|
||||
--dataset_name hf-internal-testing/cats_vs_dogs_sample
|
||||
--trust_remote_code
|
||||
--learning_rate 1e-4
|
||||
--per_device_train_batch_size 2
|
||||
--per_device_eval_batch_size 1
|
||||
|
@ -17,6 +17,7 @@ import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
from transformers import ViTMAEForPreTraining, Wav2Vec2ForPreTraining
|
||||
@ -390,6 +391,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--output_dir {tmp_dir}
|
||||
--model_name_or_path google/vit-base-patch16-224-in21k
|
||||
--dataset_name hf-internal-testing/cats_vs_dogs_sample
|
||||
--trust_remote_code
|
||||
--do_train
|
||||
--do_eval
|
||||
--learning_rate 1e-4
|
||||
@ -413,6 +415,7 @@ class ExamplesTests(TestCasePlus):
|
||||
result = get_results(tmp_dir)
|
||||
self.assertGreaterEqual(result["eval_accuracy"], 0.8)
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_run_speech_recognition_ctc(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
testargs = f"""
|
||||
@ -423,6 +426,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--dataset_config_name clean
|
||||
--train_split_name validation
|
||||
--eval_split_name validation
|
||||
--trust_remote_code
|
||||
--do_train
|
||||
--do_eval
|
||||
--learning_rate 1e-4
|
||||
@ -443,6 +447,7 @@ class ExamplesTests(TestCasePlus):
|
||||
result = get_results(tmp_dir)
|
||||
self.assertLess(result["eval_loss"], result["train_loss"])
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_run_speech_recognition_ctc_adapter(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
testargs = f"""
|
||||
@ -453,6 +458,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--dataset_config_name clean
|
||||
--train_split_name validation
|
||||
--eval_split_name validation
|
||||
--trust_remote_code
|
||||
--do_train
|
||||
--do_eval
|
||||
--learning_rate 1e-4
|
||||
@ -475,6 +481,7 @@ class ExamplesTests(TestCasePlus):
|
||||
self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "./adapter.tur.safetensors")))
|
||||
self.assertLess(result["eval_loss"], result["train_loss"])
|
||||
|
||||
@unittest.skip("temporary to avoid failing on circleci")
|
||||
def test_run_speech_recognition_seq2seq(self):
|
||||
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||
testargs = f"""
|
||||
@ -485,6 +492,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--dataset_config_name clean
|
||||
--train_split_name validation
|
||||
--eval_split_name validation
|
||||
--trust_remote_code
|
||||
--do_train
|
||||
--do_eval
|
||||
--learning_rate 1e-4
|
||||
@ -512,6 +520,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--output_dir {tmp_dir}
|
||||
--model_name_or_path hf-internal-testing/tiny-random-wav2vec2
|
||||
--dataset_name anton-l/superb_demo
|
||||
--trust_remote_code
|
||||
--dataset_config_name ks
|
||||
--train_split_name test
|
||||
--eval_split_name test
|
||||
@ -546,6 +555,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--dataset_name hf-internal-testing/librispeech_asr_dummy
|
||||
--dataset_config_names clean
|
||||
--dataset_split_names validation
|
||||
--trust_remote_code
|
||||
--learning_rate 1e-4
|
||||
--per_device_train_batch_size 4
|
||||
--per_device_eval_batch_size 4
|
||||
@ -566,6 +576,7 @@ class ExamplesTests(TestCasePlus):
|
||||
run_mae.py
|
||||
--output_dir {tmp_dir}
|
||||
--dataset_name hf-internal-testing/cats_vs_dogs_sample
|
||||
--trust_remote_code
|
||||
--do_train
|
||||
--do_eval
|
||||
--learning_rate 1e-4
|
||||
|
@ -315,6 +315,7 @@ class ExamplesTests(TestCasePlus):
|
||||
testargs = f"""
|
||||
run_image_classification.py
|
||||
--dataset_name hf-internal-testing/cats_vs_dogs_sample
|
||||
--trust_remote_code
|
||||
--model_name_or_path microsoft/resnet-18
|
||||
--do_train
|
||||
--do_eval
|
||||
|
2
setup.py
2
setup.py
@ -128,7 +128,7 @@ _deps = [
|
||||
# Keras pin - this is to make sure Keras 3 doesn't destroy us. Remove or change when we have proper support.
|
||||
"keras>2.9,<2.16",
|
||||
"keras-nlp>=0.3.1,<0.14.0", # keras-nlp 0.14 doesn't support keras 2, see pin on keras.
|
||||
"kernels>=0.4.4,<0.5",
|
||||
"kernels>=0.6.1,<0.7",
|
||||
"librosa",
|
||||
"natten>=0.14.6,<0.15.0",
|
||||
"nltk<=3.8.1",
|
||||
|
@ -396,7 +396,7 @@ def add_fast_image_processor_file(
|
||||
|
||||
content_header = get_fast_image_processing_content_header(content_base_file)
|
||||
content_base_file = (
|
||||
f"@auto_docstring(\n"
|
||||
f"@auto_docstring\n"
|
||||
f"class {fast_image_processor_name}(BaseImageProcessorFast):\n"
|
||||
" # This generated class can be used as a starting point for the fast image processor.\n"
|
||||
" # if the image processor is only used for simple augmentations, such as resizing, center cropping, rescaling, or normalizing,\n"
|
||||
|
@ -338,7 +338,7 @@ class PretrainedConfig(PushToHubMixin):
|
||||
|
||||
@output_attentions.setter
|
||||
def output_attentions(self, value):
|
||||
if self._attn_implementation != "eager":
|
||||
if value is True and self._attn_implementation != "eager":
|
||||
raise ValueError(
|
||||
"The `output_attentions` attribute is not supported when using the `attn_implementation` set to "
|
||||
f"{self._attn_implementation}. Please set it to 'eager' instead."
|
||||
|
@ -23,7 +23,7 @@ from tqdm import tqdm
|
||||
|
||||
from ...models.bert.tokenization_bert import whitespace_tokenize
|
||||
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
|
||||
from ...utils import is_tf_available, is_torch_available, logging
|
||||
from ...utils import is_tf_available, is_torch_available, is_torch_hpu_available, logging
|
||||
from .utils import DataProcessor
|
||||
|
||||
|
||||
@ -361,11 +361,29 @@ def squad_convert_examples_to_features(
|
||||
is_training=not evaluate,
|
||||
)
|
||||
```"""
|
||||
# Defining helper methods
|
||||
features = []
|
||||
|
||||
threads = min(threads, cpu_count())
|
||||
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
|
||||
if not is_torch_hpu_available():
|
||||
threads = min(threads, cpu_count())
|
||||
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
|
||||
annotate_ = partial(
|
||||
squad_convert_example_to_features,
|
||||
max_seq_length=max_seq_length,
|
||||
doc_stride=doc_stride,
|
||||
max_query_length=max_query_length,
|
||||
padding_strategy=padding_strategy,
|
||||
is_training=is_training,
|
||||
)
|
||||
features = list(
|
||||
tqdm(
|
||||
p.imap(annotate_, examples, chunksize=32),
|
||||
total=len(examples),
|
||||
desc="convert squad examples to features",
|
||||
disable=not tqdm_enabled,
|
||||
)
|
||||
)
|
||||
else:
|
||||
# Non-parallel version for hpu https://github.com/huggingface/transformers/pull/38790#discussion_r2156470902
|
||||
squad_convert_example_to_features_init(tokenizer_for_convert=tokenizer)
|
||||
annotate_ = partial(
|
||||
squad_convert_example_to_features,
|
||||
max_seq_length=max_seq_length,
|
||||
@ -376,7 +394,7 @@ def squad_convert_examples_to_features(
|
||||
)
|
||||
features = list(
|
||||
tqdm(
|
||||
p.imap(annotate_, examples, chunksize=32),
|
||||
map(annotate_, examples),
|
||||
total=len(examples),
|
||||
desc="convert squad examples to features",
|
||||
disable=not tqdm_enabled,
|
||||
|
@ -34,7 +34,7 @@ deps = {
|
||||
"kenlm": "kenlm",
|
||||
"keras": "keras>2.9,<2.16",
|
||||
"keras-nlp": "keras-nlp>=0.3.1,<0.14.0",
|
||||
"kernels": "kernels>=0.4.4,<0.5",
|
||||
"kernels": "kernels>=0.6.1,<0.7",
|
||||
"librosa": "librosa",
|
||||
"natten": "natten>=0.14.6,<0.15.0",
|
||||
"nltk": "nltk<=3.8.1",
|
||||
|
@ -402,10 +402,11 @@ def get_cached_module_file(
|
||||
if not (submodule_path / module_file).exists() or not filecmp.cmp(
|
||||
resolved_module_file, str(submodule_path / module_file)
|
||||
):
|
||||
(submodule_path / module_file).parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.copy(resolved_module_file, submodule_path / module_file)
|
||||
importlib.invalidate_caches()
|
||||
for module_needed in modules_needed:
|
||||
module_needed = f"{module_needed}.py"
|
||||
module_needed = Path(module_file).parent / f"{module_needed}.py"
|
||||
module_needed_file = os.path.join(pretrained_model_name_or_path, module_needed)
|
||||
if not (submodule_path / module_needed).exists() or not filecmp.cmp(
|
||||
module_needed_file, str(submodule_path / module_needed)
|
||||
|
@ -184,6 +184,7 @@ class DefaultFastImageProcessorKwargs(TypedDict, total=False):
|
||||
data_format: Optional[ChannelDimension]
|
||||
input_data_format: Optional[Union[str, ChannelDimension]]
|
||||
device: Optional["torch.device"]
|
||||
disable_grouping: Optional[bool]
|
||||
|
||||
|
||||
@auto_docstring
|
||||
@ -480,18 +481,35 @@ class BaseImageProcessorFast(BaseImageProcessor):
|
||||
) -> list["torch.Tensor"]:
|
||||
"""
|
||||
Prepare the input images for processing.
|
||||
|
||||
Args:
|
||||
images (`ImageInput`):
|
||||
The input images to process.
|
||||
do_convert_rgb (`bool`, *optional*):
|
||||
Whether to convert the images to RGB.
|
||||
input_data_format (`str` or `ChannelDimension`, *optional*):
|
||||
The input data format of the images.
|
||||
device (`torch.device`, *optional*):
|
||||
The device to put the processed images on.
|
||||
|
||||
Returns:
|
||||
List[`torch.Tensor`]: The processed images.
|
||||
"""
|
||||
|
||||
# Get structured images (potentially nested)
|
||||
images = self._prepare_images_structure(images)
|
||||
process_image_fn = partial(
|
||||
self._process_image,
|
||||
do_convert_rgb=do_convert_rgb,
|
||||
input_data_format=input_data_format,
|
||||
device=device,
|
||||
|
||||
process_image_partial = partial(
|
||||
self._process_image, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
|
||||
)
|
||||
# todo: yoni - check if we can parallelize this efficiently
|
||||
processed_images = []
|
||||
for image in images:
|
||||
processed_images.append(process_image_fn(image))
|
||||
|
||||
# Check if we have nested structure, assuming the nesting is consistent
|
||||
has_nested_structure = len(images) > 0 and isinstance(images[0], (list, tuple))
|
||||
|
||||
if has_nested_structure:
|
||||
processed_images = [[process_image_partial(img) for img in nested_list] for nested_list in images]
|
||||
else:
|
||||
processed_images = [process_image_partial(img) for img in images]
|
||||
|
||||
return processed_images
|
||||
|
||||
@ -621,11 +639,12 @@ class BaseImageProcessorFast(BaseImageProcessor):
|
||||
do_normalize: bool,
|
||||
image_mean: Optional[Union[float, list[float]]],
|
||||
image_std: Optional[Union[float, list[float]]],
|
||||
disable_grouping: Optional[bool],
|
||||
return_tensors: Optional[Union[str, TensorType]],
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
# Group images by size for batched resizing
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images)
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
|
||||
resized_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_resize:
|
||||
@ -635,7 +654,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
|
||||
|
||||
# Group images by size for further processing
|
||||
# Needed in case do_resize is False, or resize returns images with different sizes
|
||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images)
|
||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
|
||||
processed_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_center_crop:
|
||||
@ -656,47 +675,3 @@ class BaseImageProcessorFast(BaseImageProcessor):
|
||||
encoder_dict.pop("_valid_processor_keys", None)
|
||||
encoder_dict.pop("_valid_kwargs_names", None)
|
||||
return encoder_dict
|
||||
|
||||
|
||||
class SemanticSegmentationMixin:
|
||||
def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple]] = None):
|
||||
"""
|
||||
Converts the output of [`MobileNetV2ForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch.
|
||||
|
||||
Args:
|
||||
outputs ([`MobileNetV2ForSemanticSegmentation`]):
|
||||
Raw outputs of the model.
|
||||
target_sizes (`list[Tuple]` of length `batch_size`, *optional*):
|
||||
List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
|
||||
predictions will not be resized.
|
||||
|
||||
Returns:
|
||||
semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic
|
||||
segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
|
||||
specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
|
||||
"""
|
||||
logits = outputs.logits
|
||||
|
||||
# Resize logits and compute semantic segmentation maps
|
||||
if target_sizes is not None:
|
||||
if len(logits) != len(target_sizes):
|
||||
raise ValueError(
|
||||
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
|
||||
)
|
||||
|
||||
# if is_torch_tensor(target_sizes):
|
||||
# target_sizes = target_sizes.numpy()
|
||||
|
||||
semantic_segmentation = []
|
||||
|
||||
for idx in range(len(logits)):
|
||||
resized_logits = torch.nn.functional.interpolate(
|
||||
logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
|
||||
)
|
||||
semantic_map = resized_logits[0].argmax(dim=0)
|
||||
semantic_segmentation.append(semantic_map)
|
||||
else:
|
||||
semantic_segmentation = logits.argmax(dim=1)
|
||||
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
|
||||
|
||||
return semantic_segmentation
|
||||
|
@ -12,6 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from collections import defaultdict
|
||||
from collections.abc import Collection, Iterable
|
||||
from math import ceil
|
||||
from typing import Optional, Union
|
||||
@ -841,37 +842,128 @@ def _cast_tensor_to_float(x):
|
||||
return x.float()
|
||||
|
||||
|
||||
def _group_images_by_shape(nested_images, is_nested: bool = False):
|
||||
"""Helper function to flatten a single level of nested image structures and group by shape."""
|
||||
grouped_images = defaultdict(list)
|
||||
grouped_images_index = {}
|
||||
nested_images = [nested_images] if not is_nested else nested_images
|
||||
for i, sublist in enumerate(nested_images):
|
||||
for j, image in enumerate(sublist):
|
||||
key = (i, j) if is_nested else j
|
||||
shape = image.shape[1:]
|
||||
grouped_images[shape].append(image)
|
||||
grouped_images_index[key] = (shape, len(grouped_images[shape]) - 1)
|
||||
|
||||
return grouped_images, grouped_images_index
|
||||
|
||||
|
||||
def _reconstruct_nested_structure(indices, processed_images):
|
||||
"""Helper function to reconstruct a single level nested structure."""
|
||||
# Find the maximum outer index
|
||||
max_outer_idx = max(idx[0] for idx in indices.keys())
|
||||
|
||||
# Create the outer list
|
||||
result = [None] * (max_outer_idx + 1)
|
||||
|
||||
# Group indices by outer index
|
||||
nested_indices = defaultdict(list)
|
||||
for i, j in indices.keys():
|
||||
nested_indices[i].append(j)
|
||||
|
||||
for i in range(max_outer_idx + 1):
|
||||
if i in nested_indices:
|
||||
inner_max_idx = max(nested_indices[i])
|
||||
inner_list = [None] * (inner_max_idx + 1)
|
||||
for j in range(inner_max_idx + 1):
|
||||
if (i, j) in indices:
|
||||
shape, idx = indices[(i, j)]
|
||||
inner_list[j] = processed_images[shape][idx]
|
||||
result[i] = inner_list
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def group_images_by_shape(
|
||||
images: list["torch.Tensor"],
|
||||
) -> tuple[dict[tuple[int, int], list["torch.Tensor"]], dict[int, tuple[tuple[int, int], int]]]:
|
||||
images: Union[list["torch.Tensor"], "torch.Tensor"],
|
||||
disable_grouping: bool,
|
||||
is_nested: bool = False,
|
||||
) -> tuple[
|
||||
dict[tuple[int, int], list["torch.Tensor"]], dict[Union[int, tuple[int, int]], tuple[tuple[int, int], int]]
|
||||
]:
|
||||
"""
|
||||
Groups images by shape.
|
||||
Returns a dictionary with the shape as key and a list of images with that shape as value,
|
||||
and a dictionary with the index of the image in the original list as key and the shape and index in the grouped list as value.
|
||||
|
||||
The function supports both flat lists of tensors and nested structures.
|
||||
The input must be either all flat or all nested, not a mix of both.
|
||||
|
||||
Args:
|
||||
images (Union[list["torch.Tensor"], "torch.Tensor"]):
|
||||
A list of images or a single tensor
|
||||
disable_grouping (bool):
|
||||
Whether to disable grouping. If None, will be set to True if the images are on CPU, and False otherwise.
|
||||
This choice is based on empirical observations, as detailed here: https://github.com/huggingface/transformers/pull/38157
|
||||
is_nested (bool, *optional*, defaults to False):
|
||||
Whether the images are nested.
|
||||
|
||||
Returns:
|
||||
tuple[dict[tuple[int, int], list["torch.Tensor"]], dict[Union[int, tuple[int, int]], tuple[tuple[int, int], int]]]:
|
||||
- A dictionary with shape as key and list of images with that shape as value
|
||||
- A dictionary mapping original indices to (shape, index) tuples
|
||||
"""
|
||||
grouped_images = {}
|
||||
grouped_images_index = {}
|
||||
for i, image in enumerate(images):
|
||||
shape = image.shape[1:]
|
||||
if shape not in grouped_images:
|
||||
grouped_images[shape] = []
|
||||
grouped_images[shape].append(image)
|
||||
grouped_images_index[i] = (shape, len(grouped_images[shape]) - 1)
|
||||
# stack images with the same shape
|
||||
grouped_images = {shape: torch.stack(images, dim=0) for shape, images in grouped_images.items()}
|
||||
# If disable grouping is not explicitely provided, we favor disabling it if the images are on CPU, and enabling it otherwise.
|
||||
if disable_grouping is None:
|
||||
device = images[0][0].device if is_nested else images[0].device
|
||||
disable_grouping = device == "cpu"
|
||||
|
||||
if disable_grouping:
|
||||
if is_nested:
|
||||
return {(i, j): images[i][j].unsqueeze(0) for i in range(len(images)) for j in range(len(images[i]))}, {
|
||||
(i, j): ((i, j), 0) for i in range(len(images)) for j in range(len(images[i]))
|
||||
}
|
||||
else:
|
||||
return {i: images[i].unsqueeze(0) for i in range(len(images))}, {i: (i, 0) for i in range(len(images))}
|
||||
|
||||
# Handle single level nested structure
|
||||
grouped_images, grouped_images_index = _group_images_by_shape(images, is_nested)
|
||||
|
||||
# Stack images with the same shape
|
||||
grouped_images = {shape: torch.stack(images_list, dim=0) for shape, images_list in grouped_images.items()}
|
||||
|
||||
return grouped_images, grouped_images_index
|
||||
|
||||
|
||||
def reorder_images(
|
||||
processed_images: dict[tuple[int, int], "torch.Tensor"], grouped_images_index: dict[int, tuple[int, int]]
|
||||
) -> list["torch.Tensor"]:
|
||||
processed_images: dict[tuple[int, int], "torch.Tensor"],
|
||||
grouped_images_index: dict[Union[int, tuple[int, int]], tuple[tuple[int, int], int]],
|
||||
is_nested: bool = False,
|
||||
) -> Union[list["torch.Tensor"], "torch.Tensor"]:
|
||||
"""
|
||||
Reconstructs a list of images in the original order.
|
||||
Reconstructs images in the original order, preserving the original structure (nested or not).
|
||||
The input structure is either all flat or all nested.
|
||||
|
||||
Args:
|
||||
processed_images (dict[tuple[int, int], "torch.Tensor"]):
|
||||
Dictionary mapping shapes to batched processed images.
|
||||
grouped_images_index (dict[Union[int, tuple[int, int]], tuple[tuple[int, int], int]]):
|
||||
Dictionary mapping original indices to (shape, index) tuples.
|
||||
is_nested (bool, *optional*, defaults to False):
|
||||
Whether the images are nested. Cannot be infered from the input, as some processing functions outputs nested images.
|
||||
even with non nested images,e.g functions splitting images into patches. We thus can't deduce is_nested from the input.
|
||||
|
||||
|
||||
Returns:
|
||||
Union[list["torch.Tensor"], "torch.Tensor"]:
|
||||
Images in the original structure.
|
||||
"""
|
||||
return [
|
||||
processed_images[grouped_images_index[i][0]][grouped_images_index[i][1]]
|
||||
for i in range(len(grouped_images_index))
|
||||
]
|
||||
if not is_nested:
|
||||
return [
|
||||
processed_images[grouped_images_index[i][0]][grouped_images_index[i][1]]
|
||||
for i in range(len(grouped_images_index))
|
||||
]
|
||||
|
||||
return _reconstruct_nested_structure(grouped_images_index, processed_images)
|
||||
|
||||
|
||||
class NumpyToTensor:
|
||||
|
@ -13,8 +13,6 @@
|
||||
# limitations under the License.
|
||||
from typing import Union
|
||||
|
||||
from ..utils import is_torchdynamo_compiling
|
||||
|
||||
|
||||
try:
|
||||
from kernels import (
|
||||
@ -22,9 +20,7 @@ try:
|
||||
LayerRepository,
|
||||
register_kernel_mapping,
|
||||
replace_kernel_forward_from_hub,
|
||||
)
|
||||
from kernels import (
|
||||
use_kernel_forward_from_hub as original_use_kernel_forward_from_hub,
|
||||
use_kernel_forward_from_hub,
|
||||
)
|
||||
|
||||
_hub_kernels_available = True
|
||||
@ -45,9 +41,9 @@ try:
|
||||
},
|
||||
"RMSNorm": {
|
||||
"cuda": LayerRepository(
|
||||
repo_id="kernels-community/triton-layer-norm",
|
||||
layer_name="LlamaRMSNorm",
|
||||
revision="pure-layer-test",
|
||||
repo_id="kernels-community/liger_kernels",
|
||||
layer_name="LigerRMSNorm",
|
||||
# revision="pure-layer-test",
|
||||
)
|
||||
},
|
||||
"MLP": {
|
||||
@ -60,39 +56,6 @@ try:
|
||||
|
||||
register_kernel_mapping(_KERNEL_MAPPING)
|
||||
|
||||
def use_kernel_forward_from_hub(*args, **kwargs):
|
||||
"""
|
||||
Expands `kernels`' `use_kernel_forward_from_hub` to NOT use a kernel at compile time. This should be removed
|
||||
when `kernels` supports `torch.compile`.
|
||||
|
||||
If the layer has a `config` attribute, we can also set `config.disable_custom_kernels = True` to disable the
|
||||
kernel.
|
||||
"""
|
||||
|
||||
def decorator_with_compile_path(cls):
|
||||
# Keeps a reference to the original forward method
|
||||
original_forward = cls.forward
|
||||
|
||||
# Applies the original decorator
|
||||
decorator = original_use_kernel_forward_from_hub(*args, **kwargs)
|
||||
cls = decorator(cls)
|
||||
|
||||
# Replaces the kernel forward with a compile-friendly version
|
||||
kernel_forward = cls.forward
|
||||
|
||||
def forward_with_compile_path(*forward_args, **forward_kwargs):
|
||||
disable_custom_kernels = hasattr(cls, "config") and getattr(cls.config, "disable_custom_kernels", None)
|
||||
if is_torchdynamo_compiling() or disable_custom_kernels:
|
||||
return original_forward(*forward_args, **forward_kwargs)
|
||||
else:
|
||||
return kernel_forward(*forward_args, **forward_kwargs)
|
||||
|
||||
cls.forward = forward_with_compile_path
|
||||
|
||||
return cls
|
||||
|
||||
return decorator_with_compile_path
|
||||
|
||||
|
||||
except ImportError:
|
||||
# Stub to make decorators int transformers work when `kernels`
|
||||
|
@ -16,6 +16,11 @@ from functools import partial
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
from transformers.utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class GradientCheckpointingLayer(nn.Module):
|
||||
"""Base class for layers with gradient checkpointing.
|
||||
@ -44,5 +49,35 @@ class GradientCheckpointingLayer(nn.Module):
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
if self.gradient_checkpointing and self.training:
|
||||
do_warn = False
|
||||
layer_name = self.__class__.__name__
|
||||
message = f"Caching is incompatible with gradient checkpointing in {layer_name}. Setting"
|
||||
|
||||
if "use_cache" in kwargs and kwargs["use_cache"]:
|
||||
kwargs["use_cache"] = False
|
||||
message += " `use_cache=False`,"
|
||||
do_warn = True
|
||||
|
||||
# different names for the same thing in different layers
|
||||
if "past_key_value" in kwargs and kwargs["past_key_value"] is not None:
|
||||
kwargs["past_key_value"] = None
|
||||
message += " `past_key_value=None`,"
|
||||
do_warn = True
|
||||
|
||||
if "past_key_values" in kwargs and kwargs["past_key_values"] is not None:
|
||||
kwargs["past_key_values"] = None
|
||||
message += " `past_key_values=None`,"
|
||||
do_warn = True
|
||||
|
||||
if "layer_past" in kwargs and kwargs["layer_past"] is not None:
|
||||
kwargs["layer_past"] = None
|
||||
message += " `layer_past=None`,"
|
||||
do_warn = True
|
||||
|
||||
# warn if anything was changed
|
||||
if do_warn:
|
||||
message = message.rstrip(",") + "."
|
||||
logger.warning(message)
|
||||
|
||||
return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args)
|
||||
return super().__call__(*args, **kwargs)
|
||||
|
@ -172,7 +172,8 @@ _is_quantized = False
|
||||
_is_ds_init_called = False
|
||||
_torch_distributed_available = torch.distributed.is_available()
|
||||
|
||||
if _torch_distributed_available and is_torch_greater_or_equal("2.5"):
|
||||
_is_dtensor_available = _torch_distributed_available and is_torch_greater_or_equal("2.5")
|
||||
if _is_dtensor_available:
|
||||
from torch.distributed.tensor import DTensor
|
||||
|
||||
|
||||
@ -3780,7 +3781,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
|
||||
for shard_file, tensors in filename_to_tensors:
|
||||
shard = {}
|
||||
for tensor in tensors:
|
||||
if isinstance(state_dict[tensor], DTensor):
|
||||
if _is_dtensor_available and isinstance(state_dict[tensor], DTensor):
|
||||
full_tensor = state_dict[tensor].full_tensor()
|
||||
# to get the correctly ordered tensor we need to repack if packed
|
||||
if _get_parameter_tp_plan(tensor, self._tp_plan) in ("local_packed_rowwise",):
|
||||
@ -4280,6 +4281,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
|
||||
tp_size = kwargs.pop("tp_size", None)
|
||||
device_mesh = kwargs.pop("device_mesh", None)
|
||||
trust_remote_code = kwargs.pop("trust_remote_code", None)
|
||||
use_kernels = kwargs.pop("use_kernels", False)
|
||||
|
||||
key_mapping = kwargs.pop("key_mapping", None)
|
||||
# Load models with hardcoded key mapping on class for VLMs only, to keep BC and standardize model
|
||||
@ -4656,8 +4658,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
|
||||
# The _keep_in_fp32_modules flag is only used to avoid bf16 -> fp16 casting precision issues. It was introduced
|
||||
# in case of force loading a model that should stay bf16 in fp16 (which includes a few quantizers as this is a pre-processing
|
||||
# step for e.g. bitsandbytes). See https://github.com/huggingface/transformers/issues/20287 for details.
|
||||
# Update: to extend _keep_in_fp32_modules flag feature, it can also be used to force modules that should stay in fp32
|
||||
if model._keep_in_fp32_modules is not None and (
|
||||
torch_dtype == torch.float16 or getattr(hf_quantizer, "use_keep_in_fp32_modules", False)
|
||||
torch_dtype == torch.float16
|
||||
or torch_dtype == torch.bfloat16
|
||||
or getattr(hf_quantizer, "use_keep_in_fp32_modules", False)
|
||||
):
|
||||
# We need to match exact layers, so we add either `.` on each side, or start/end of string
|
||||
keep_in_fp32_regex = re.compile(
|
||||
@ -4732,6 +4737,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
|
||||
# Set model in evaluation mode to deactivate DropOut modules by default
|
||||
model.eval()
|
||||
|
||||
# check if using kernels
|
||||
if use_kernels:
|
||||
from kernels import Device, kernelize
|
||||
|
||||
kernelize(model, device=Device(type=model.device.type))
|
||||
|
||||
# If it is a model with generation capabilities, attempt to load generation files (generation config,
|
||||
# custom generate function)
|
||||
if model.can_generate() and generation_config is not None:
|
||||
|
@ -21,6 +21,7 @@ if TYPE_CHECKING:
|
||||
from .albert import *
|
||||
from .align import *
|
||||
from .altclip import *
|
||||
from .arcee import *
|
||||
from .aria import *
|
||||
from .audio_spectrogram_transformer import *
|
||||
from .auto import *
|
||||
@ -284,6 +285,7 @@ if TYPE_CHECKING:
|
||||
from .squeezebert import *
|
||||
from .stablelm import *
|
||||
from .starcoder2 import *
|
||||
from .stt import *
|
||||
from .superglue import *
|
||||
from .superpoint import *
|
||||
from .swiftformer import *
|
||||
|
@ -570,30 +570,21 @@ class AlbertPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AlbertForPreTrainingOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`AlbertForPreTraining`].
|
||||
|
||||
Args:
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
|
||||
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
|
||||
before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class AlbertForPreTrainingOutput(ModelOutput):
|
||||
r"""
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
|
||||
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
|
||||
before SoftMax).
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -14,12 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""ALIGN model configuration"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
|
@ -23,6 +23,7 @@ import torch.utils.checkpoint
|
||||
from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutputWithNoAttention,
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
@ -39,20 +40,15 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class AlignVisionModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
"""
|
||||
)
|
||||
class AlignVisionModelOutput(ModelOutput):
|
||||
r"""
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
"""
|
||||
|
||||
image_embeds: Optional[torch.FloatTensor] = None
|
||||
@ -61,26 +57,15 @@ class AlignVisionModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AlignTextModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for text model's outputs that also contains a pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The text embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class AlignTextModelOutput(ModelOutput):
|
||||
r"""
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The text embeddings obtained by applying the projection layer to the pooler_output.
|
||||
"""
|
||||
|
||||
text_embeds: Optional[torch.FloatTensor] = None
|
||||
@ -90,25 +75,25 @@ class AlignTextModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class AlignOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
|
||||
image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The output of [`AlignVisionModel`].
|
||||
text_model_output(`BaseModelOutputWithPoolingAndCrossAttentions`):
|
||||
The output of the [`AlignTextModel`].
|
||||
vision_model_output(`BaseModelOutputWithPoolingAndNoAttention`):
|
||||
The output of the [`AlignVisionModel`].
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The output of [`AlignVisionModel`].
|
||||
text_model_output (`BaseModelOutputWithPoolingAndCrossAttentions`):
|
||||
The output of the [`AlignTextModel`].
|
||||
vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
|
||||
The output of the [`AlignVisionModel`].
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -827,7 +812,7 @@ class AlignTextOutput(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->AlignText
|
||||
class AlignTextLayer(nn.Module):
|
||||
class AlignTextLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
||||
@ -953,27 +938,15 @@ class AlignTextEncoder(nn.Module):
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[i] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
|
@ -23,6 +23,7 @@ import torch.nn as nn
|
||||
import torch.utils.checkpoint
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutput,
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
@ -52,26 +53,26 @@ def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->AltCLIP
|
||||
class AltCLIPOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`AltCLIPTextModel`].
|
||||
vision_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`AltCLIPVisionModel`].
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`AltCLIPTextModel`].
|
||||
vision_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`AltCLIPVisionModel`].
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -418,7 +419,7 @@ class AltRobertaOutput(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->AltRoberta
|
||||
class AltRobertaLayer(nn.Module):
|
||||
class AltRobertaLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
||||
@ -544,27 +545,15 @@ class AltRobertaEncoder(nn.Module):
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[i] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
@ -732,7 +721,7 @@ class AltCLIPMLP(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class AltCLIPEncoderLayer(nn.Module):
|
||||
class AltCLIPEncoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: AltCLIPConfig):
|
||||
super().__init__()
|
||||
self.embed_dim = config.hidden_size
|
||||
@ -848,21 +837,12 @@ class AltCLIPEncoder(nn.Module):
|
||||
for idx, encoder_layer in enumerate(self.layers):
|
||||
if output_hidden_states:
|
||||
encoder_states = encoder_states + (hidden_states,)
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
encoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
|
27
src/transformers/models/arcee/__init__.py
Normal file
27
src/transformers/models/arcee/__init__.py
Normal file
@ -0,0 +1,27 @@
|
||||
# Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...utils import _LazyModule
|
||||
from ...utils.import_utils import define_import_structure
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_arcee import *
|
||||
from .modeling_arcee import *
|
||||
else:
|
||||
import sys
|
||||
|
||||
_file = globals()["__file__"]
|
||||
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
|
201
src/transformers/models/arcee/configuration_arcee.py
Normal file
201
src/transformers/models/arcee/configuration_arcee.py
Normal file
@ -0,0 +1,201 @@
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# This file was automatically generated from src/transformers/models/arcee/modular_arcee.py.
|
||||
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_arcee.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# coding=utf-8
|
||||
# Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...modeling_rope_utils import rope_config_validation
|
||||
|
||||
|
||||
class ArceeConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`ArceeModel`]. It is used to instantiate an Arcee
|
||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
||||
defaults will yield a similar configuration to that of the AFM-4.5B-Base.
|
||||
|
||||
Pre-trained weights are available at
|
||||
[arcee-ai/AFM-4.5B](https://huggingface.co/arcee-ai/AFM-4.5B)
|
||||
and were used to build the examples below.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 32000):
|
||||
Vocabulary size of the Arcee model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`ArceeModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 2560):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 18432):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||
`num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 4096):
|
||||
The maximum sequence length that this model might ever be used with. AFM-4.5B-Base supports up to 16384 tokens.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 128000):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 128001):
|
||||
End of stream token id.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie weight embeddings
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
||||
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
|
||||
accordingly.
|
||||
Expected contents:
|
||||
`rope_type` (`str`):
|
||||
The sub-variant of RoPE to use. Can be one of ['default', 'yarn'], with 'default' being the original RoPE implementation.
|
||||
`factor` (`float`, *optional*):
|
||||
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
|
||||
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
|
||||
original maximum pre-trained length.
|
||||
`original_max_position_embeddings` (`int`, *optional*):
|
||||
Used with 'yarn'. The original max position embeddings used during pretraining.
|
||||
`attention_factor` (`float`, *optional*):
|
||||
Used with 'yarn'. The scaling factor to be applied on the attention computation. If unspecified,
|
||||
it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
|
||||
`beta_fast` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 32.
|
||||
`beta_slow` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 1.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
mlp_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
|
||||
head_dim (`int`, *optional*):
|
||||
The attention head dimension. If None, it will default to hidden_size // num_attention_heads
|
||||
|
||||
```python
|
||||
>>> from transformers import ArceeModel, ArceeConfig
|
||||
|
||||
>>> # Initializing an Arcee AFM-4.5B-Base style configuration
|
||||
>>> configuration = ArceeConfig()
|
||||
|
||||
>>> # Initializing a model from the AFM-4.5B-Base style configuration
|
||||
>>> model = ArceeModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "arcee"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
base_model_tp_plan = {
|
||||
"layers.*.self_attn.q_proj": "colwise",
|
||||
"layers.*.self_attn.k_proj": "colwise",
|
||||
"layers.*.self_attn.v_proj": "colwise",
|
||||
"layers.*.self_attn.o_proj": "rowwise",
|
||||
"layers.*.mlp.up_proj": "colwise",
|
||||
"layers.*.mlp.down_proj": "rowwise",
|
||||
}
|
||||
base_model_pp_plan = {
|
||||
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
||||
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
||||
"norm": (["hidden_states"], ["hidden_states"]),
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=2560,
|
||||
intermediate_size=18432,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="relu2",
|
||||
max_position_embeddings=4096,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=128000,
|
||||
eos_token_id=128001,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
mlp_bias=False,
|
||||
head_dim=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
num_key_value_heads = num_attention_heads
|
||||
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.hidden_act = hidden_act
|
||||
self.initializer_range = initializer_range
|
||||
self.rms_norm_eps = rms_norm_eps
|
||||
self.use_cache = use_cache
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
self.attention_bias = attention_bias
|
||||
self.attention_dropout = attention_dropout
|
||||
self.mlp_bias = mlp_bias
|
||||
self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
|
||||
# Validate the correctness of rotary position embeddings parameters
|
||||
# BC: if there is a 'type' field, copy it it to 'rope_type'.
|
||||
if self.rope_scaling is not None and "type" in self.rope_scaling:
|
||||
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
|
||||
rope_config_validation(self)
|
||||
|
||||
|
||||
__all__ = ["ArceeConfig"]
|
811
src/transformers/models/arcee/modeling_arcee.py
Normal file
811
src/transformers/models/arcee/modeling_arcee.py
Normal file
@ -0,0 +1,811 @@
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# This file was automatically generated from src/transformers/models/arcee/modular_arcee.py.
|
||||
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_arcee.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# coding=utf-8
|
||||
# Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from transformers.utils import auto_docstring, logging
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache, DynamicCache
|
||||
from ...generation import GenerationMixin
|
||||
from ...integrations import use_kernel_forward_from_hub
|
||||
from ...masking_utils import create_causal_mask
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutputWithPast,
|
||||
CausalLMOutputWithPast,
|
||||
QuestionAnsweringModelOutput,
|
||||
SequenceClassifierOutputWithPast,
|
||||
TokenClassifierOutput,
|
||||
)
|
||||
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
from ...utils import LossKwargs, can_return_tuple
|
||||
from .configuration_arcee import ArceeConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class ArceeMLP(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.hidden_size = config.hidden_size
|
||||
self.intermediate_size = config.intermediate_size
|
||||
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
|
||||
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
|
||||
self.act_fn = ACT2FN[config.hidden_act]
|
||||
|
||||
def forward(self, x):
|
||||
return self.down_proj(self.act_fn(self.up_proj(x)))
|
||||
|
||||
|
||||
@use_kernel_forward_from_hub("RMSNorm")
|
||||
class ArceeRMSNorm(nn.Module):
|
||||
def __init__(self, hidden_size, eps=1e-6):
|
||||
"""
|
||||
ArceeRMSNorm is equivalent to T5LayerNorm
|
||||
"""
|
||||
super().__init__()
|
||||
self.weight = nn.Parameter(torch.ones(hidden_size))
|
||||
self.variance_epsilon = eps
|
||||
|
||||
def forward(self, hidden_states):
|
||||
input_dtype = hidden_states.dtype
|
||||
hidden_states = hidden_states.to(torch.float32)
|
||||
variance = hidden_states.pow(2).mean(-1, keepdim=True)
|
||||
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
|
||||
return self.weight * hidden_states.to(input_dtype)
|
||||
|
||||
def extra_repr(self):
|
||||
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
|
||||
|
||||
|
||||
class ArceeRotaryEmbedding(nn.Module):
|
||||
def __init__(self, config: ArceeConfig, device=None):
|
||||
super().__init__()
|
||||
# BC: "rope_type" was originally "type"
|
||||
if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
|
||||
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
|
||||
else:
|
||||
self.rope_type = "default"
|
||||
self.max_seq_len_cached = config.max_position_embeddings
|
||||
self.original_max_seq_len = config.max_position_embeddings
|
||||
|
||||
self.config = config
|
||||
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
|
||||
|
||||
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
|
||||
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
||||
self.original_inv_freq = self.inv_freq
|
||||
|
||||
@torch.no_grad()
|
||||
@dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
|
||||
def forward(self, x, position_ids):
|
||||
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
|
||||
position_ids_expanded = position_ids[:, None, :].float()
|
||||
|
||||
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
|
||||
with torch.autocast(device_type=device_type, enabled=False): # Force float32
|
||||
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
cos = emb.cos() * self.attention_scaling
|
||||
sin = emb.sin() * self.attention_scaling
|
||||
|
||||
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
||||
|
||||
|
||||
def rotate_half(x):
|
||||
"""Rotates half the hidden dims of the input."""
|
||||
x1 = x[..., : x.shape[-1] // 2]
|
||||
x2 = x[..., x.shape[-1] // 2 :]
|
||||
return torch.cat((-x2, x1), dim=-1)
|
||||
|
||||
|
||||
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
||||
"""Applies Rotary Position Embedding to the query and key tensors.
|
||||
|
||||
Args:
|
||||
q (`torch.Tensor`): The query tensor.
|
||||
k (`torch.Tensor`): The key tensor.
|
||||
cos (`torch.Tensor`): The cosine part of the rotary embedding.
|
||||
sin (`torch.Tensor`): The sine part of the rotary embedding.
|
||||
position_ids (`torch.Tensor`, *optional*):
|
||||
Deprecated and unused.
|
||||
unsqueeze_dim (`int`, *optional*, defaults to 1):
|
||||
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
|
||||
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
|
||||
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
|
||||
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
|
||||
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
|
||||
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
|
||||
Returns:
|
||||
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
|
||||
"""
|
||||
cos = cos.unsqueeze(unsqueeze_dim)
|
||||
sin = sin.unsqueeze(unsqueeze_dim)
|
||||
q_embed = (q * cos) + (rotate_half(q) * sin)
|
||||
k_embed = (k * cos) + (rotate_half(k) * sin)
|
||||
return q_embed, k_embed
|
||||
|
||||
|
||||
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
||||
"""
|
||||
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
|
||||
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
|
||||
"""
|
||||
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
|
||||
if n_rep == 1:
|
||||
return hidden_states
|
||||
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
|
||||
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
|
||||
|
||||
|
||||
def eager_attention_forward(
|
||||
module: nn.Module,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor],
|
||||
scaling: float,
|
||||
dropout: float = 0.0,
|
||||
**kwargs,
|
||||
):
|
||||
key_states = repeat_kv(key, module.num_key_value_groups)
|
||||
value_states = repeat_kv(value, module.num_key_value_groups)
|
||||
|
||||
attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
|
||||
if attention_mask is not None:
|
||||
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
|
||||
attn_weights = attn_weights + causal_mask
|
||||
|
||||
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
|
||||
attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
|
||||
attn_output = torch.matmul(attn_weights, value_states)
|
||||
attn_output = attn_output.transpose(1, 2).contiguous()
|
||||
|
||||
return attn_output, attn_weights
|
||||
|
||||
|
||||
class ArceeAttention(nn.Module):
|
||||
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
||||
|
||||
def __init__(self, config: ArceeConfig, layer_idx: int):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.layer_idx = layer_idx
|
||||
self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
|
||||
self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.attention_dropout = config.attention_dropout
|
||||
self.is_causal = True
|
||||
|
||||
self.q_proj = nn.Linear(
|
||||
config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
|
||||
)
|
||||
self.k_proj = nn.Linear(
|
||||
config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
|
||||
)
|
||||
self.v_proj = nn.Linear(
|
||||
config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
|
||||
)
|
||||
self.o_proj = nn.Linear(
|
||||
config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
position_embeddings: tuple[torch.Tensor, torch.Tensor],
|
||||
attention_mask: Optional[torch.Tensor],
|
||||
past_key_value: Optional[Cache] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
|
||||
input_shape = hidden_states.shape[:-1]
|
||||
hidden_shape = (*input_shape, -1, self.head_dim)
|
||||
|
||||
query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
|
||||
key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
|
||||
value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
|
||||
|
||||
cos, sin = position_embeddings
|
||||
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
|
||||
|
||||
if past_key_value is not None:
|
||||
# sin and cos are specific to RoPE models; cache_position needed for the static cache
|
||||
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
||||
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
||||
|
||||
attention_interface: Callable = eager_attention_forward
|
||||
if self.config._attn_implementation != "eager":
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
|
||||
attn_output, attn_weights = attention_interface(
|
||||
self,
|
||||
query_states,
|
||||
key_states,
|
||||
value_states,
|
||||
attention_mask,
|
||||
dropout=0.0 if not self.training else self.attention_dropout,
|
||||
scaling=self.scaling,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
attn_output = attn_output.reshape(*input_shape, -1).contiguous()
|
||||
attn_output = self.o_proj(attn_output)
|
||||
return attn_output, attn_weights
|
||||
|
||||
|
||||
class ArceeDecoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: ArceeConfig, layer_idx: int):
|
||||
super().__init__()
|
||||
self.hidden_size = config.hidden_size
|
||||
|
||||
self.self_attn = ArceeAttention(config=config, layer_idx=layer_idx)
|
||||
|
||||
self.mlp = ArceeMLP(config)
|
||||
self.input_layernorm = ArceeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.post_attention_layernorm = ArceeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_value: Optional[Cache] = None,
|
||||
output_attentions: Optional[bool] = False,
|
||||
use_cache: Optional[bool] = False,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
||||
**kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
||||
residual = hidden_states
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
|
||||
# Self Attention
|
||||
hidden_states, self_attn_weights = self.self_attn(
|
||||
hidden_states=hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
position_embeddings=position_embeddings,
|
||||
**kwargs,
|
||||
)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
# Fully Connected
|
||||
residual = hidden_states
|
||||
hidden_states = self.post_attention_layernorm(hidden_states)
|
||||
hidden_states = self.mlp(hidden_states)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
outputs = (hidden_states,)
|
||||
if output_attentions:
|
||||
outputs += (self_attn_weights,)
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class ArceePreTrainedModel(PreTrainedModel):
|
||||
config_class = ArceeConfig
|
||||
base_model_prefix = "model"
|
||||
supports_gradient_checkpointing = True
|
||||
_no_split_modules = ["ArceeDecoderLayer"]
|
||||
_skip_keys_device_placement = ["past_key_values"]
|
||||
_supports_flash_attn_2 = True
|
||||
_supports_sdpa = True
|
||||
_supports_flex_attn = True
|
||||
_supports_cache_class = True
|
||||
_supports_quantized_cache = True
|
||||
_supports_static_cache = True
|
||||
_supports_attention_backend = True
|
||||
|
||||
def _init_weights(self, module):
|
||||
std = self.config.initializer_range
|
||||
if isinstance(module, nn.Linear):
|
||||
module.weight.data.normal_(mean=0.0, std=std)
|
||||
if module.bias is not None:
|
||||
module.bias.data.zero_()
|
||||
elif isinstance(module, nn.Embedding):
|
||||
module.weight.data.normal_(mean=0.0, std=std)
|
||||
if module.padding_idx is not None:
|
||||
module.weight.data[module.padding_idx].zero_()
|
||||
elif isinstance(module, ArceeRMSNorm):
|
||||
module.weight.data.fill_(1.0)
|
||||
|
||||
|
||||
@auto_docstring
|
||||
class ArceeModel(ArceePreTrainedModel):
|
||||
def __init__(self, config: ArceeConfig):
|
||||
super().__init__(config)
|
||||
self.padding_idx = config.pad_token_id
|
||||
self.vocab_size = config.vocab_size
|
||||
|
||||
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
|
||||
self.layers = nn.ModuleList(
|
||||
[ArceeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
|
||||
)
|
||||
self.norm = ArceeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
self.rotary_emb = ArceeRotaryEmbedding(config=config)
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.embed_tokens
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embed_tokens = value
|
||||
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**flash_attn_kwargs: Unpack[FlashAttentionKwargs],
|
||||
) -> BaseModelOutputWithPast:
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
|
||||
if (input_ids is None) ^ (inputs_embeds is not None):
|
||||
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
||||
|
||||
if self.gradient_checkpointing and self.training and use_cache:
|
||||
logger.warning_once(
|
||||
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
|
||||
)
|
||||
use_cache = False
|
||||
|
||||
# TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
|
||||
if not isinstance(past_key_values, (type(None), Cache)):
|
||||
raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
|
||||
|
||||
if inputs_embeds is None:
|
||||
inputs_embeds = self.embed_tokens(input_ids)
|
||||
|
||||
if use_cache and past_key_values is None:
|
||||
past_key_values = DynamicCache()
|
||||
|
||||
if cache_position is None:
|
||||
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
||||
cache_position = torch.arange(
|
||||
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
|
||||
)
|
||||
|
||||
if position_ids is None:
|
||||
position_ids = cache_position.unsqueeze(0)
|
||||
|
||||
causal_mask = create_causal_mask(
|
||||
config=self.config,
|
||||
input_embeds=inputs_embeds,
|
||||
attention_mask=attention_mask,
|
||||
cache_position=cache_position,
|
||||
past_key_values=past_key_values,
|
||||
)
|
||||
|
||||
hidden_states = inputs_embeds
|
||||
|
||||
# create position embeddings to be shared across the decoder layers
|
||||
position_embeddings = self.rotary_emb(hidden_states, position_ids)
|
||||
|
||||
# decoder layers
|
||||
all_hidden_states = () if output_hidden_states else None
|
||||
all_self_attns = () if output_attentions else None
|
||||
|
||||
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=causal_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
position_embeddings=position_embeddings,
|
||||
**flash_attn_kwargs,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if output_attentions:
|
||||
all_self_attns += (layer_outputs[1],)
|
||||
|
||||
hidden_states = self.norm(hidden_states)
|
||||
|
||||
# add hidden states from the last decoder layer
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
|
||||
return BaseModelOutputWithPast(
|
||||
last_hidden_state=hidden_states,
|
||||
past_key_values=past_key_values if use_cache else None,
|
||||
hidden_states=all_hidden_states,
|
||||
attentions=all_self_attns,
|
||||
)
|
||||
|
||||
|
||||
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
|
||||
|
||||
|
||||
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
|
||||
class ArceeForCausalLM(ArceePreTrainedModel, GenerationMixin):
|
||||
_tied_weights_keys = ["lm_head.weight"]
|
||||
_tp_plan = {"lm_head": "colwise_rep"}
|
||||
_pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.model = ArceeModel(config)
|
||||
self.vocab_size = config.vocab_size
|
||||
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.model.embed_tokens
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.model.embed_tokens = value
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.lm_head
|
||||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
logits_to_keep: Union[int, torch.Tensor] = 0,
|
||||
**kwargs: Unpack[KwargsForCausalLM],
|
||||
) -> CausalLMOutputWithPast:
|
||||
r"""
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
||||
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import AutoTokenizer, ArceeForCausalLM
|
||||
|
||||
>>> model = ArceeForCausalLM.from_pretrained("meta-arcee/Arcee-2-7b-hf")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("meta-arcee/Arcee-2-7b-hf")
|
||||
|
||||
>>> prompt = "Hey, are you conscious? Can you talk to me?"
|
||||
>>> inputs = tokenizer(prompt, return_tensors="pt")
|
||||
|
||||
>>> # Generate
|
||||
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
|
||||
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
||||
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
|
||||
```"""
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
|
||||
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
||||
outputs: BaseModelOutputWithPast = self.model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hidden_states = outputs.last_hidden_state
|
||||
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
||||
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
|
||||
logits = self.lm_head(hidden_states[:, slice_indices, :])
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
|
||||
|
||||
return CausalLMOutputWithPast(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
past_key_values=outputs.past_key_values,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
|
||||
class ArceeForSequenceClassification(ArceePreTrainedModel):
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
self.model = ArceeModel(config)
|
||||
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.model.embed_tokens
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.model.embed_tokens = value
|
||||
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
) -> SequenceClassifierOutputWithPast:
|
||||
r"""
|
||||
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
|
||||
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
|
||||
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
|
||||
transformer_outputs: BaseModelOutputWithPast = self.model(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
)
|
||||
hidden_states = transformer_outputs.last_hidden_state
|
||||
logits = self.score(hidden_states)
|
||||
|
||||
if input_ids is not None:
|
||||
batch_size = input_ids.shape[0]
|
||||
else:
|
||||
batch_size = inputs_embeds.shape[0]
|
||||
|
||||
if self.config.pad_token_id is None and batch_size != 1:
|
||||
raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
|
||||
if self.config.pad_token_id is None:
|
||||
last_non_pad_token = -1
|
||||
elif input_ids is not None:
|
||||
# To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
|
||||
non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
|
||||
token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
|
||||
last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
|
||||
else:
|
||||
last_non_pad_token = -1
|
||||
logger.warning_once(
|
||||
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
|
||||
"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
|
||||
)
|
||||
|
||||
pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
|
||||
|
||||
return SequenceClassifierOutputWithPast(
|
||||
loss=loss,
|
||||
logits=pooled_logits,
|
||||
past_key_values=transformer_outputs.past_key_values,
|
||||
hidden_states=transformer_outputs.hidden_states,
|
||||
attentions=transformer_outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
|
||||
class ArceeForQuestionAnswering(ArceePreTrainedModel):
|
||||
base_model_prefix = "transformer"
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.transformer = ArceeModel(config)
|
||||
self.qa_outputs = nn.Linear(config.hidden_size, 2)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.transformer.embed_tokens
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.transformer.embed_tokens = value
|
||||
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
start_positions: Optional[torch.LongTensor] = None,
|
||||
end_positions: Optional[torch.LongTensor] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
**kwargs,
|
||||
) -> QuestionAnsweringModelOutput:
|
||||
outputs: BaseModelOutputWithPast = self.transformer(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
)
|
||||
|
||||
sequence_output = outputs.last_hidden_state
|
||||
|
||||
logits = self.qa_outputs(sequence_output)
|
||||
start_logits, end_logits = logits.split(1, dim=-1)
|
||||
start_logits = start_logits.squeeze(-1).contiguous()
|
||||
end_logits = end_logits.squeeze(-1).contiguous()
|
||||
|
||||
loss = None
|
||||
if start_positions is not None and end_positions is not None:
|
||||
loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)
|
||||
|
||||
return QuestionAnsweringModelOutput(
|
||||
loss=loss,
|
||||
start_logits=start_logits,
|
||||
end_logits=end_logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
|
||||
class ArceeForTokenClassification(ArceePreTrainedModel):
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
self.model = ArceeModel(config)
|
||||
if getattr(config, "classifier_dropout", None) is not None:
|
||||
classifier_dropout = config.classifier_dropout
|
||||
elif getattr(config, "hidden_dropout", None) is not None:
|
||||
classifier_dropout = config.hidden_dropout
|
||||
else:
|
||||
classifier_dropout = 0.1
|
||||
self.dropout = nn.Dropout(classifier_dropout)
|
||||
self.score = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.model.embed_tokens
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.model.embed_tokens = value
|
||||
|
||||
@can_return_tuple
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Cache] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
) -> TokenClassifierOutput:
|
||||
r"""
|
||||
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
|
||||
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
|
||||
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
|
||||
outputs: BaseModelOutputWithPast = self.model(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
)
|
||||
sequence_output = outputs.last_hidden_state
|
||||
sequence_output = self.dropout(sequence_output)
|
||||
logits = self.score(sequence_output)
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss_function(logits, labels, self.config)
|
||||
|
||||
return TokenClassifierOutput(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"ArceeForCausalLM",
|
||||
"ArceeForQuestionAnswering",
|
||||
"ArceeForSequenceClassification",
|
||||
"ArceeForTokenClassification",
|
||||
"ArceeModel",
|
||||
"ArceePreTrainedModel",
|
||||
]
|
225
src/transformers/models/arcee/modular_arcee.py
Normal file
225
src/transformers/models/arcee/modular_arcee.py
Normal file
@ -0,0 +1,225 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""PyTorch Arcee model."""
|
||||
|
||||
from transformers.utils import auto_docstring, logging
|
||||
|
||||
from ..llama.configuration_llama import LlamaConfig
|
||||
from ..llama.modeling_llama import (
|
||||
LlamaForCausalLM,
|
||||
LlamaForQuestionAnswering,
|
||||
LlamaForSequenceClassification,
|
||||
LlamaForTokenClassification,
|
||||
)
|
||||
from ..nemotron.modeling_nemotron import NemotronMLP
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class ArceeConfig(LlamaConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`ArceeModel`]. It is used to instantiate an Arcee
|
||||
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
||||
defaults will yield a similar configuration to that of the AFM-4.5B-Base.
|
||||
|
||||
Pre-trained weights are available at
|
||||
[arcee-ai/AFM-4.5B](https://huggingface.co/arcee-ai/AFM-4.5B)
|
||||
and were used to build the examples below.
|
||||
|
||||
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||
documentation from [`PretrainedConfig`] for more information.
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 32000):
|
||||
Vocabulary size of the Arcee model. Defines the number of different tokens that can be represented by the
|
||||
`inputs_ids` passed when calling [`ArceeModel`]
|
||||
hidden_size (`int`, *optional*, defaults to 2560):
|
||||
Dimension of the hidden representations.
|
||||
intermediate_size (`int`, *optional*, defaults to 18432):
|
||||
Dimension of the MLP representations.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 32):
|
||||
Number of hidden layers in the Transformer decoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 32):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
num_key_value_heads (`int`, *optional*):
|
||||
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||
by meanpooling all the original heads within that group. For more details checkout [this
|
||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||
`num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
|
||||
The non-linear activation function (function or string) in the decoder.
|
||||
max_position_embeddings (`int`, *optional*, defaults to 4096):
|
||||
The maximum sequence length that this model might ever be used with. AFM-4.5B-Base supports up to 16384 tokens.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the rms normalization layers.
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
pad_token_id (`int`, *optional*):
|
||||
Padding token id.
|
||||
bos_token_id (`int`, *optional*, defaults to 128000):
|
||||
Beginning of stream token id.
|
||||
eos_token_id (`int`, *optional*, defaults to 128001):
|
||||
End of stream token id.
|
||||
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to tie weight embeddings
|
||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||
The base period of the RoPE embeddings.
|
||||
rope_scaling (`Dict`, *optional*):
|
||||
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
|
||||
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
|
||||
accordingly.
|
||||
Expected contents:
|
||||
`rope_type` (`str`):
|
||||
The sub-variant of RoPE to use. Can be one of ['default', 'yarn'], with 'default' being the original RoPE implementation.
|
||||
`factor` (`float`, *optional*):
|
||||
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
|
||||
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
|
||||
original maximum pre-trained length.
|
||||
`original_max_position_embeddings` (`int`, *optional*):
|
||||
Used with 'yarn'. The original max position embeddings used during pretraining.
|
||||
`attention_factor` (`float`, *optional*):
|
||||
Used with 'yarn'. The scaling factor to be applied on the attention computation. If unspecified,
|
||||
it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
|
||||
`beta_fast` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 32.
|
||||
`beta_slow` (`float`, *optional*):
|
||||
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
|
||||
ramp function. If unspecified, it defaults to 1.
|
||||
attention_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
mlp_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
|
||||
head_dim (`int`, *optional*):
|
||||
The attention head dimension. If None, it will default to hidden_size // num_attention_heads
|
||||
|
||||
```python
|
||||
>>> from transformers import ArceeModel, ArceeConfig
|
||||
|
||||
>>> # Initializing an Arcee AFM-4.5B-Base style configuration
|
||||
>>> configuration = ArceeConfig()
|
||||
|
||||
>>> # Initializing a model from the AFM-4.5B-Base style configuration
|
||||
>>> model = ArceeModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
|
||||
model_type = "arcee"
|
||||
base_model_tp_plan = {
|
||||
"layers.*.self_attn.q_proj": "colwise",
|
||||
"layers.*.self_attn.k_proj": "colwise",
|
||||
"layers.*.self_attn.v_proj": "colwise",
|
||||
"layers.*.self_attn.o_proj": "rowwise",
|
||||
"layers.*.mlp.up_proj": "colwise",
|
||||
"layers.*.mlp.down_proj": "rowwise",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=32000,
|
||||
hidden_size=2560,
|
||||
intermediate_size=18432,
|
||||
num_hidden_layers=32,
|
||||
num_attention_heads=32,
|
||||
num_key_value_heads=None,
|
||||
hidden_act="relu2",
|
||||
max_position_embeddings=4096,
|
||||
initializer_range=0.02,
|
||||
rms_norm_eps=1e-5,
|
||||
use_cache=True,
|
||||
pad_token_id=None,
|
||||
bos_token_id=128000,
|
||||
eos_token_id=128001,
|
||||
tie_word_embeddings=False,
|
||||
rope_theta=10000.0,
|
||||
rope_scaling=None,
|
||||
attention_bias=False,
|
||||
attention_dropout=0.0,
|
||||
mlp_bias=False,
|
||||
head_dim=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
vocab_size=vocab_size,
|
||||
hidden_size=hidden_size,
|
||||
intermediate_size=intermediate_size,
|
||||
num_hidden_layers=num_hidden_layers,
|
||||
num_attention_heads=num_attention_heads,
|
||||
num_key_value_heads=num_key_value_heads,
|
||||
hidden_act=hidden_act,
|
||||
max_position_embeddings=max_position_embeddings,
|
||||
initializer_range=initializer_range,
|
||||
rms_norm_eps=rms_norm_eps,
|
||||
use_cache=use_cache,
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
tie_word_embeddings=tie_word_embeddings,
|
||||
rope_theta=rope_theta,
|
||||
rope_scaling=rope_scaling,
|
||||
attention_bias=attention_bias,
|
||||
attention_dropout=attention_dropout,
|
||||
mlp_bias=mlp_bias,
|
||||
head_dim=head_dim,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
del self.pretraining_tp
|
||||
|
||||
|
||||
class ArceeMLP(NemotronMLP):
|
||||
pass
|
||||
|
||||
|
||||
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
|
||||
class ArceeForCausalLM(LlamaForCausalLM):
|
||||
pass
|
||||
|
||||
|
||||
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
|
||||
class ArceeForSequenceClassification(LlamaForSequenceClassification):
|
||||
pass
|
||||
|
||||
|
||||
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
|
||||
class ArceeForQuestionAnswering(LlamaForQuestionAnswering):
|
||||
pass
|
||||
|
||||
|
||||
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
|
||||
class ArceeForTokenClassification(LlamaForTokenClassification):
|
||||
pass
|
||||
|
||||
|
||||
__all__ = [
|
||||
"ArceeConfig",
|
||||
"ArceeForCausalLM",
|
||||
"ArceeForQuestionAnswering",
|
||||
"ArceeForSequenceClassification",
|
||||
"ArceeForTokenClassification",
|
||||
"ArceeModel", # noqa: F822
|
||||
"ArceePreTrainedModel", # noqa: F822
|
||||
]
|
@ -963,35 +963,26 @@ class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AriaCausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Aria causal language model (or autoregressive) outputs.
|
||||
"""
|
||||
)
|
||||
class AriaCausalLMOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -1003,33 +994,22 @@ class AriaCausalLMOutputWithPast(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AriaModelOutputWithPast(BaseModelOutputWithPast):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for Aria outputs, with hidden states and attentions.
|
||||
"""
|
||||
)
|
||||
class AriaModelOutputWithPast(BaseModelOutputWithPast):
|
||||
r"""
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
"""
|
||||
|
||||
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||
@ -1056,6 +1036,12 @@ class AriaModel(AriaPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -1220,10 +1206,10 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin):
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
|
@ -206,7 +206,7 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo
|
||||
|
||||
if "speech-commands" in model_name:
|
||||
# TODO: Convert dataset to Parquet
|
||||
dataset = load_dataset("google/speech_commands", "v0.02", split="validation")
|
||||
dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True)
|
||||
waveform = dataset[0]["audio"]["array"]
|
||||
else:
|
||||
filepath = hf_hub_download(
|
||||
|
@ -22,6 +22,7 @@ from torch import nn
|
||||
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, SequenceClassifierOutput
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
|
||||
@ -282,7 +283,7 @@ class ASTOutput(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->AST,VIT->AST
|
||||
class ASTLayer(nn.Module):
|
||||
class ASTLayer(GradientCheckpointingLayer):
|
||||
"""This corresponds to the Block class in the timm implementation."""
|
||||
|
||||
def __init__(self, config: ASTConfig) -> None:
|
||||
@ -349,16 +350,7 @@ class ASTEncoder(nn.Module):
|
||||
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
layer_head_mask,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
|
||||
|
||||
layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if output_attentions:
|
||||
|
@ -39,6 +39,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("albert", "AlbertConfig"),
|
||||
("align", "AlignConfig"),
|
||||
("altclip", "AltCLIPConfig"),
|
||||
("arcee", "ArceeConfig"),
|
||||
("aria", "AriaConfig"),
|
||||
("aria_text", "AriaTextConfig"),
|
||||
("audio-spectrogram-transformer", "ASTConfig"),
|
||||
@ -321,6 +322,7 @@ CONFIG_MAPPING_NAMES = OrderedDict[str, str](
|
||||
("squeezebert", "SqueezeBertConfig"),
|
||||
("stablelm", "StableLmConfig"),
|
||||
("starcoder2", "Starcoder2Config"),
|
||||
("stt", "KyutaiSpeechToTextConfig"),
|
||||
("superglue", "SuperGlueConfig"),
|
||||
("superpoint", "SuperPointConfig"),
|
||||
("swiftformer", "SwiftFormerConfig"),
|
||||
@ -395,6 +397,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("albert", "ALBERT"),
|
||||
("align", "ALIGN"),
|
||||
("altclip", "AltCLIP"),
|
||||
("arcee", "Arcee"),
|
||||
("aria", "Aria"),
|
||||
("aria_text", "AriaText"),
|
||||
("audio-spectrogram-transformer", "Audio Spectrogram Transformer"),
|
||||
@ -705,6 +708,7 @@ MODEL_NAMES_MAPPING = OrderedDict[str, str](
|
||||
("squeezebert", "SqueezeBERT"),
|
||||
("stablelm", "StableLm"),
|
||||
("starcoder2", "Starcoder2"),
|
||||
("stt", "KyutaiSpeechToText"),
|
||||
("superglue", "SuperGlue"),
|
||||
("superpoint", "SuperPoint"),
|
||||
("swiftformer", "SwiftFormer"),
|
||||
|
@ -91,6 +91,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
|
||||
("sew-d", "Wav2Vec2FeatureExtractor"),
|
||||
("speech_to_text", "Speech2TextFeatureExtractor"),
|
||||
("speecht5", "SpeechT5FeatureExtractor"),
|
||||
("stt", "KyutaiSpeechToTextFeatureExtractor"),
|
||||
("swiftformer", "ViTFeatureExtractor"),
|
||||
("swin", "ViTFeatureExtractor"),
|
||||
("swinv2", "ViTFeatureExtractor"),
|
||||
|
@ -95,8 +95,8 @@ else:
|
||||
("groupvit", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
|
||||
("hiera", ("BitImageProcessor", "BitImageProcessorFast")),
|
||||
("idefics", ("IdeficsImageProcessor",)),
|
||||
("idefics2", ("Idefics2ImageProcessor",)),
|
||||
("idefics3", ("Idefics3ImageProcessor",)),
|
||||
("idefics2", ("Idefics2ImageProcessor", "Idefics2ImageProcessorFast")),
|
||||
("idefics3", ("Idefics3ImageProcessor", "Idefics3ImageProcessorFast")),
|
||||
("ijepa", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||
("imagegpt", ("ImageGPTImageProcessor",)),
|
||||
("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")),
|
||||
@ -148,6 +148,7 @@ else:
|
||||
("shieldgemma2", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
|
||||
("siglip", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
|
||||
("siglip2", ("Siglip2ImageProcessor", "Siglip2ImageProcessorFast")),
|
||||
("smolvlm", ("SmolVLMImageProcessor", "SmolVLMImageProcessorFast")),
|
||||
("superglue", ("SuperGlueImageProcessor",)),
|
||||
("swiftformer", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||
("swin", ("ViTImageProcessor", "ViTImageProcessorFast")),
|
||||
|
@ -35,6 +35,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("albert", "AlbertModel"),
|
||||
("align", "AlignModel"),
|
||||
("altclip", "AltCLIPModel"),
|
||||
("arcee", "ArceeModel"),
|
||||
("aria", "AriaModel"),
|
||||
("aria_text", "AriaTextModel"),
|
||||
("audio-spectrogram-transformer", "ASTModel"),
|
||||
@ -299,6 +300,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
||||
("squeezebert", "SqueezeBertModel"),
|
||||
("stablelm", "StableLmModel"),
|
||||
("starcoder2", "Starcoder2Model"),
|
||||
("stt", "KyutaiSpeechToTextModel"),
|
||||
("superglue", "SuperGlueForKeypointMatching"),
|
||||
("swiftformer", "SwiftFormerModel"),
|
||||
("swin", "SwinModel"),
|
||||
@ -536,6 +538,7 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
|
||||
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
# Model for Causal LM mapping
|
||||
("arcee", "ArceeForCausalLM"),
|
||||
("aria_text", "AriaTextForCausalLM"),
|
||||
("bamba", "BambaForCausalLM"),
|
||||
("bart", "BartForCausalLM"),
|
||||
@ -1053,6 +1056,7 @@ MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict(
|
||||
("speech-encoder-decoder", "SpeechEncoderDecoderModel"),
|
||||
("speech_to_text", "Speech2TextForConditionalGeneration"),
|
||||
("speecht5", "SpeechT5ForSpeechToText"),
|
||||
("stt", "KyutaiSpeechToTextForConditionalGeneration"),
|
||||
("whisper", "WhisperForConditionalGeneration"),
|
||||
]
|
||||
)
|
||||
@ -1061,6 +1065,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
# Model for Sequence Classification mapping
|
||||
("albert", "AlbertForSequenceClassification"),
|
||||
("arcee", "ArceeForSequenceClassification"),
|
||||
("bart", "BartForSequenceClassification"),
|
||||
("bert", "BertForSequenceClassification"),
|
||||
("big_bird", "BigBirdForSequenceClassification"),
|
||||
@ -1166,6 +1171,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
# Model for Question Answering mapping
|
||||
("albert", "AlbertForQuestionAnswering"),
|
||||
("arcee", "ArceeForQuestionAnswering"),
|
||||
("bart", "BartForQuestionAnswering"),
|
||||
("bert", "BertForQuestionAnswering"),
|
||||
("big_bird", "BigBirdForQuestionAnswering"),
|
||||
@ -1268,6 +1274,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
||||
[
|
||||
# Model for Token Classification mapping
|
||||
("albert", "AlbertForTokenClassification"),
|
||||
("arcee", "ArceeForTokenClassification"),
|
||||
("bert", "BertForTokenClassification"),
|
||||
("big_bird", "BigBirdForTokenClassification"),
|
||||
("biogpt", "BioGptForTokenClassification"),
|
||||
|
@ -27,13 +27,7 @@ from ...feature_extraction_utils import FeatureExtractionMixin
|
||||
from ...image_processing_utils import ImageProcessingMixin
|
||||
from ...processing_utils import ProcessorMixin
|
||||
from ...tokenization_utils import TOKENIZER_CONFIG_FILE
|
||||
from ...utils import (
|
||||
FEATURE_EXTRACTOR_NAME,
|
||||
PROCESSOR_NAME,
|
||||
VIDEO_PROCESSOR_NAME,
|
||||
cached_file,
|
||||
logging,
|
||||
)
|
||||
from ...utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, VIDEO_PROCESSOR_NAME, cached_file, logging
|
||||
from ...video_processing_utils import BaseVideoProcessor
|
||||
from .auto_factory import _LazyAutoMapping
|
||||
from .configuration_auto import (
|
||||
@ -118,9 +112,11 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
|
||||
("shieldgemma2", "ShieldGemma2Processor"),
|
||||
("siglip", "SiglipProcessor"),
|
||||
("siglip2", "Siglip2Processor"),
|
||||
("smolvlm", "SmolVLMProcessor"),
|
||||
("speech_to_text", "Speech2TextProcessor"),
|
||||
("speech_to_text_2", "Speech2Text2Processor"),
|
||||
("speecht5", "SpeechT5Processor"),
|
||||
("stt", "KyutaiSpeechToTextProcessor"),
|
||||
("trocr", "TrOCRProcessor"),
|
||||
("tvlt", "TvltProcessor"),
|
||||
("tvp", "TvpProcessor"),
|
||||
|
@ -64,6 +64,7 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
|
||||
),
|
||||
),
|
||||
("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("arcee", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("aria", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("aya_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
|
||||
("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
|
||||
|
@ -30,6 +30,7 @@ from ...modeling_attn_mask_utils import (
|
||||
_prepare_4d_attention_mask,
|
||||
_prepare_4d_attention_mask_for_sdpa,
|
||||
)
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutput, ModelOutput, SampleTSPredictionOutput, Seq2SeqTSPredictionOutput
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
|
||||
@ -45,44 +46,35 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class AutoFormerDecoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
|
||||
"""
|
||||
)
|
||||
class AutoFormerDecoderOutput(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
|
||||
hidden_size)` is output.
|
||||
trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Trend tensor for each time series.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
|
||||
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
|
||||
encoder_sequence_length, embed_size_per_head)`.
|
||||
|
||||
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
|
||||
hidden_size)` is output.
|
||||
trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Trend tensor for each time series.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
|
||||
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
|
||||
encoder_sequence_length, embed_size_per_head)`.
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
|
||||
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
|
||||
input) to speed up sequential decoding.
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
|
||||
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
|
||||
input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
|
||||
weighted average in the cross-attention heads.
|
||||
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
|
||||
weighted average in the cross-attention heads.
|
||||
"""
|
||||
|
||||
last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
@ -94,63 +86,35 @@ class AutoFormerDecoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AutoformerModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Autoformer model output that contains the additional trend output.
|
||||
"""
|
||||
)
|
||||
class AutoformerModelOutput(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the decoder of the model.
|
||||
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
|
||||
hidden_size)` is output.
|
||||
trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Trend tensor for each time series.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
||||
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
||||
|
||||
If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
|
||||
hidden_size)` is output.
|
||||
trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Trend tensor for each time series.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
||||
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
||||
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
||||
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
|
||||
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
|
||||
self-attention heads.
|
||||
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
|
||||
weighted average in the cross-attention heads.
|
||||
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder of the model.
|
||||
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
|
||||
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
|
||||
self-attention heads.
|
||||
loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
|
||||
Shift values of each time series' context window which is used to give the model inputs of the same
|
||||
magnitude and then used to shift back to the original magnitude.
|
||||
scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
|
||||
Scaling values of each time series' context window which is used to give the model inputs of the same
|
||||
magnitude and then used to rescale back to the original magnitude.
|
||||
static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
|
||||
Static features of each time series' in a batch which are copied to the covariates at inference time.
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
||||
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
||||
loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
|
||||
Shift values of each time series' context window which is used to give the model inputs of the same
|
||||
magnitude and then used to shift back to the original magnitude.
|
||||
scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
|
||||
Scaling values of each time series' context window which is used to give the model inputs of the same
|
||||
magnitude and then used to rescale back to the original magnitude.
|
||||
static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
|
||||
Static features of each time series' in a batch which are copied to the covariates at inference time.
|
||||
"""
|
||||
|
||||
last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
@ -670,7 +634,7 @@ class AutoformerAttention(nn.Module):
|
||||
return attn_output, attn_weights_reshaped, past_key_value
|
||||
|
||||
|
||||
class AutoformerEncoderLayer(nn.Module):
|
||||
class AutoformerEncoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: AutoformerConfig):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
@ -744,7 +708,7 @@ class AutoformerEncoderLayer(nn.Module):
|
||||
return outputs
|
||||
|
||||
|
||||
class AutoformerDecoderLayer(nn.Module):
|
||||
class AutoformerDecoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: AutoformerConfig):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
@ -1042,21 +1006,12 @@ class AutoformerEncoder(AutoformerPreTrainedModel):
|
||||
if to_drop:
|
||||
layer_outputs = (None, None)
|
||||
else:
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
encoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
(head_mask[idx] if head_mask is not None else None),
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
@ -1186,6 +1141,12 @@ class AutoformerDecoder(AutoformerPreTrainedModel):
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if self.gradient_checkpointing and self.training and use_cache:
|
||||
logger.warning(
|
||||
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
||||
)
|
||||
use_cache = False
|
||||
|
||||
input_shape = inputs_embeds.size()[:-1]
|
||||
|
||||
# expand encoder attention mask
|
||||
@ -1228,38 +1189,17 @@ class AutoformerDecoder(AutoformerPreTrainedModel):
|
||||
|
||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
if use_cache:
|
||||
logger.warning(
|
||||
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
||||
)
|
||||
use_cache = False
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
decoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
head_mask[idx] if head_mask is not None else None,
|
||||
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
|
||||
None,
|
||||
output_attentions,
|
||||
use_cache,
|
||||
)
|
||||
else:
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
cross_attn_layer_head_mask=(
|
||||
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
|
||||
),
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
)
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
)
|
||||
(hidden_states, residual_trend) = layer_outputs[0]
|
||||
trend = trend + residual_trend
|
||||
|
||||
@ -1818,6 +1758,14 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
|
||||
Transformer requires to provide additional features.
|
||||
|
||||
The Autoformer only learns additional embeddings for `static_categorical_features`.
|
||||
future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
|
||||
Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
|
||||
in `[0, 1]`:
|
||||
|
||||
- 1 for values that are **observed**,
|
||||
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
|
||||
|
||||
This mask is used to filter out missing values for the final loss calculation.
|
||||
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
|
||||
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
|
||||
|
||||
@ -1827,14 +1775,6 @@ class AutoformerForPrediction(AutoformerPreTrainedModel):
|
||||
Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*)
|
||||
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of
|
||||
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
|
||||
future_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*):
|
||||
Boolean mask to indicate which `future_values` were observed and which were missing. Mask values selected
|
||||
in `[0, 1]`:
|
||||
|
||||
- 1 for values that are **observed**,
|
||||
- 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
|
||||
|
||||
This mask is used to filter out missing values for the final loss calculation.
|
||||
|
||||
Examples:
|
||||
|
||||
|
@ -117,35 +117,26 @@ class AyaVisionPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AyaVisionCausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for AyaVision causal language model (or autoregressive) outputs.
|
||||
"""
|
||||
)
|
||||
class AyaVisionCausalLMOutputWithPast(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -157,33 +148,22 @@ class AyaVisionCausalLMOutputWithPast(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for AyaVision outputs, with hidden states and attentions.
|
||||
"""
|
||||
)
|
||||
class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
|
||||
r"""
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
|
||||
"""
|
||||
|
||||
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||
@ -211,6 +191,12 @@ class AyaVisionModel(AyaVisionPreTrainedModel):
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model = decoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
@ -389,10 +375,10 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.model = decoder
|
||||
self.model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.model
|
||||
return self.model.get_decoder
|
||||
|
||||
def get_image_features(
|
||||
self,
|
||||
|
@ -52,13 +52,10 @@ from ...utils import (
|
||||
can_return_tuple,
|
||||
logging,
|
||||
)
|
||||
from ...utils.import_utils import is_causal_conv1d_available, is_flash_attn_2_available, is_mamba_2_ssm_available
|
||||
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available
|
||||
from .configuration_bamba import BambaConfig
|
||||
|
||||
|
||||
if is_flash_attn_2_available():
|
||||
pass
|
||||
|
||||
if is_mamba_2_ssm_available():
|
||||
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
|
||||
from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
|
||||
|
@ -31,6 +31,7 @@ from ...generation.logits_process import (
|
||||
)
|
||||
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
|
||||
from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import CausalLMOutputWithPast, MaskedLMOutput
|
||||
from ...modeling_utils import PreTrainedModel, get_parameter_device
|
||||
from ...utils import (
|
||||
@ -309,7 +310,7 @@ class BarkMLP(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class BarkBlock(nn.Module):
|
||||
class BarkBlock(GradientCheckpointingLayer):
|
||||
def __init__(self, config, is_causal=False):
|
||||
super().__init__()
|
||||
|
||||
@ -606,25 +607,14 @@ class BarkCausalModel(BarkPreTrainedModel, GenerationMixin):
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
outputs = self._gradient_checkpointing_func(
|
||||
block.__call__,
|
||||
hidden_states,
|
||||
None,
|
||||
attention_mask,
|
||||
head_mask[i],
|
||||
use_cache,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
outputs = block(
|
||||
hidden_states,
|
||||
past_key_values=past_layer_key_values,
|
||||
attention_mask=attention_mask,
|
||||
head_mask=head_mask[i],
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
outputs = block(
|
||||
hidden_states,
|
||||
past_key_values=past_layer_key_values,
|
||||
attention_mask=attention_mask,
|
||||
head_mask=head_mask[i],
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = outputs[0]
|
||||
|
||||
|
@ -33,6 +33,7 @@ from ...modeling_attn_mask_utils import (
|
||||
_prepare_4d_attention_mask_for_sdpa,
|
||||
)
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutput,
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
@ -270,7 +271,7 @@ class BartAttention(nn.Module):
|
||||
return attn_output, attn_weights, past_key_value
|
||||
|
||||
|
||||
class BartEncoderLayer(nn.Module):
|
||||
class BartEncoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: BartConfig, layer_idx: Optional[int] = None):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
@ -341,7 +342,7 @@ class BartEncoderLayer(nn.Module):
|
||||
return outputs
|
||||
|
||||
|
||||
class BartDecoderLayer(nn.Module):
|
||||
class BartDecoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: BartConfig, layer_idx: Optional[int] = None):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
@ -875,21 +876,12 @@ class BartEncoder(BartPreTrainedModel):
|
||||
if to_drop:
|
||||
layer_outputs = (None, None)
|
||||
else:
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
encoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
(head_mask[idx] if head_mask is not None else None),
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
@ -1137,35 +1129,18 @@ class BartDecoder(BartPreTrainedModel):
|
||||
if dropout_probability < self.layerdrop:
|
||||
continue
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
decoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
head_mask[idx] if head_mask is not None else None,
|
||||
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
|
||||
None,
|
||||
output_attentions,
|
||||
use_cache,
|
||||
cache_position,
|
||||
)
|
||||
else:
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
cross_attn_layer_head_mask=(
|
||||
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
|
||||
),
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if use_cache:
|
||||
|
@ -266,7 +266,7 @@ def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path):
|
||||
# Check outputs on an image
|
||||
if is_semantic:
|
||||
image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
|
||||
image = Image.open(ds[0]["file"])
|
||||
else:
|
||||
image_processor = BeitImageProcessor(
|
||||
|
@ -105,6 +105,7 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
|
||||
do_normalize: bool,
|
||||
image_mean: Optional[Union[float, list[float]]],
|
||||
image_std: Optional[Union[float, list[float]]],
|
||||
disable_grouping: Optional[bool],
|
||||
return_tensors: Optional[Union[str, TensorType]],
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
@ -112,7 +113,7 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
|
||||
images = self.reduce_label(images)
|
||||
|
||||
# Group images by size for batched resizing
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images)
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
|
||||
resized_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_resize:
|
||||
@ -122,7 +123,7 @@ class BeitImageProcessorFast(BaseImageProcessorFast):
|
||||
|
||||
# Group images by size for further processing
|
||||
# Needed in case do_resize is False, or resize returns images with different sizes
|
||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images)
|
||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
|
||||
processed_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_center_crop:
|
||||
|
@ -26,6 +26,7 @@ from torch import Tensor, nn
|
||||
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BackboneOutput,
|
||||
BaseModelOutput,
|
||||
@ -43,39 +44,19 @@ from .configuration_beit import BeitConfig
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
# General docstring
|
||||
|
||||
# Base docstring
|
||||
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
|
||||
|
||||
# Image classification docstring
|
||||
_IMAGE_CLASS_CHECKPOINT = "microsoft/beit-base-patch16-224"
|
||||
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
|
||||
|
||||
|
||||
@dataclass
|
||||
class BeitModelOutputWithPooling(BaseModelOutputWithPooling):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class for outputs of [`BeitModel`].
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
|
||||
*config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
|
||||
will be returned.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class BeitModelOutputWithPooling(BaseModelOutputWithPooling):
|
||||
r"""
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
|
||||
*config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
|
||||
will be returned.
|
||||
"""
|
||||
|
||||
|
||||
@ -497,7 +478,7 @@ class BeitOutput(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class BeitLayer(nn.Module):
|
||||
class BeitLayer(GradientCheckpointingLayer):
|
||||
"""This corresponds to the Block class in the timm implementation."""
|
||||
|
||||
def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0) -> None:
|
||||
@ -525,7 +506,7 @@ class BeitLayer(nn.Module):
|
||||
output_attentions: bool = False,
|
||||
relative_position_bias: Optional[torch.Tensor] = None,
|
||||
interpolate_pos_encoding: bool = False,
|
||||
resolution: Optional[tuple[int]] = None,
|
||||
resolution: Optional[tuple[int, int]] = None,
|
||||
) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
|
||||
self_attention_outputs = self.attention(
|
||||
self.layernorm_before(hidden_states), # in BEiT, layernorm is applied before self-attention
|
||||
@ -695,25 +676,14 @@ class BeitEncoder(nn.Module):
|
||||
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
layer_head_mask,
|
||||
output_attentions,
|
||||
relative_position_bias,
|
||||
interpolate_pos_encoding,
|
||||
resolution,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
layer_head_mask,
|
||||
output_attentions,
|
||||
relative_position_bias,
|
||||
interpolate_pos_encoding,
|
||||
resolution,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
head_mask=layer_head_mask,
|
||||
output_attentions=output_attentions,
|
||||
relative_position_bias=relative_position_bias,
|
||||
interpolate_pos_encoding=interpolate_pos_encoding,
|
||||
resolution=resolution,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
|
@ -30,6 +30,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
from ...activations import ACT2FN
|
||||
from ...generation import GenerationMixin
|
||||
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
BaseModelOutputWithPoolingAndCrossAttentions,
|
||||
@ -522,7 +523,7 @@ class BertOutput(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class BertLayer(nn.Module):
|
||||
class BertLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
||||
@ -647,27 +648,15 @@ class BertEncoder(nn.Module):
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[i] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
@ -816,30 +805,21 @@ class BertPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class BertForPreTrainingOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`BertForPreTraining`].
|
||||
|
||||
Args:
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
|
||||
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
|
||||
before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class BertForPreTrainingOutput(ModelOutput):
|
||||
r"""
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
|
||||
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
|
||||
before SoftMax).
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -23,6 +23,7 @@ from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...generation import GenerationMixin
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
|
||||
@ -275,7 +276,7 @@ class BertGenerationOutput(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->BertGeneration
|
||||
class BertGenerationLayer(nn.Module):
|
||||
class BertGenerationLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
||||
@ -401,27 +402,15 @@ class BertEncoder(nn.Module):
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[i] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
|
@ -27,6 +27,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...generation import GenerationMixin
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
BaseModelOutputWithPoolingAndCrossAttentions,
|
||||
@ -1419,7 +1420,7 @@ class BigBirdOutput(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class BigBirdLayer(nn.Module):
|
||||
class BigBirdLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config, seed=None):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
@ -1593,35 +1594,19 @@ class BigBirdEncoder(nn.Module):
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[i] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
band_mask,
|
||||
from_mask,
|
||||
to_mask,
|
||||
blocked_encoder_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
band_mask,
|
||||
from_mask,
|
||||
to_mask,
|
||||
blocked_encoder_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
band_mask,
|
||||
from_mask,
|
||||
to_mask,
|
||||
blocked_encoder_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
@ -1759,30 +1744,21 @@ class BigBirdPreTrainedModel(PreTrainedModel):
|
||||
|
||||
|
||||
@dataclass
|
||||
class BigBirdForPreTrainingOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`BigBirdForPreTraining`].
|
||||
|
||||
Args:
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
|
||||
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
|
||||
before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class BigBirdForPreTrainingOutput(ModelOutput):
|
||||
r"""
|
||||
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
|
||||
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
|
||||
before SoftMax).
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -1793,30 +1769,17 @@ class BigBirdForPreTrainingOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class BigBirdForQuestionAnsweringModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of question answering models.
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
|
||||
start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
||||
Span-start scores (before SoftMax).
|
||||
end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
||||
Span-end scores (before SoftMax).
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, 1)`):
|
||||
pooler output from BigBigModel
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
||||
shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class BigBirdForQuestionAnsweringModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, 1)`):
|
||||
pooler output from BigBigModel
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -32,6 +32,7 @@ from ...modeling_attn_mask_utils import (
|
||||
_prepare_4d_attention_mask_for_sdpa,
|
||||
)
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutput,
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
@ -1333,7 +1334,7 @@ class BigBirdPegasusDecoderAttention(nn.Module):
|
||||
return attn_output, attn_weights, past_key_value
|
||||
|
||||
|
||||
class BigBirdPegasusEncoderLayer(nn.Module):
|
||||
class BigBirdPegasusEncoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: BigBirdPegasusConfig, seed=None):
|
||||
super().__init__()
|
||||
self.attention_type = config.attention_type
|
||||
@ -1420,7 +1421,7 @@ class BigBirdPegasusEncoderLayer(nn.Module):
|
||||
self.self_attn.set_attention_type(value)
|
||||
|
||||
|
||||
class BigBirdPegasusDecoderLayer(nn.Module):
|
||||
class BigBirdPegasusDecoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: BigBirdPegasusConfig, layer_idx: Optional[int] = None):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
@ -1947,31 +1948,17 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
|
||||
if to_drop:
|
||||
layer_outputs = (None, None)
|
||||
else:
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
encoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
(head_mask[idx] if head_mask is not None else None),
|
||||
band_mask,
|
||||
from_mask,
|
||||
to_mask,
|
||||
blocked_encoder_mask,
|
||||
blocked_encoder_mask,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
band_mask=band_mask,
|
||||
from_mask=from_mask,
|
||||
to_mask=to_mask,
|
||||
from_blocked_mask=blocked_encoder_mask,
|
||||
to_blocked_mask=blocked_encoder_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
band_mask=band_mask,
|
||||
from_mask=from_mask,
|
||||
to_mask=to_mask,
|
||||
from_blocked_mask=blocked_encoder_mask,
|
||||
to_blocked_mask=blocked_encoder_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
@ -2297,35 +2284,18 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
|
||||
if dropout_probability < self.layerdrop:
|
||||
continue
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
decoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
head_mask[idx] if head_mask is not None else None,
|
||||
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
|
||||
None,
|
||||
output_attentions,
|
||||
use_cache,
|
||||
cache_position,
|
||||
)
|
||||
else:
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
cross_attn_layer_head_mask=(
|
||||
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
|
||||
),
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if use_cache:
|
||||
|
@ -20,7 +20,6 @@
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from functools import partial
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
import torch
|
||||
@ -32,6 +31,7 @@ from ...cache_utils import Cache, EncoderDecoderCache
|
||||
from ...generation import GenerationMixin
|
||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
CausalLMOutputWithCrossAttentions,
|
||||
@ -248,7 +248,7 @@ class BioGptAttention(nn.Module):
|
||||
return attn_output, attn_weights, past_key_value
|
||||
|
||||
|
||||
class BioGptDecoderLayer(nn.Module):
|
||||
class BioGptDecoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: BioGptConfig, layer_idx: Optional[int] = None):
|
||||
super().__init__()
|
||||
self.embed_dim = config.hidden_size
|
||||
@ -646,30 +646,17 @@ class BioGptModel(BioGptPreTrainedModel):
|
||||
if dropout_probability < self.layerdrop:
|
||||
continue
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
partial(decoder_layer.__call__, **flash_attn_kwargs),
|
||||
hidden_states,
|
||||
causal_mask,
|
||||
head_mask[idx] if head_mask is not None else None,
|
||||
None,
|
||||
output_attentions,
|
||||
use_cache,
|
||||
position_ids,
|
||||
cache_position,
|
||||
)
|
||||
else:
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=causal_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
position_ids=position_ids,
|
||||
cache_position=cache_position,
|
||||
**flash_attn_kwargs,
|
||||
)
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=causal_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
position_ids=position_ids,
|
||||
cache_position=cache_position,
|
||||
**flash_attn_kwargs,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
"""PyTorch BioGPT model."""
|
||||
|
||||
import math
|
||||
from functools import partial
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
@ -473,30 +472,17 @@ class BioGptModel(BioGptPreTrainedModel):
|
||||
if dropout_probability < self.layerdrop:
|
||||
continue
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
partial(decoder_layer.__call__, **flash_attn_kwargs),
|
||||
hidden_states,
|
||||
causal_mask,
|
||||
head_mask[idx] if head_mask is not None else None,
|
||||
None,
|
||||
output_attentions,
|
||||
use_cache,
|
||||
position_ids,
|
||||
cache_position,
|
||||
)
|
||||
else:
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=causal_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
position_ids=position_ids,
|
||||
cache_position=cache_position,
|
||||
**flash_attn_kwargs,
|
||||
)
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=causal_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
position_ids=position_ids,
|
||||
cache_position=cache_position,
|
||||
**flash_attn_kwargs,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
|
@ -34,6 +34,7 @@ from ...modeling_attn_mask_utils import (
|
||||
_prepare_4d_attention_mask_for_sdpa,
|
||||
)
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutput,
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
@ -270,7 +271,7 @@ class BlenderbotAttention(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->Blenderbot, MBART->BLENDERBOT
|
||||
class BlenderbotEncoderLayer(nn.Module):
|
||||
class BlenderbotEncoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: BlenderbotConfig):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
@ -339,7 +340,7 @@ class BlenderbotEncoderLayer(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer with MBart->Blenderbot, MBART->BLENDERBOT
|
||||
class BlenderbotDecoderLayer(nn.Module):
|
||||
class BlenderbotDecoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: BlenderbotConfig, layer_idx: Optional[int] = None):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
@ -825,21 +826,12 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel):
|
||||
if to_drop:
|
||||
layer_outputs = (None, None)
|
||||
else:
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
encoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
(head_mask[idx] if head_mask is not None else None),
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
@ -1090,35 +1082,18 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
|
||||
if dropout_probability < self.layerdrop:
|
||||
continue
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
decoder_layer.__call__,
|
||||
hidden_states,
|
||||
causal_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
head_mask[idx] if head_mask is not None else None,
|
||||
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
|
||||
None,
|
||||
output_attentions,
|
||||
use_cache,
|
||||
cache_position,
|
||||
)
|
||||
else:
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=causal_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
cross_attn_layer_head_mask=(
|
||||
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
|
||||
),
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
causal_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if use_cache:
|
||||
|
@ -32,6 +32,7 @@ from ...modeling_attn_mask_utils import (
|
||||
_prepare_4d_attention_mask_for_sdpa,
|
||||
)
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutput,
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
@ -254,7 +255,7 @@ class BlenderbotSmallAttention(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.bart.modeling_bart.BartEncoderLayer with Bart->BlenderbotSmall, BART->BLENDERBOT_SMALL
|
||||
class BlenderbotSmallEncoderLayer(nn.Module):
|
||||
class BlenderbotSmallEncoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: BlenderbotSmallConfig, layer_idx: Optional[int] = None):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
@ -326,7 +327,7 @@ class BlenderbotSmallEncoderLayer(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->BlenderbotSmall, BART->BLENDERBOT_SMALL
|
||||
class BlenderbotSmallDecoderLayer(nn.Module):
|
||||
class BlenderbotSmallDecoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: BlenderbotSmallConfig, layer_idx: Optional[int] = None):
|
||||
super().__init__()
|
||||
self.embed_dim = config.d_model
|
||||
@ -812,21 +813,12 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
|
||||
if to_drop:
|
||||
layer_outputs = (None, None)
|
||||
else:
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
encoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
(head_mask[idx] if head_mask is not None else None),
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
@ -1073,35 +1065,18 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
|
||||
if dropout_probability < self.layerdrop:
|
||||
continue
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
decoder_layer.__call__,
|
||||
hidden_states,
|
||||
causal_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
head_mask[idx] if head_mask is not None else None,
|
||||
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
|
||||
None,
|
||||
output_attentions,
|
||||
use_cache,
|
||||
cache_position,
|
||||
)
|
||||
else:
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=causal_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
cross_attn_layer_head_mask=(
|
||||
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
|
||||
),
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
causal_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
||||
cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if use_cache:
|
||||
|
@ -49,31 +49,31 @@ def blip_loss(similarity: torch.Tensor) -> torch.Tensor:
|
||||
|
||||
|
||||
@dataclass
|
||||
class BlipForConditionalGenerationModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
|
||||
last hidden states. This class also adds the loss term from the text decoder.
|
||||
"""
|
||||
)
|
||||
class BlipForConditionalGenerationModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Language modeling loss from the text decoder.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
|
||||
Prediction scores of the language modeling head of the text decoder model.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*):
|
||||
The image embeddings obtained after applying the Vision Transformer model to the input image.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Language modeling loss from the text decoder.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
|
||||
Prediction scores of the language modeling head of the text decoder model.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*):
|
||||
The image embeddings obtained after applying the Vision Transformer model to the input image.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[tuple[torch.FloatTensor]] = None
|
||||
@ -94,29 +94,18 @@ class BlipForConditionalGenerationModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class BlipTextVisionModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
|
||||
last hidden states. This class also adds the loss term from the text decoder.
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss from the text decoder.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class BlipTextVisionModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss from the text decoder.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -127,36 +116,25 @@ class BlipTextVisionModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class BlipImageTextMatchingModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
|
||||
last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity
|
||||
scores.
|
||||
|
||||
Args:
|
||||
itm_score (`torch.FloatTensor`):
|
||||
The image-text similarity scores.
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss from the text decoder.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
|
||||
Last layer hidden-state of the vision of the vision-only branch of the model.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
question_embeds (`torch.FloatTensor`):
|
||||
The question embeddings obtained by the text projection layer.
|
||||
"""
|
||||
)
|
||||
class BlipImageTextMatchingModelOutput(ModelOutput):
|
||||
r"""
|
||||
itm_score (`torch.FloatTensor`):
|
||||
The image-text similarity scores.
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss from the text decoder.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
|
||||
Last layer hidden-state of the vision of the vision-only branch of the model.
|
||||
question_embeds (`torch.FloatTensor`):
|
||||
The question embeddings obtained by the text projection layer.
|
||||
"""
|
||||
|
||||
itm_score: Optional[torch.FloatTensor] = None
|
||||
@ -170,25 +148,25 @@ class BlipImageTextMatchingModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class BlipOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
|
||||
image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
|
||||
text_model_output(`BaseModelOutputWithPooling`):
|
||||
The output of the [`BlipTextModel`].
|
||||
vision_model_output(`BaseModelOutputWithPooling`):
|
||||
The output of the [`BlipVisionModel`].
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`BlipTextModel`].
|
||||
vision_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`BlipVisionModel`].
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -552,7 +530,7 @@ class BlipEncoder(nn.Module):
|
||||
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
attention_mask=attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
|
@ -45,21 +45,23 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Blip2ForConditionalGenerationModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Class defining the outputs of [`Blip2ForConditionalGeneration`].
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Language modeling loss from the language model.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head of the language model.
|
||||
vision_outputs (`BaseModelOutputWithPooling`):
|
||||
Outputs of the vision encoder.
|
||||
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
|
||||
Outputs of the Q-Former (Querying Transformer).
|
||||
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
|
||||
Outputs of the language model.
|
||||
"""
|
||||
)
|
||||
class Blip2ForConditionalGenerationModelOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
|
||||
Language modeling loss from the language model.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head of the language model.
|
||||
vision_outputs (`BaseModelOutputWithPooling`):
|
||||
Outputs of the vision encoder.
|
||||
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
|
||||
Outputs of the Q-Former (Querying Transformer).
|
||||
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
|
||||
Outputs of the language model.
|
||||
"""
|
||||
|
||||
loss: Optional[tuple[torch.FloatTensor]] = None
|
||||
@ -78,25 +80,25 @@ class Blip2ForConditionalGenerationModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class Blip2ImageTextMatchingModelOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output.
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`Blip2QFormerModel`].
|
||||
vision_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`Blip2VisionModel`].
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output.
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output.
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`Blip2QFormerModel`].
|
||||
vision_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`Blip2VisionModel`].
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -115,27 +117,16 @@ class Blip2ImageTextMatchingModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for text model's outputs that also contains a pooling of the last hidden states.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Blip2
|
||||
class Blip2TextModelOutput(ModelOutput):
|
||||
"""
|
||||
Base class for text model's outputs that also contains a pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The text embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
r"""
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The text embeddings obtained by applying the projection layer to the pooler_output.
|
||||
"""
|
||||
|
||||
text_embeds: Optional[torch.FloatTensor] = None
|
||||
@ -145,27 +136,16 @@ class Blip2TextModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Blip2
|
||||
class Blip2VisionModelOutput(ModelOutput):
|
||||
"""
|
||||
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
r"""
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
"""
|
||||
|
||||
image_embeds: Optional[torch.FloatTensor] = None
|
||||
@ -531,7 +511,7 @@ class Blip2Encoder(nn.Module):
|
||||
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
attention_mask=attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
@ -992,11 +972,11 @@ class Blip2QFormerEncoder(nn.Module):
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
query_length,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
query_length=query_length,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
@ -27,6 +27,7 @@ from torch.nn import functional as F
|
||||
from ...cache_utils import Cache, DynamicCache, StaticCache
|
||||
from ...generation import GenerationMixin
|
||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
CausalLMOutputWithCrossAttentions,
|
||||
@ -366,7 +367,7 @@ class BloomMLP(nn.Module):
|
||||
return output
|
||||
|
||||
|
||||
class BloomBlock(nn.Module):
|
||||
class BloomBlock(GradientCheckpointingLayer):
|
||||
def __init__(self, config: BloomConfig, layer_idx: Optional[int] = None):
|
||||
super().__init__()
|
||||
hidden_size = config.hidden_size
|
||||
@ -605,29 +606,16 @@ class BloomModel(BloomPreTrainedModel):
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
outputs = self._gradient_checkpointing_func(
|
||||
block.__call__,
|
||||
hidden_states,
|
||||
alibi,
|
||||
causal_mask,
|
||||
past_key_values,
|
||||
head_mask[i],
|
||||
use_cache,
|
||||
output_attentions,
|
||||
cache_position,
|
||||
)
|
||||
else:
|
||||
outputs = block(
|
||||
hidden_states,
|
||||
layer_past=past_key_values,
|
||||
attention_mask=causal_mask,
|
||||
head_mask=head_mask[i],
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
alibi=alibi,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
outputs = block(
|
||||
hidden_states,
|
||||
layer_past=past_key_values,
|
||||
attention_mask=causal_mask,
|
||||
head_mask=head_mask[i],
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
alibi=alibi,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
|
||||
hidden_states = outputs[0]
|
||||
if use_cache:
|
||||
|
@ -223,6 +223,7 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
|
||||
images: list["torch.Tensor"],
|
||||
constant_values: Union[float, Iterable[float]] = 0,
|
||||
return_pixel_mask: bool = True,
|
||||
disable_grouping: Optional[bool] = False,
|
||||
) -> tuple:
|
||||
"""
|
||||
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
||||
@ -235,6 +236,8 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
|
||||
The value to use for the padding if `mode` is `"constant"`.
|
||||
return_pixel_mask (`bool`, *optional*, defaults to `True`):
|
||||
Whether to return a pixel mask.
|
||||
disable_grouping (`bool`, *optional*, defaults to `False`):
|
||||
Whether to disable grouping of images by size.
|
||||
return_tensors (`str` or `TensorType`, *optional*):
|
||||
The type of tensors to return. Can be one of:
|
||||
- Unset: Return a list of `np.ndarray`.
|
||||
@ -245,7 +248,7 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
|
||||
"""
|
||||
pad_size = get_max_height_width(images)
|
||||
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images)
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
|
||||
processed_images_grouped = {}
|
||||
processed_masks_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
@ -283,11 +286,12 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
|
||||
do_normalize: bool,
|
||||
image_mean: Optional[Union[float, list[float]]],
|
||||
image_std: Optional[Union[float, list[float]]],
|
||||
disable_grouping: Optional[bool],
|
||||
return_tensors: Optional[Union[str, TensorType]],
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
# Group images by size for batched resizing
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images)
|
||||
grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
|
||||
resized_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_resize:
|
||||
@ -299,7 +303,7 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
|
||||
|
||||
# Group images by size for further processing
|
||||
# Needed in case do_resize is False, or resize returns images with different sizes
|
||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images)
|
||||
grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
|
||||
processed_images_grouped = {}
|
||||
for shape, stacked_images in grouped_images.items():
|
||||
if do_center_crop:
|
||||
@ -314,7 +318,9 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
|
||||
|
||||
data = {}
|
||||
if do_pad:
|
||||
processed_images, processed_masks = self.pad(processed_images, return_pixel_mask=True)
|
||||
processed_images, processed_masks = self.pad(
|
||||
processed_images, return_pixel_mask=True, disable_grouping=disable_grouping
|
||||
)
|
||||
processed_masks = torch.stack(processed_masks, dim=0) if return_tensors else processed_masks
|
||||
data["pixel_mask"] = processed_masks
|
||||
|
||||
|
@ -25,6 +25,7 @@ from torch import nn
|
||||
from torch.nn import CrossEntropyLoss
|
||||
|
||||
from ...activations import ACT2FN, QuickGELUActivation
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
BaseModelOutputWithPoolingAndCrossAttentions,
|
||||
@ -44,28 +45,20 @@ _TOKENIZER_FOR_DOC = "RobertaTokenizer"
|
||||
|
||||
|
||||
@dataclass
|
||||
class BridgeTowerModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`BridgeTowerModel`].
|
||||
|
||||
Args:
|
||||
text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the text output of the last layer of the model.
|
||||
image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the image output of the last layer of the model.
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`):
|
||||
Concatenation of last layer hidden-state of the first token of the text and image sequence (classification
|
||||
token), respectively, after further processing through layers used for auxiliary pretraining tasks.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
|
||||
the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class BridgeTowerModelOutput(ModelOutput):
|
||||
r"""
|
||||
text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the text output of the last layer of the model.
|
||||
image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the image output of the last layer of the model.
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`):
|
||||
Concatenation of last layer hidden-state of the first token of the text and image sequence (classification
|
||||
token), respectively, after further processing through layers used for auxiliary pretraining tasks.
|
||||
"""
|
||||
|
||||
text_features: Optional[torch.FloatTensor] = None
|
||||
@ -76,28 +69,26 @@ class BridgeTowerModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class BridgeTowerContrastiveOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of ['BridgeTowerForContrastiveLearning']
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`:
|
||||
Image-text contrastive loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
text_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
|
||||
The text embeddings obtained by applying the projection layer to the pooler_output.
|
||||
image_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
cross_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
|
||||
The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
|
||||
the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
"""
|
||||
)
|
||||
class BridgeTowerContrastiveOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Image-text contrastive loss.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
text_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
|
||||
The text embeddings obtained by applying the projection layer to the pooler_output.
|
||||
image_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
cross_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
|
||||
The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -662,7 +653,7 @@ class BridgeTowerBertCrossLayer(nn.Module):
|
||||
return layer_output
|
||||
|
||||
|
||||
class BridgeTowerTextLayer(nn.Module):
|
||||
class BridgeTowerTextLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
||||
@ -788,27 +779,15 @@ class BridgeTowerTextEncoder(nn.Module):
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[i] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
|
@ -24,6 +24,7 @@ from torch import nn
|
||||
from torch.nn import CrossEntropyLoss
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
BaseModelOutputWithPoolingAndCrossAttentions,
|
||||
@ -39,28 +40,19 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class BrosSpadeOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for outputs of token classification models.
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) :
|
||||
Classification loss.
|
||||
initial_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
|
||||
Classification scores for entity initial tokens (before SoftMax).
|
||||
subsequent_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, sequence_length+1)`):
|
||||
Classification scores for entity sequence tokens (before SoftMax).
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class BrosSpadeOutput(ModelOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Classification loss.
|
||||
initial_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
|
||||
Classification scores for entity initial tokens (before SoftMax).
|
||||
subsequent_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, sequence_length+1)`):
|
||||
Classification scores for entity sequence tokens (before SoftMax).
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -428,7 +420,7 @@ class BrosOutput(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class BrosLayer(nn.Module):
|
||||
class BrosLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
||||
@ -550,34 +542,16 @@ class BrosEncoder(nn.Module):
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[i] if past_key_values is not None else None
|
||||
|
||||
if getattr(self.config, "gradient_checkpointing", False) and self.training:
|
||||
if use_cache:
|
||||
logger.warning(
|
||||
"`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
|
||||
"`use_cache=False`..."
|
||||
)
|
||||
use_cache = False
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
bbox_pos_emb,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states=hidden_states,
|
||||
bbox_pos_emb=bbox_pos_emb,
|
||||
attention_mask=attention_mask,
|
||||
head_mask=layer_head_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
bbox_pos_emb,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
|
@ -27,6 +27,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
from ...activations import ACT2FN, gelu
|
||||
from ...generation import GenerationMixin
|
||||
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
BaseModelOutputWithPoolingAndCrossAttentions,
|
||||
@ -478,7 +479,7 @@ class CamembertOutput(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->Camembert
|
||||
class CamembertLayer(nn.Module):
|
||||
class CamembertLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
||||
@ -604,27 +605,15 @@ class CamembertEncoder(nn.Module):
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[i] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
|
@ -26,6 +26,7 @@ from torch import nn
|
||||
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutput,
|
||||
ModelOutput,
|
||||
@ -48,32 +49,34 @@ _PRIMES = [31, 43, 59, 61, 73, 97, 103, 113, 137, 149, 157, 173, 181, 193, 211,
|
||||
|
||||
|
||||
@dataclass
|
||||
class CanineModelOutputWithPooling(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
|
||||
different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
|
||||
Transformer encoders.
|
||||
|
||||
Args:
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
|
||||
shallow Transformer encoder).
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
|
||||
Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
|
||||
weights are trained from the next sentence prediction (classification) objective during pretraining.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
|
||||
encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
|
||||
config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
|
||||
initial input to each Transformer encoder. The hidden states of the shallow encoders have length
|
||||
`sequence_length`, but the hidden states of the deep encoder have length `sequence_length` //
|
||||
`config.downsampling_rate`.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
|
||||
num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
|
||||
config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the
|
||||
attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
"""
|
||||
)
|
||||
class CanineModelOutputWithPooling(ModelOutput):
|
||||
r"""
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
|
||||
shallow Transformer encoder).
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
|
||||
Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
|
||||
weights are trained from the next sentence prediction (classification) objective during pretraining.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
|
||||
encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
|
||||
config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
|
||||
initial input to each Transformer encoder. The hidden states of the shallow encoders have length
|
||||
`sequence_length`, but the hidden states of the deep encoder have length `sequence_length` //
|
||||
`config.downsampling_rate`.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
|
||||
num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
|
||||
config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the
|
||||
attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
"""
|
||||
|
||||
last_hidden_state: Optional[torch.FloatTensor] = None
|
||||
@ -672,7 +675,7 @@ class CanineOutput(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class CanineLayer(nn.Module):
|
||||
class CanineLayer(GradientCheckpointingLayer):
|
||||
def __init__(
|
||||
self,
|
||||
config,
|
||||
@ -779,16 +782,7 @@ class CanineEncoder(nn.Module):
|
||||
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
|
||||
layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if output_attentions:
|
||||
|
@ -19,11 +19,7 @@ from typing import Optional, Union
|
||||
import numpy as np
|
||||
|
||||
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
|
||||
from ...image_transforms import (
|
||||
get_resize_output_image_size,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from ...image_transforms import get_resize_output_image_size, resize, to_channel_dimension_format
|
||||
from ...image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
|
@ -27,6 +27,7 @@ from ...cache_utils import Cache, DynamicCache
|
||||
from ...generation import GenerationMixin
|
||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||
from ...modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...processing_utils import Unpack
|
||||
@ -383,7 +384,7 @@ class ChameleonAttention(nn.Module):
|
||||
|
||||
|
||||
# copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Chameleon, LLAMA->CHAMELEON
|
||||
class ChameleonDecoderLayer(nn.Module):
|
||||
class ChameleonDecoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: ChameleonConfig, layer_idx: int):
|
||||
super().__init__()
|
||||
self.hidden_size = config.hidden_size
|
||||
@ -458,7 +459,7 @@ class ChameleonDecoderLayer(nn.Module):
|
||||
return outputs
|
||||
|
||||
|
||||
class ChameleonSwinDecoderLayer(nn.Module):
|
||||
class ChameleonSwinDecoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: ChameleonConfig, layer_idx: int):
|
||||
super().__init__()
|
||||
self.hidden_size = config.hidden_size
|
||||
@ -1011,28 +1012,16 @@ class ChameleonModel(ChameleonPreTrainedModel):
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
decoder_layer.__call__,
|
||||
hidden_states,
|
||||
causal_mask,
|
||||
position_ids,
|
||||
past_key_values,
|
||||
output_attentions,
|
||||
use_cache,
|
||||
cache_position,
|
||||
)
|
||||
else:
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=causal_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=causal_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
|
@ -23,6 +23,7 @@ import torch.utils.checkpoint
|
||||
from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutput,
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
@ -51,27 +52,27 @@ def chinese_clip_loss(similarity: torch.Tensor) -> torch.Tensor:
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class ChineseCLIPOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of
|
||||
[`ChineseCLIPTextModel`].
|
||||
image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output of
|
||||
[`ChineseCLIPVisionModel`].
|
||||
text_model_output(`BaseModelOutputWithPoolingAndCrossAttentions`):
|
||||
The output of the [`ChineseCLIPTextModel`].
|
||||
vision_model_output(`BaseModelOutputWithPoolingAndCrossAttentions`):
|
||||
The output of the [`ChineseCLIPVisionModel`].
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of
|
||||
[`ChineseCLIPTextModel`].
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output of
|
||||
[`ChineseCLIPVisionModel`].
|
||||
text_model_output (`BaseModelOutputWithPoolingAndCrossAttentions`):
|
||||
The output of the [`ChineseCLIPTextModel`].
|
||||
vision_model_output (`BaseModelOutputWithPoolingAndCrossAttentions`):
|
||||
The output of the [`ChineseCLIPVisionModel`].
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -577,7 +578,7 @@ class ChineseCLIPVisionMLP(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->ChineseCLIPText
|
||||
class ChineseCLIPTextLayer(nn.Module):
|
||||
class ChineseCLIPTextLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
||||
@ -663,7 +664,7 @@ class ChineseCLIPTextLayer(nn.Module):
|
||||
return layer_output
|
||||
|
||||
|
||||
class ChineseCLIPVisionLayer(nn.Module):
|
||||
class ChineseCLIPVisionLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: ChineseCLIPConfig):
|
||||
super().__init__()
|
||||
self.embed_dim = config.hidden_size
|
||||
@ -816,27 +817,15 @@ class ChineseCLIPTextEncoder(nn.Module):
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[i] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
@ -920,17 +909,10 @@ class ChineseCLIPVisionEncoder(nn.Module):
|
||||
for idx, encoder_layer in enumerate(self.layers):
|
||||
if output_hidden_states:
|
||||
encoder_states = encoder_states + (hidden_states,)
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
encoder_layer.__call__,
|
||||
hidden_states,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
|
@ -24,6 +24,7 @@ import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import (
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
BaseModelOutputWithPooling,
|
||||
@ -121,27 +122,16 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for text model's outputs that also contains a pooling of the last hidden states.
|
||||
"""
|
||||
)
|
||||
# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Clap
|
||||
class ClapTextModelOutput(ModelOutput):
|
||||
"""
|
||||
Base class for text model's outputs that also contains a pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The text embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
r"""
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The text embeddings obtained by applying the projection layer to the pooler_output.
|
||||
"""
|
||||
|
||||
text_embeds: Optional[torch.FloatTensor] = None
|
||||
@ -151,26 +141,15 @@ class ClapTextModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class ClapAudioModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
ClapAudio model output to mimic the output of the original implementation.
|
||||
|
||||
Args:
|
||||
audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
The Audio embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
"""
|
||||
)
|
||||
class ClapAudioModelOutput(ModelOutput):
|
||||
r"""
|
||||
audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
The Audio embeddings obtained by applying the projection layer to the pooler_output.
|
||||
"""
|
||||
|
||||
audio_embeds: Optional[torch.FloatTensor] = None
|
||||
@ -180,26 +159,26 @@ class ClapAudioModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Clap, vision->audio, Vision->Audio, image->audio
|
||||
class ClapOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for audio-text similarity.
|
||||
logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
|
||||
audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`ClapTextModel`].
|
||||
audio_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`ClapAudioModel`].
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for audio-text similarity.
|
||||
logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
|
||||
audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`ClapTextModel`].
|
||||
audio_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`ClapAudioModel`].
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -691,7 +670,7 @@ class ClapAudioLayer(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->ClapAudio
|
||||
class ClapAudioStage(nn.Module):
|
||||
class ClapAudioStage(GradientCheckpointingLayer):
|
||||
def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
@ -928,14 +907,9 @@ class ClapAudioEncoder(nn.Module):
|
||||
|
||||
input_dimensions = self.input_resolutions[i]
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__, hidden_states, input_dimensions, layer_head_mask, output_attentions
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
@ -1355,7 +1329,7 @@ class ClapTextOutput(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->ClapText
|
||||
class ClapTextLayer(nn.Module):
|
||||
class ClapTextLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
||||
@ -1481,27 +1455,15 @@ class ClapTextEncoder(nn.Module):
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[i] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
layer_module.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states, # as a positional argument for gradient checkpointing
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
@ -1947,11 +1909,11 @@ class ClapModel(ClapPreTrainedModel):
|
||||
input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
||||
Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
|
||||
retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
|
||||
return_loss (`bool`, *optional*):
|
||||
Whether or not to return the contrastive loss.
|
||||
is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
|
||||
Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
|
||||
the features.
|
||||
return_loss (`bool`, *optional*):
|
||||
Whether or not to return the contrastive loss.
|
||||
|
||||
Examples:
|
||||
|
||||
|
@ -23,6 +23,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
|
||||
@ -56,26 +57,15 @@ def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
|
||||
|
||||
|
||||
@dataclass
|
||||
class CLIPVisionModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class CLIPVisionModelOutput(ModelOutput):
|
||||
r"""
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The image embeddings obtained by applying the projection layer to the pooler_output.
|
||||
"""
|
||||
|
||||
image_embeds: Optional[torch.FloatTensor] = None
|
||||
@ -85,26 +75,15 @@ class CLIPVisionModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
class CLIPTextModelOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for text model's outputs that also contains a pooling of the last hidden states.
|
||||
|
||||
Args:
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The text embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
)
|
||||
class CLIPTextModelOutput(ModelOutput):
|
||||
r"""
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
||||
The text embeddings obtained by applying the projection layer to the pooler_output.
|
||||
"""
|
||||
|
||||
text_embeds: Optional[torch.FloatTensor] = None
|
||||
@ -114,25 +93,25 @@ class CLIPTextModelOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class CLIPOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`CLIPTextModel`].
|
||||
vision_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`CLIPVisionModel`].
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`CLIPTextModel`].
|
||||
vision_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`CLIPVisionModel`].
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -393,7 +372,7 @@ class CLIPMLP(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
class CLIPEncoderLayer(nn.Module):
|
||||
class CLIPEncoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: Union[CLIPVisionConfig, CLIPTextConfig]):
|
||||
super().__init__()
|
||||
self.embed_dim = config.hidden_size
|
||||
@ -575,21 +554,12 @@ class CLIPEncoder(nn.Module):
|
||||
for idx, encoder_layer in enumerate(self.layers):
|
||||
if output_hidden_states:
|
||||
encoder_states = encoder_states + (hidden_states,)
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
encoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
|
@ -25,6 +25,7 @@ from torch import nn
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
|
||||
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
||||
from ...utils import ModelOutput, auto_docstring, logging, torch_int
|
||||
@ -48,26 +49,26 @@ def clipseg_loss(similarity: torch.Tensor) -> torch.Tensor:
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->CLIPSeg
|
||||
class CLIPSegOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`CLIPSegTextModel`].
|
||||
vision_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`CLIPSegVisionModel`].
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
|
||||
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`CLIPSegTextModel`].
|
||||
vision_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`CLIPSegVisionModel`].
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -86,18 +87,11 @@ class CLIPSegOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class CLIPSegDecoderOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
|
||||
Classification scores for each pixel.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
r"""
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
|
||||
Classification scores for each pixel.
|
||||
"""
|
||||
|
||||
logits: Optional[torch.FloatTensor] = None
|
||||
@ -106,14 +100,21 @@ class CLIPSegDecoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class CLIPSegImageSegmentationOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for image-text similarity.
|
||||
...
|
||||
vision_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`CLIPSegVisionModel`].
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Binary cross entropy loss for segmentation.
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
|
||||
Classification scores for each pixel.
|
||||
conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
|
||||
Conditional embeddings used for segmentation.
|
||||
pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
|
||||
Pooled output of the [`CLIPSegVisionModel`].
|
||||
vision_model_output (`BaseModelOutputWithPooling`):
|
||||
The output of the [`CLIPSegVisionModel`].
|
||||
decoder_output (`CLIPSegDecoderOutput`):
|
||||
The output of the [`CLIPSegDecoder`].
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
@ -374,7 +375,7 @@ class CLIPSegMLP(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoderLayer with AltCLIP->CLIPSeg
|
||||
class CLIPSegEncoderLayer(nn.Module):
|
||||
class CLIPSegEncoderLayer(GradientCheckpointingLayer):
|
||||
def __init__(self, config: CLIPSegConfig):
|
||||
super().__init__()
|
||||
self.embed_dim = config.hidden_size
|
||||
@ -539,22 +540,12 @@ class CLIPSegEncoder(nn.Module):
|
||||
for idx, encoder_layer in enumerate(self.layers):
|
||||
if output_hidden_states:
|
||||
encoder_states = encoder_states + (hidden_states,)
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
encoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions,
|
||||
)
|
||||
else:
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if output_attentions:
|
||||
@ -1269,15 +1260,15 @@ class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel):
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[tuple, CLIPSegOutput]:
|
||||
r"""
|
||||
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
|
||||
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
|
||||
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
conditional_pixel_values (`torch.FloatTensor`, *optional*):
|
||||
The pixel values of the conditional images.
|
||||
conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
|
||||
The conditional embeddings for the query images. If provided, the model will use this instead of computing
|
||||
the embeddings from the conditional_pixel_values.
|
||||
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
|
||||
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
|
||||
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
|
||||
Examples:
|
||||
|
||||
|
@ -15,11 +15,7 @@
|
||||
"""CLVP model configuration"""
|
||||
|
||||
import os
|
||||
from typing import TYPE_CHECKING, Union
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
from typing import Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
@ -144,26 +144,20 @@ def _pad_extra_bos_eos_tokens(
|
||||
|
||||
|
||||
@dataclass
|
||||
class ClvpEncoderOutput(ModelOutput):
|
||||
"""
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Base class for CLVP encoder's outputs that contains a pooling of the last hidden states as well as a projection
|
||||
output (a linear layer on top of the pooled output).
|
||||
|
||||
Args:
|
||||
embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
|
||||
The embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
The hidden state of the last layer of the model.
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Pooled output of the `last_hidden_state`.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
|
||||
the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
||||
the self-attention heads.
|
||||
"""
|
||||
)
|
||||
class ClvpEncoderOutput(ModelOutput):
|
||||
r"""
|
||||
embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
|
||||
The embeddings obtained by applying the projection layer to the pooler_output.
|
||||
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
||||
The hidden state of the last layer of the model.
|
||||
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
||||
Pooled output of the `last_hidden_state`.
|
||||
"""
|
||||
|
||||
embeds: Optional[torch.FloatTensor] = None
|
||||
@ -174,35 +168,35 @@ class ClvpEncoderOutput(ModelOutput):
|
||||
|
||||
|
||||
@dataclass
|
||||
@auto_docstring
|
||||
class ClvpOutput(ModelOutput):
|
||||
"""
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for speech-text similarity.
|
||||
speech_ids (`torch.LongTensor`, *optional*):
|
||||
speech_ids (or speech candidates) generated by the `ClvpForCausalLM` model.
|
||||
logits_per_speech (`torch.FloatTensor` of shape `(speech_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `speech_embeds` and `text_embeds`. This represents the speech-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, speech_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `speech_embeds`. This represents the text-speech
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of the text encoder
|
||||
model.
|
||||
speech_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The speech embeddings obtained by applying the projection layer to the pooled output of the speech encoder
|
||||
model.
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The pooled output of the `last_hidden_state` of the text encoder Model.
|
||||
speech_model_output (`BaseModelOutputWithPooling`):
|
||||
The pooled output of the `last_hidden_state` of the speech encoder Model.
|
||||
decoder_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
The hidden states of the decoder model.
|
||||
text_encoder_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
The hidden states of the text encoder model.
|
||||
speech_encoder_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
The hidden states of the speech encoder model.
|
||||
r"""
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
||||
Contrastive loss for speech-text similarity.
|
||||
speech_ids (`torch.LongTensor`, *optional*):
|
||||
speech_ids (or speech candidates) generated by the `ClvpForCausalLM` model.
|
||||
logits_per_speech (`torch.FloatTensor` of shape `(speech_batch_size, text_batch_size)`):
|
||||
The scaled dot product scores between `speech_embeds` and `text_embeds`. This represents the speech-text
|
||||
similarity scores.
|
||||
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, speech_batch_size)`):
|
||||
The scaled dot product scores between `text_embeds` and `speech_embeds`. This represents the text-speech
|
||||
similarity scores.
|
||||
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The text embeddings obtained by applying the projection layer to the pooled output of the text encoder
|
||||
model.
|
||||
speech_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
||||
The speech embeddings obtained by applying the projection layer to the pooled output of the speech encoder
|
||||
model.
|
||||
text_model_output (`BaseModelOutputWithPooling`):
|
||||
The pooled output of the `last_hidden_state` of the text encoder Model.
|
||||
speech_model_output (`BaseModelOutputWithPooling`):
|
||||
The pooled output of the `last_hidden_state` of the speech encoder Model.
|
||||
decoder_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
The hidden states of the decoder model.
|
||||
text_encoder_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
The hidden states of the text encoder model.
|
||||
speech_encoder_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
The hidden states of the speech encoder model.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
|
@ -24,6 +24,7 @@ from ...activations import ACT2FN
|
||||
from ...cache_utils import Cache, DynamicCache
|
||||
from ...generation import GenerationMixin
|
||||
from ...modeling_attn_mask_utils import AttentionMaskConverter
|
||||
from ...modeling_layers import GradientCheckpointingLayer
|
||||
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...utils import (
|
||||
@ -245,7 +246,7 @@ class CodeGenMLP(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.gptj.modeling_gptj.GPTJBlock with GPTJ->CodeGen
|
||||
class CodeGenBlock(nn.Module):
|
||||
class CodeGenBlock(GradientCheckpointingLayer):
|
||||
# Ignore copy
|
||||
def __init__(self, config, layer_idx=None):
|
||||
super().__init__()
|
||||
@ -437,29 +438,16 @@ class CodeGenModel(CodeGenPreTrainedModel):
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
outputs = self._gradient_checkpointing_func(
|
||||
block.__call__,
|
||||
hidden_states,
|
||||
None,
|
||||
causal_mask,
|
||||
position_ids,
|
||||
head_mask[i],
|
||||
use_cache,
|
||||
output_attentions,
|
||||
cache_position,
|
||||
)
|
||||
else:
|
||||
outputs = block(
|
||||
hidden_states=hidden_states,
|
||||
layer_past=past_key_values,
|
||||
attention_mask=causal_mask,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask[i],
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
outputs = block(
|
||||
hidden_states,
|
||||
layer_past=past_key_values,
|
||||
attention_mask=causal_mask,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask[i],
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
|
||||
hidden_states = outputs[0]
|
||||
if use_cache is True:
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user