Mirror of https://github.com/huggingface/transformers.git (synced 2025-11-05 04:34:37 +08:00)

Compare commits (1 commit): refactor-f... → parallel
Commit: 8e3fbca4cd
@@ -160,7 +160,7 @@ jobs:
    environment:
      TRANSFORMERS_IS_CI: yes
      PYTEST_TIMEOUT: 120
    parallelism: 1
    parallelism: 4
    steps:
      - checkout
      - run: uv pip install -e ".[quality]"
@@ -169,19 +169,19 @@ jobs:
          command: pip freeze | tee installed.txt
      - store_artifacts:
          path: ~/transformers/installed.txt
      - run: python utils/check_copies.py
      - run: python utils/check_modular_conversion.py --num_workers 4
      - run: python utils/check_table.py
      - run: python utils/check_dummies.py
      - run: python utils/check_repo.py
      - run: python utils/check_inits.py
      - run: python utils/check_config_docstrings.py
      - run: python utils/check_config_attributes.py
      - run: python utils/check_doctest_list.py
      - run: make deps_table_check_updated
      - run: python utils/update_metadata.py --check-only
      - run: python utils/check_docstrings.py
      - run: python utils/check_support_list.py
      - run: if [ "$CIRCLE_NODE_INDEX" == "0" ]; then python utils/check_copies.py; fi
      - run: if [ "$CIRCLE_NODE_INDEX" == "1" ]; then python utils/check_modular_conversion.py --num_workers 4; fi
      - run: if [ "$CIRCLE_NODE_INDEX" == "3" ]; then python utils/check_table.py; fi
      - run: if [ "$CIRCLE_NODE_INDEX" == "0" ]; then python utils/check_dummies.py; fi
      - run: if [ "$CIRCLE_NODE_INDEX" == "2" ]; then python utils/check_repo.py; fi
      - run: if [ "$CIRCLE_NODE_INDEX" == "2" ]; then python utils/check_inits.py; fi
      - run: if [ "$CIRCLE_NODE_INDEX" == "2" ]; then python utils/check_config_docstrings.py; fi
      - run: if [ "$CIRCLE_NODE_INDEX" == "2" ]; then python utils/check_config_attributes.py; fi
      - run: if [ "$CIRCLE_NODE_INDEX" == "0" ]; then python utils/check_doctest_list.py; fi
      - run: if [ "$CIRCLE_NODE_INDEX" == "0" ]; then make deps_table_check_updated; fi
      - run: if [ "$CIRCLE_NODE_INDEX" == "3" ]; then python utils/update_metadata.py --check-only; fi
      - run: if [ "$CIRCLE_NODE_INDEX" == "3" ]; then python utils/check_docstrings.py; fi
      - run: if [ "$CIRCLE_NODE_INDEX" == "0" ]; then python utils/check_support_list.py; fi

workflows:
  version: 2

@@ -28,6 +28,8 @@ COMMON_ENV_VARIABLES = {
    "TRANSFORMERS_IS_CI": True,
    "PYTEST_TIMEOUT": 120,
    "RUN_PIPELINE_TESTS": False,
    "RUN_PT_TF_CROSS_TESTS": False,
    "RUN_PT_FLAX_CROSS_TESTS": False,
}
# Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical
COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsfE":None}

@@ -175,6 +177,23 @@ class CircleCIJob:

# JOBS
torch_and_tf_job = CircleCIJob(
    "torch_and_tf",
    docker_image=[{"image":"huggingface/transformers-torch-tf-light"}],
    additional_env={"RUN_PT_TF_CROSS_TESTS": True},
    marker="is_pt_tf_cross_test",
    pytest_options={"rA": None, "durations": 0},
)

torch_and_flax_job = CircleCIJob(
    "torch_and_flax",
    additional_env={"RUN_PT_FLAX_CROSS_TESTS": True},
    docker_image=[{"image":"huggingface/transformers-torch-jax-light"}],
    marker="is_pt_flax_cross_test",
    pytest_options={"rA": None, "durations": 0},
)

torch_job = CircleCIJob(
    "torch",
    docker_image=[{"image": "huggingface/transformers-torch-light"}],

@@ -334,7 +353,7 @@ doc_test_job = CircleCIJob(
    pytest_num_workers=1,
)

REGULAR_TESTS = [torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job]  # fmt: skip
REGULAR_TESTS = [torch_and_tf_job, torch_and_flax_job, torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job]  # fmt: skip
EXAMPLES_TESTS = [examples_torch_job, examples_tensorflow_job]
PIPELINE_TESTS = [pipelines_torch_job, pipelines_tf_job]
REPO_UTIL_TESTS = [repo_utils_job]
.github/workflows/build-ci-docker-images.yml (6 changes)
@@ -26,7 +26,7 @@ jobs:
  strategy:
    matrix:
      file: ["quality", "consistency", "custom-tokenizers", "torch-light", "tf-light", "exotic-models", "torch-tf-light", "jax-light", "examples-torch", "examples-tf"]
      file: ["quality", "consistency", "custom-tokenizers", "torch-light", "tf-light", "exotic-models", "torch-tf-light", "torch-jax-light", "jax-light", "examples-torch", "examples-tf"]
  continue-on-error: true

  steps:
@@ -34,11 +34,11 @@ jobs:
    name: Set tag
    run: |
      if ${{contains(github.event.head_commit.message, '[build-ci-image]')}}; then
        echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
        echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
        echo "setting it to DEV!"
      else
        echo "TAG=huggingface/transformers-${{ matrix.file }}" >> "$GITHUB_ENV"
      fi
  -
    name: Set up Docker Buildx
@@ -22,6 +22,8 @@ env:
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1
  CUDA_VISIBLE_DEVICES: 0,1
.github/workflows/model_jobs.yml (1 change)
@@ -30,6 +30,7 @@ env:
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1
  CUDA_VISIBLE_DEVICES: 0,1

jobs:
.github/workflows/model_jobs_amd.yml (1 change)
@@ -30,6 +30,7 @@ env:
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1
  CUDA_VISIBLE_DEVICES: 0,1

jobs:
.github/workflows/push-important-models.yml (43 changes)
@@ -7,13 +7,14 @@ on:
env:
  OUTPUT_SLACK_CHANNEL_ID: "C06L2SGMEEA"
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`.
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`.
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1

jobs:
  get_modified_models:
@@ -24,13 +25,13 @@ jobs:
    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Get changed files
        id: changed-files
        uses: tj-actions/changed-files@3f54ebb830831fc121d3263c1857cfbdc310cdb9 #v42
        with:
          files: src/transformers/models/**

      - name: Run step if only the files listed above change
        if: steps.changed-files.outputs.any_changed == 'true'
        id: set-matrix
@@ -59,41 +60,41 @@ jobs:
    if: ${{ needs.get_modified_models.outputs.matrix != '[]' && needs.get_modified_models.outputs.matrix != '' && fromJson(needs.get_modified_models.outputs.matrix)[0] != null }}
    strategy:
      fail-fast: false
      matrix:
      matrix:
        model-name: ${{ fromJson(needs.get_modified_models.outputs.matrix) }}

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Install locally transformers & other libs
        run: |
          apt install sudo
          sudo -H pip install --upgrade pip
          sudo -H pip uninstall -y transformers
          sudo -H pip install -U -e ".[testing]"
          sudo -H pip uninstall -y transformers
          sudo -H pip install -U -e ".[testing]"
          MAX_JOBS=4 pip install flash-attn --no-build-isolation
          pip install bitsandbytes

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Show installed libraries and their versions
        run: pip freeze

      - name: Run FA2 tests
        id: run_fa2_tests
        run:
          pytest -rsfE -m "flash_attn_test" --make-reports=${{ matrix.model-name }}_fa2_tests/ tests/${{ matrix.model-name }}/test_modeling_*

      - name: "Test suite reports artifacts: ${{ matrix.model-name }}_fa2_tests"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.model-name }}_fa2_tests
          path: /transformers/reports/${{ matrix.model-name }}_fa2_tests

      - name: Post to Slack
        if: always()
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
@@ -102,13 +103,13 @@ jobs:
          title: 🤗 Results of the FA2 tests - ${{ matrix.model-name }}
          status: ${{ steps.run_fa2_tests.conclusion}}
          slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }}

      - name: Run integration tests
        id: run_integration_tests
        if: always()
        run:
          pytest -rsfE -k "IntegrationTest" --make-reports=tests_integration_${{ matrix.model-name }} tests/${{ matrix.model-name }}/test_modeling_*

      - name: "Test suite reports artifacts: tests_integration_${{ matrix.model-name }}"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
@@ -118,7 +119,7 @@ jobs:

      - name: Post to Slack
        if: always()
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }}
          title: 🤗 Results of the Integration tests - ${{ matrix.model-name }}
.github/workflows/self-comment-ci.yml (1 change)
@@ -22,6 +22,7 @@ env:
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1
  CUDA_VISIBLE_DEVICES: 0,1

jobs:
.github/workflows/self-push-amd.yml (1 change)
@@ -14,6 +14,7 @@ env:
  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 60
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}

jobs:
.github/workflows/self-push.yml (9 changes)
@@ -24,6 +24,7 @@ env:
  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 60
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1
  CUDA_VISIBLE_DEVICES: 0,1

jobs:
@@ -292,7 +293,7 @@ jobs:
          echo "$machine_type"
          echo "machine_type=$machine_type" >> $GITHUB_ENV

      - name: Update clone using environment variables
        working-directory: /transformers
        run: |
@@ -405,7 +406,7 @@ jobs:
          echo "$machine_type"
          echo "machine_type=$machine_type" >> $GITHUB_ENV

      - name: Update clone using environment variables
        working-directory: /workspace/transformers
        run: |
@@ -515,7 +516,7 @@ jobs:
          echo "$machine_type"
          echo "machine_type=$machine_type" >> $GITHUB_ENV

      - name: Update clone using environment variables
        working-directory: /workspace/transformers
        run: |
@@ -647,6 +648,6 @@ jobs:
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
          pip install huggingface_hub
          pip install slack_sdk
          pip install slack_sdk
          pip show slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
.github/workflows/self-scheduled-caller.yml (22 changes)
@@ -7,19 +7,19 @@ on:
    - cron: "17 2 * * *"
  push:
    branches:
      - refactor-from-pretrained-base-commit
      - run_scheduled_ci*

jobs:
  # model-ci:
  #   name: Model CI
  #   uses: ./.github/workflows/self-scheduled.yml
  #   with:
  #     job: run_models_gpu
  #     slack_report_channel: "#transformers-ci-daily-models"
  #     runner: daily-ci
  #     docker: huggingface/transformers-all-latest-gpu
  #     ci_event: Daily CI
  #   secrets: inherit
  model-ci:
    name: Model CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_models_gpu
      slack_report_channel: "#transformers-ci-daily-models"
      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
    secrets: inherit

  torch-pipeline:
    name: Torch pipeline CI
.github/workflows/self-scheduled.yml (3 changes)
@@ -40,6 +40,7 @@ env:
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1
  CUDA_VISIBLE_DEVICES: 0,1
  NUM_SLICES: 2

@@ -570,4 +571,4 @@ jobs:
    with:
      docker: ${{ inputs.docker }}
      start_sha: ${{ github.sha }}
    secrets: inherit
    secrets: inherit
.github/workflows/ssh-runner.yml (19 changes)
@@ -5,7 +5,7 @@ on:
    inputs:
      runner_type:
        description: 'Type of runner to test (a10 or t4)'
        required: true
        required: true
      docker_image:
        description: 'Name of the Docker image'
        required: true
@@ -15,14 +15,15 @@ on:

env:
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`.
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`.
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  CUDA_VISIBLE_DEVICES: 0,1
  RUN_PT_TF_CROSS_TESTS: 1

jobs:
  get_runner:
@@ -77,7 +78,7 @@ jobs:
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      - name: NVIDIA-SMI
        run: |
          nvidia-smi
@@ -343,6 +343,8 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t

Like the slow tests, there are other environment variables available which are not enabled by default during testing:
- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
- `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration.
- `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration.
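As a rough illustration (not part of this diff), such a variable is usually combined with the matching pytest marker registered in `conftest.py`; the test path here is a placeholder:

```bash
# Hypothetical invocation: enable the PT/TF cross tests and select only tests
# carrying the `is_pt_tf_cross_test` marker.
RUN_PT_TF_CROSS_TESTS=1 python -m pytest -m is_pt_tf_cross_test tests/models/bert/test_modeling_tf_bert.py
```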
More environment variables and additional information can be found in the [testing_utils.py](https://github.com/huggingface/transformers/blob/main/src/transformers/testing_utils.py).
@@ -61,6 +61,7 @@ NOT_DEVICE_TESTS = {
    "test_load_save_without_tied_weights",
    "test_tied_weights_keys",
    "test_model_weights_reload_no_missing_tied_weights",
    "test_pt_tf_model_equivalence",
    "test_mismatched_shapes_have_properly_initialized_weights",
    "test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist",
    "test_model_is_small",
@@ -84,6 +85,12 @@ warnings.simplefilter(action="ignore", category=FutureWarning)


def pytest_configure(config):
    config.addinivalue_line(
        "markers", "is_pt_tf_cross_test: mark test to run only when PT and TF interactions are tested"
    )
    config.addinivalue_line(
        "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested"
    )
    config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested")
    config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
    config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
@@ -2,10 +2,10 @@ FROM rocm/dev-ubuntu-22.04:6.2.4
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive
ARG PYTORCH='2.6.0'
ARG TORCH_VISION='0.21.0'
ARG TORCH_AUDIO='2.6.0'
ARG ROCM='6.2.4'
ARG PYTORCH='2.5.1'
ARG TORCH_VISION='0.20.0'
ARG TORCH_AUDIO='2.5.0'
ARG ROCM='6.2'

RUN apt update && \
    apt install -y --no-install-recommends \
@@ -16,11 +16,9 @@ RUN apt update && \
    python-is-python3 \
    rocrand-dev \
    rocthrust-dev \
    rocblas-dev \
    hipsolver-dev \
    hipsparse-dev \
    hipblas-dev \
    hipblaslt-dev && \
    rocblas-dev && \
    apt clean && \
    rm -rf /var/lib/apt/lists/*
@@ -76,9 +76,6 @@ RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git
RUN python3 -m pip install --no-cache-dir flute-kernel==0.3.0 -i https://flute-ai.github.io/whl/cu118
RUN python3 -m pip install --no-cache-dir fast_hadamard_transform==1.0.4.post1

# Add compressed-tensors for quantization testing
RUN python3 -m pip install --no-cache-dir compressed-tensors

# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop
@@ -673,29 +673,6 @@ tpu_use_sudo: false
use_cpu: false
```

</hfoption>
<hfoption id="Tensor Parallelism with PyTorch 2">

```yml
compute_environment: LOCAL_MACHINE
tp_config:
  tp_size: 4
distributed_type: TP
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
```

</hfoption>
</hfoptions>

The [`accelerate_launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) command is the recommended way to launch your training script on a distributed system with Accelerate and [`Trainer`], using the parameters specified in `config_file.yaml`. This file is saved to the Accelerate cache folder and is loaded automatically when you run `accelerate_launch`.
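A minimal sketch of that workflow (the training script name is assumed for illustration):

```bash
# Point accelerate at an explicit config file instead of the cached one,
# then launch the training script on the configured distributed setup.
accelerate launch --config_file config_file.yaml train.py
```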
@@ -283,6 +283,8 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t

As with the slow tests, there are other environment variables that are not set by default during testing:

* `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
* `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration.
* `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration.

More environment variables and additional information can be found in [testing_utils.py](src/transformers/testing_utils.py).
@@ -461,8 +461,6 @@
      title: Granite
    - local: model_doc/granitemoe
      title: GraniteMoe
    - local: model_doc/granitemoeshared
      title: GraniteMoeShared
    - local: model_doc/granitevision
      title: GraniteVision
    - local: model_doc/helium
@@ -173,7 +173,6 @@ Flax), PyTorch, and/or TensorFlow.
| [GPTSAN-japanese](model_doc/gptsan-japanese) | ✅ | ❌ | ❌ |
| [Granite](model_doc/granite) | ✅ | ❌ | ❌ |
| [GraniteMoeMoe](model_doc/granitemoe) | ✅ | ❌ | ❌ |
| [GraniteMoeSharedMoe](model_doc/granitemoeshared) | ✅ | ❌ | ❌ |
| [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ |
| [Grounding DINO](model_doc/grounding-dino) | ✅ | ❌ | ❌ |
| [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ |
@@ -55,7 +55,7 @@ To give some examples of how much VRAM it roughly takes to load a model in bfloa

As of writing this document, the largest GPU chip on the market is the A100 & H100 offering 80GB of VRAM. Most of the models listed before require more than 80GB just to be loaded and therefore necessarily require [tensor parallelism](https://huggingface.co/docs/transformers/perf_train_gpu_many#tensor-parallelism) and/or [pipeline parallelism](https://huggingface.co/docs/transformers/perf_train_gpu_many#naive-model-parallelism-vertical-and-pipeline-parallelism).

🤗 Transformers now supports tensor parallelism for supported models having `base_tp_plan` in their respective config classes. Learn more about Tensor Parallelism [here](perf_train_gpu_many#tensor-parallelism). Furthermore, if you're interested in writing models in a tensor-parallelism-friendly way, feel free to have a look at [the text-generation-inference library](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models/custom_modeling).
🤗 Transformers does not support tensor parallelism out of the box as it requires the model architecture to be written in a specific way. If you're interested in writing models in a tensor-parallelism-friendly way, feel free to have a look at [the text-generation-inference library](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models/custom_modeling).

Naive pipeline parallelism is supported out of the box. For this, simply load the model with `device="auto"` which will automatically place the different layers on the available GPUs as explained [here](https://huggingface.co/docs/accelerate/v0.22.0/en/concept_guides/big_model_inference).
Note, however, that while very effective, this naive pipeline parallelism does not tackle the issues of GPU idling. For this, more advanced pipeline parallelism is required as explained [here](https://huggingface.co/docs/transformers/en/perf_train_gpu_many#naive-model-parallelism-vertical-and-pipeline-parallelism).
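A minimal sketch of that naive pipeline-parallel loading (the checkpoint is only an example; current `from_pretrained` versions expose the option as `device_map="auto"`):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# "auto" lets accelerate spread the layers over all visible GPUs (and CPU if needed),
# which is the naive pipeline parallelism described above.
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom",        # example checkpoint; any causal LM works
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom")
print(model.hf_device_map)     # shows which device each block was placed on
```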
@@ -1,66 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# GraniteMoeShared

## Overview

The GraniteMoe model was proposed in [Power Scheduler: A Batch Size and Token Number Agnostic Learning Rate Scheduler](https://arxiv.org/abs/2408.13359) by Yikang Shen, Matthew Stallone, Mayank Mishra, Gaoyuan Zhang, Shawn Tan, Aditya Prasad, Adriana Meza Soria, David D. Cox and Rameswar Panda.

Additionally, the GraniteMoeSharedModel class adds shared experts for MoE.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "ibm-research/moe-7b-1b-active-shared-experts"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# drop device_map if running on CPU
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
model.eval()

# change input text as desired
prompt = "Write a code to find the maximum value in a list of numbers."

# tokenize the text
input_tokens = tokenizer(prompt, return_tensors="pt")
# generate output tokens
output = model.generate(**input_tokens, max_new_tokens=100)
# decode output tokens into text
output = tokenizer.batch_decode(output)
# loop over the batch to print, in this example the batch size is 1
for i in output:
    print(i)
```

This HF implementation is contributed by [Mayank Mishra](https://huggingface.co/mayank-mishra), [Shawn Tan](https://huggingface.co/shawntan) and [Sukriti Sharma](https://huggingface.co/SukritiSharma).

## GraniteMoeSharedConfig

[[autodoc]] GraniteMoeSharedConfig

## GraniteMoeSharedModel

[[autodoc]] GraniteMoeSharedModel
    - forward

## GraniteMoeSharedForCausalLM

[[autodoc]] GraniteMoeSharedForCausalLM
    - forward
@@ -60,7 +60,6 @@ FlashAttention-2 is currently supported for the following architectures:
* [GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj#transformers.GPTJModel)
* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel)
* [GraniteMoe](https://huggingface.co/docs/transformers/model_doc/granitemoe#transformers.GraniteMoeModel)
* [GraniteMoeShared](https://huggingface.co/docs/transformers/model_doc/granitemoeshared#transformers.GraniteMoeSharedModel)
* [Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2#transformers.Idefics2Model)
* [Idefics3](https://huggingface.co/docs/transformers/model_doc/idefics3#transformers.Idefics3Model)
* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
@@ -267,7 +266,6 @@ For now, Transformers supports SDPA inference and training for the following arc
* [Idefics3](https://huggingface.co/docs/transformers/model_doc/idefics3#transformers.Idefics3Model)
* [I-JEPA](https://huggingface.co/docs/transformers/model_doc/ijepa#transformers.IJepaModel)
* [GraniteMoe](https://huggingface.co/docs/transformers/model_doc/granitemoe#transformers.GraniteMoeModel)
* [GraniteMoeShared](https://huggingface.co/docs/transformers/model_doc/granitemoeshared#transformers.GraniteMoeSharedModel)
* [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
* [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
@@ -450,13 +450,12 @@ Implementations:
- [parallelformers](https://github.com/tunib-ai/parallelformers) (only inference at the moment)
- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS.
- [OSLO](https://github.com/tunib-ai/oslo) has the tensor parallelism implementation based on the Transformers.
- [`transformers` integration](main_classes/trainer) tensor parallelism is available through tp_size attribute for models having `base_tp_plan`. Further you can look at [example usage](perf_infer_gpu_multi)

SageMaker combines TP with DP for a more efficient processing.

🤗 Transformers status:
- core: uses PyTorch 2 APIs to support tensor parallelism to models having base_tp_plan in their respective config classes.
- Alternatively, you can as well try [parallelformers](https://github.com/tunib-ai/parallelformers) that provides this support for most of our models. Training mode with TP is as well supported natively in transformers.
- core: not yet implemented in the core
- but if you want inference [parallelformers](https://github.com/tunib-ai/parallelformers) provides this support for most of our models. So until this is implemented in the core you can use theirs. And hopefully training mode will be supported too.
- Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode, see more [here](https://www.deepspeed.ai/tutorials/inference-tutorial/)

🤗 Accelerate integrates with [TP from Megatron-LM](https://huggingface.co/docs/accelerate/v0.23.0/en/usage_guides/megatron_lm).
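A minimal sketch of the `base_tp_plan` path, assuming a recent Transformers release where `from_pretrained` accepts `tp_plan="auto"` (the launch command, script name, and checkpoint are illustrative, and the script is meant to run under `torchrun` with one process per GPU):

```python
# torchrun --nproc-per-node 4 run_tp.py   (hypothetical launch command and file name)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # example model that ships a base_tp_plan
# tp_plan="auto" asks Transformers to shard the weights across the processes
# started by torchrun, following the model's built-in tensor-parallel plan.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("Tensor parallelism splits each weight matrix across GPUs.", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```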
@@ -536,7 +535,7 @@ Important papers:
- [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model](https://arxiv.org/abs/2201.11990)

🤗 Transformers status: not yet implemented, since we have no PP.
🤗 Transformers status: not yet implemented, since we have no PP and TP.

## FlexFlow
@@ -799,29 +799,6 @@ tpu_use_sudo: false
use_cpu: false
```

</hfoption>
<hfoption id="Tensor Parallelism with PyTorch 2">

```yml
compute_environment: LOCAL_MACHINE
tp_config:
  tp_size: 4
distributed_type: TP
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
```

</hfoption>
</hfoptions>
@@ -361,30 +361,6 @@ use_cpu: false
```

</hfoption>

<hfoption id="Tensor Parallelism with PyTorch 2">

```yml
compute_environment: LOCAL_MACHINE
tp_config:
  tp_size: 4
distributed_type: TP
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
```

</hfoption>
</hfoptions>
@@ -85,7 +85,7 @@ python src/transformers/commands/transformers_cli.py env
3. Provide a *code snippet* that demonstrates how the feature would be used.
4. If the feature is related to a paper, please include a link to it.

If your issue is well written, 80% of the work is already done by the time it is created.
If your issue is well written, 80% of the work is already done by the time it is created.

There are also [templates](https://github.com/huggingface/transformers/tree/main/templates) to help you file an issue.

@@ -140,7 +140,7 @@ python src/transformers/commands/transformers_cli.py env
```

If 🤗 Transformers is already installed in your virtual environment, remove it with `pip uninstall transformers` before reinstalling it with the `-e` flag.

Depending on your OS, and as the number of optional dependencies of 🤗 Transformers grows, this command may fail. If that happens, install the deep learning framework you want to work with (PyTorch, TensorFlow, and/or Flax) and then run:

```bash
@@ -188,7 +188,7 @@ python src/transformers/commands/transformers_cli.py env
To learn more about these checks and how to fix related issues, check out the [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.

If you are modifying documents under the `docs/source` directory, make sure the documentation can still be built. This check also runs in CI when you open a pull request. To run a local check, install the doc builder:

```bash
pip install ".[docs]"
```
@@ -216,7 +216,7 @@ python src/transformers/commands/transformers_cli.py env
git fetch upstream
git rebase upstream/main
```

Push your changes to your branch:

```bash
@@ -238,7 +238,7 @@ python src/transformers/commands/transformers_cli.py env
☐ If you are adding a new feature, also add tests for it.<br>
- If you are adding a new model, use `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)` to enable the common tests.
- If you are adding a new `@slow` test, make sure it passes with `RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`.
- If you are adding a new tokenizer, write the tests and make sure they pass with `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py`.
- If you are adding a new tokenizer, write the tests and make sure they pass with `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py`.
- CircleCI does not run the slow tests, but GitHub Actions runs them every night!<br>

☐ All public methods must have informative docstrings (see [`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py) for an example).<br>
@@ -282,6 +282,8 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t

Like the slow tests, there are other environment variables that are not enabled by default during testing:
- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
- `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration.
- `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration.

More environment variables and additional information can be found in [testing_utils.py](src/transformers/testing_utils.py).
@@ -548,29 +548,6 @@ tpu_use_sudo: false
use_cpu: false
```

</hfoption>
<hfoption id="Tensor Parallelism with PyTorch 2">

```yml
compute_environment: LOCAL_MACHINE
tp_config:
  tp_size: 4
distributed_type: TP
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
```

</hfoption>
</hfoptions>
@@ -33,7 +33,7 @@ limitations under the License.
* Implement new models.
* Contribute to the examples or the documentation.

If you don't know where to start, there is a special [Good First Issue](https://github.com/huggingface/transformers/contribute) list. It gathers beginner-friendly open issues and helps you take your first steps contributing to open source. Just leave a comment on the issue you would like to work on.
If you don't know where to start, there is a special [Good First Issue](https://github.com/huggingface/transformers/contribute) list. It gathers beginner-friendly open issues and helps you take your first steps contributing to open source. Just leave a comment on the issue you would like to work on.

For something slightly more challenging, you can also look at the [Good Second Issue](https://github.com/huggingface/transformers/labels/Good%20Second%20Issue) list. In general, if you feel like you know what to do, go for it and we will help you get there! 🚀

@@ -139,7 +139,7 @@ python src/transformers/commands/transformers_cli.py env
```

If 🤗 Transformers is already installed in your virtual environment, uninstall it with `pip uninstall transformers` first, then reinstall it in editable mode with the `-e` flag.

Depending on your OS, and as the number of optional dependencies of Transformers grows, this command may fail. If that happens, make sure you have installed the deep learning framework you want to use (PyTorch, TensorFlow, and/or Flax), then run:

```bash
@@ -187,7 +187,7 @@ python src/transformers/commands/transformers_cli.py env
To learn more about these checks and how to fix related issues, read the [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.

If you modify documents under the `docs/source` directory, make sure the documentation can still be built. This check also runs in CI when you open a pull request. To run the check locally, make sure the doc builder is installed:

```bash
pip install ".[docs]"
```
@@ -281,6 +281,8 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t

Like the slow tests, there are other environment variables that are not enabled by default during testing:
- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
- `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration.
- `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration.

More environment variables and additional information can be found in [testing_utils.py](src/transformers/testing_utils.py).
@@ -61,7 +61,7 @@ from transformers.utils import check_min_version, send_example_telemetry
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset

@@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")

@@ -56,7 +56,7 @@ from transformers.utils import check_min_version, send_example_telemetry
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset

@@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

@@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")

@@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")

@@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

@@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
logger = get_logger(__name__)

@@ -43,7 +43,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

@@ -48,7 +48,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

@@ -53,7 +53,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

@@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

@@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

@@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
logger = get_logger(__name__)

@@ -58,7 +58,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
logger = get_logger(__name__)

@@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -46,7 +46,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
logger = logging.getLogger(__name__)

@@ -54,7 +54,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
logger = get_logger(__name__)
# You should update this to your particular problem to have better documentation of `model_type`

@@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt")

@@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
logging.basicConfig(level=logging.INFO)
logger = get_logger(__name__)

@@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")

@@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
logger = get_logger(__name__)

@@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

@@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

@@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

@@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

@@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

@@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

@@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

@@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
logger = get_logger(__name__)

@@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

@@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

@@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

@@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

@@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
@ -1,78 +0,0 @@
import json
from typing import Any, Dict

import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.quantizers import HfQuantizer, register_quantization_config, register_quantizer
from transformers.utils.quantization_config import QuantizationConfigMixin


@register_quantization_config("custom")
class CustomConfig(QuantizationConfigMixin):
    def __init__(self):
        self.quant_method = "custom"
        self.bits = 8

    def to_dict(self) -> Dict[str, Any]:
        output = {
            "num_bits": self.bits,
        }
        return output

    def __repr__(self):
        config_dict = self.to_dict()
        return f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True)}\n"

    def to_diff_dict(self) -> Dict[str, Any]:
        config_dict = self.to_dict()

        default_config_dict = CustomConfig().to_dict()

        serializable_config_dict = {}

        for key, value in config_dict.items():
            if value != default_config_dict[key]:
                serializable_config_dict[key] = value

        return serializable_config_dict


@register_quantizer("custom")
class CustomQuantizer(HfQuantizer):
    def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config
        self.scale_map = {}
        self.device = kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu")
        self.torch_dtype = kwargs.get("torch_dtype", torch.float32)

    def _process_model_before_weight_loading(self, model, **kwargs):
        return True

    def _process_model_after_weight_loading(self, model, **kwargs):
        return True

    def is_serializable(self) -> bool:
        return True

    def is_trainable(self) -> bool:
        return False


model_8bit = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m", quantization_config=CustomConfig(), torch_dtype="auto"
)

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
input_text = "once there is"
inputs = tokenizer(input_text, return_tensors="pt")
output = model_8bit.generate(
    **inputs,
    max_length=100,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)
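The deleted example above registers a "custom" quantization method. A tiny, hedged sanity check (not part of the diff) of its diff-dict logic: because CustomConfig takes no constructor arguments, every field equals its default, so nothing is serialized.

cfg = CustomConfig()
print(cfg.to_dict())       # {'num_bits': 8}
print(cfg.to_diff_dict())  # {} - all values match the defaults, so the diff is empty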
@ -1,257 +0,0 @@
import json
from typing import Any, Dict, List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from accelerate import init_empty_weights
from huggingface_hub import HfApi

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.quantizers import HfQuantizer, get_module_from_name, register_quantization_config, register_quantizer
from transformers.utils.quantization_config import QuantizationConfigMixin


# Implement INT8 Symmetric Linear layer
class Int8SymmetricLinear(torch.nn.Module):
    def __init__(self, in_features, out_features, bias, dtype=torch.float32):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        self.register_buffer("weight", torch.zeros((out_features, in_features), dtype=torch.int8))
        self.register_buffer("weight_scale", torch.zeros((out_features, 1), dtype=dtype))

        if bias:
            self.register_buffer("bias", torch.zeros((self.out_features), dtype=dtype))
        else:
            self.bias = None

    def forward(self, x):
        dequant_weight = self.weight * self.weight_scale
        output = F.linear(x, dequant_weight)
        if self.bias is not None:
            output = output + self.bias
        return output
# Function to replace standard linear layers with INT8 symmetric quantized layers
def _replace_with_int8_symmetric_linear(
    model,
    modules_to_not_convert=None,
    current_key_name=None,
    quantization_config=None,
    has_been_replaced=False,
    pre_quantized=False,
):
    """
    Recursively replaces nn.Linear modules with Int8SymmetricLinear modules.
    """
    if current_key_name is None:
        current_key_name = []

    for name, module in model.named_children():
        current_key_name.append(name)

        if (isinstance(module, nn.Linear)) and name not in modules_to_not_convert:
            # Check if the current key is not in the `modules_to_not_convert`
            current_key_name_str = ".".join(current_key_name)
            if not any(
                (key + "." in current_key_name_str) or (key == current_key_name_str) for key in modules_to_not_convert
            ):
                with init_empty_weights(include_buffers=True):
                    in_features = module.in_features
                    out_features = module.out_features
                    model._modules[name] = Int8SymmetricLinear(
                        in_features, out_features, module.bias is not None, dtype=module.weight.dtype
                    )
                    has_been_replaced = True
                    model._modules[name].requires_grad_(False)

        if len(list(module.children())) > 0:
            _, has_been_replaced = _replace_with_int8_symmetric_linear(
                module,
                modules_to_not_convert,
                current_key_name,
                quantization_config,
                has_been_replaced=has_been_replaced,
                pre_quantized=pre_quantized,
            )
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced


def replace_with_int8_symmetric_linear(
    model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, pre_quantized=False
):
    """
    Main function to replace model layers with INT8 symmetric quantized versions.
    """
    modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert

    if quantization_config.modules_to_not_convert is not None:
        modules_to_not_convert.extend(quantization_config.modules_to_not_convert)
    modules_to_not_convert = list(set(modules_to_not_convert))

    model, has_been_replaced = _replace_with_int8_symmetric_linear(
        model, modules_to_not_convert, current_key_name, quantization_config, pre_quantized=pre_quantized
    )

    if not has_been_replaced:
        raise ValueError(
            "You are loading your model using INT8 symmetric quantization but no linear modules were found in your model."
        )

    return model
@register_quantization_config("int8_symmetric")
|
||||
class Int8SymmetricConfig(QuantizationConfigMixin):
|
||||
"""
|
||||
Configuration for INT8 symmetric quantization.
|
||||
"""
|
||||
|
||||
def __init__(self, modules_to_not_convert: Optional[List[str]] = None, **kwargs):
|
||||
self.quant_method = "int8_symmetric"
|
||||
self.modules_to_not_convert = modules_to_not_convert
|
||||
|
||||
def __repr__(self):
|
||||
config_dict = self.to_dict()
|
||||
return f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True)}\n"
|
||||
|
||||
def to_diff_dict(self) -> Dict[str, Any]:
|
||||
config_dict = self.to_dict()
|
||||
default_config_dict = Int8SymmetricConfig().to_dict()
|
||||
|
||||
serializable_config_dict = {}
|
||||
for key, value in config_dict.items():
|
||||
if value != default_config_dict[key]:
|
||||
serializable_config_dict[key] = value
|
||||
|
||||
return serializable_config_dict
|
||||
|
||||
|
||||
@register_quantizer("int8_symmetric")
|
||||
class Int8SymmetricQuantizer(HfQuantizer):
|
||||
"""
|
||||
Implementation of INT8 symmetric quantization.
|
||||
|
||||
"""
|
||||
|
||||
requires_calibration = False
|
||||
requires_parameters_quantization = True
|
||||
|
||||
def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
|
||||
super().__init__(quantization_config, **kwargs)
|
||||
self.quantization_config = quantization_config
|
||||
|
||||
def _process_model_before_weight_loading(self, model, **kwargs):
|
||||
"""
|
||||
Replace model's linear layers with quantized versions before loading weights.
|
||||
"""
|
||||
self.modules_to_not_convert = self.quantization_config.modules_to_not_convert
|
||||
|
||||
model = replace_with_int8_symmetric_linear(
|
||||
model,
|
||||
modules_to_not_convert=self.modules_to_not_convert,
|
||||
quantization_config=self.quantization_config,
|
||||
pre_quantized=self.pre_quantized,
|
||||
)
|
||||
|
||||
def check_quantized_param(
|
||||
self,
|
||||
model,
|
||||
param_value: "torch.Tensor",
|
||||
param_name: str,
|
||||
state_dict: Dict[str, Any],
|
||||
**kwargs,
|
||||
):
|
||||
module, tensor_name = get_module_from_name(model, param_name)
|
||||
|
||||
if isinstance(module, Int8SymmetricLinear):
|
||||
if self.pre_quantized or tensor_name == "bias":
|
||||
if tensor_name == "weight" and param_value.dtype != torch.int8:
|
||||
raise ValueError("Expect quantized weights but got an unquantized weight")
|
||||
return False
|
||||
else:
|
||||
if tensor_name == "weight_scale":
|
||||
raise ValueError("Expect unquantized weights but got a quantized weight_scale")
|
||||
return True
|
||||
return False
|
||||
|
||||
def create_quantized_param(
|
||||
self,
|
||||
model,
|
||||
param_value: "torch.Tensor",
|
||||
param_name: str,
|
||||
target_device: "torch.device",
|
||||
state_dict: Dict[str, Any],
|
||||
unexpected_keys: Optional[List[str]] = None,
|
||||
):
|
||||
"""
|
||||
Quantizes weights to INT8 symmetric format.
|
||||
"""
|
||||
abs_max_per_row = torch.max(torch.abs(param_value), dim=1, keepdim=True)[0].clamp(min=1e-5)
|
||||
|
||||
weight_scale = abs_max_per_row / 127.0
|
||||
|
||||
weight_quantized = torch.round(param_value / weight_scale).clamp(-128, 127).to(torch.int8)
|
||||
|
||||
module, tensor_name = get_module_from_name(model, param_name)
|
||||
module._buffers[tensor_name] = weight_quantized.to(target_device)
|
||||
module._buffers["weight_scale"] = weight_scale.to(target_device)
|
||||
|
||||
def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
|
||||
not_missing_keys = []
|
||||
for name, module in model.named_modules():
|
||||
if isinstance(module, Int8SymmetricLinear):
|
||||
for missing in missing_keys:
|
||||
if (
|
||||
(name in missing or name in f"{prefix}.{missing}")
|
||||
and not missing.endswith(".weight")
|
||||
and not missing.endswith(".bias")
|
||||
):
|
||||
not_missing_keys.append(missing)
|
||||
return [k for k in missing_keys if k not in not_missing_keys]
|
||||
|
||||
def _process_model_after_weight_loading(self, model, **kwargs):
|
||||
"""
|
||||
Post-processing after weights are loaded.
|
||||
"""
|
||||
return True
|
||||
|
||||
def is_serializable(self, safe_serialization=None):
|
||||
return True
|
||||
|
||||
@property
|
||||
def is_trainable(self) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
# Example usage
if __name__ == "__main__":
    model_int8 = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.2-1B", quantization_config=Int8SymmetricConfig(), torch_dtype=torch.float, device_map="cpu"
    )

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
    input_text = "once there is"
    inputs = tokenizer(input_text, return_tensors="pt").to("cpu")
    output = model_int8.generate(
        **inputs,
        max_length=100,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(generated_text)

    # Save and upload to HUB
    output_model_dir = "Llama-3.2-1B-INT8-CUSTOM"
    model_int8.save_pretrained(output_model_dir)
    tokenizer.save_pretrained(output_model_dir)
    api = HfApi()
    repo_id = "medmekk/Llama-3.2-1B-INT8-CUSTOM"
    api.create_repo(repo_id, private=False)
    api.upload_folder(folder_path=output_model_dir, repo_id=repo_id, repo_type="model")
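For reference, a minimal sketch (not part of the diff; shapes are illustrative) of the symmetric INT8 round trip that create_quantized_param and Int8SymmetricLinear.forward above implement: per-row abs-max scaling to int8, then dequantization by multiplying the scale back.

import torch

w = torch.randn(4, 8)
scale = torch.abs(w).max(dim=1, keepdim=True)[0].clamp(min=1e-5) / 127.0
q = torch.round(w / scale).clamp(-128, 127).to(torch.int8)
w_hat = q * scale  # dequantized weight used in the forward pass
print((w - w_hat).abs().max())  # rounding error is at most half a scale step per element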
@ -1,5 +1,5 @@
datasets==2.3.2
transformers==4.48.0
transformers==4.38.0
wandb==0.13.1
evaluate==0.2.2
scikit-learn==1.5.0
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")

require_version(
    "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt"

@ -55,7 +55,7 @@ from transformers.utils.versions import require_version

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

@ -50,7 +50,7 @@ from transformers.utils import check_min_version, send_example_telemetry

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")

logger = logging.getLogger(__name__)

@ -62,7 +62,7 @@ except (ModuleNotFoundError, ImportError):

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")

logger = logging.getLogger(__name__)

@ -53,7 +53,7 @@ from transformers.utils.versions import require_version

# region Checking dependencies
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

@ -47,7 +47,7 @@ from transformers.utils import check_min_version, send_example_telemetry

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")

task_to_keys = {
    "cola": ("sentence", None),

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version

# region Dependencies and constants
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

@ -83,7 +83,7 @@ checkpoint: 检查点

🤗 Transformers 提供了数以千计的预训练模型,支持 100 多种语言的文本分类、信息抽取、问答、摘要、翻译、文本生成。它的宗旨是让最先进的 NLP 技术人人易用。

🤗 Transformers 提供了便于快速下载和使用的API,让你可以把预训练模型用在给定文本、在你的数据集上微调然后通过 [model hub](https://huggingface.co/models) 与社区共享。同时,每个定义的 Python 模块都是完全独立的,便于修改和快速进行研究实验。
🤗 Transformers 提供了便于快速下载和使用的API,让你可以把预训练模型用在给定文本、在你的数据集上微调然后通过 [model hub](https://huggingface.co/models) 与社区共享。同时,每个定义的 Python 模块均完全独立,方便修改和快速研究实验。

🤗 Transformers 支持三个最热门的深度学习库: [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) 以及 [TensorFlow](https://www.tensorflow.org/) — 并与之无缝整合。你可以直接使用一个框架训练你的模型然后用另一个加载和推理。
setup.py

@ -437,7 +437,7 @@ install_requires = [

setup(
    name="transformers",
    version="4.50.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    version="4.49.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
    author_email="transformers@huggingface.co",
    description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",

@ -473,6 +473,8 @@ setup(
extras["tests_torch"] = deps_list()
extras["tests_tf"] = deps_list()
extras["tests_flax"] = deps_list()
extras["tests_torch_and_tf"] = deps_list()
extras["tests_torch_and_flax"] = deps_list()
extras["tests_hub"] = deps_list()
extras["tests_pipelines_torch"] = deps_list()
extras["tests_pipelines_tf"] = deps_list()
@ -18,7 +18,7 @@

# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).

__version__ = "4.50.0.dev0"
__version__ = "4.49.0.dev0"

from typing import TYPE_CHECKING

@ -496,7 +496,6 @@ _import_structure = {
    "models.gptj": ["GPTJConfig"],
    "models.granite": ["GraniteConfig"],
    "models.granitemoe": ["GraniteMoeConfig"],
    "models.granitemoeshared": ["GraniteMoeSharedConfig"],
    "models.grounding_dino": [
        "GroundingDinoConfig",
        "GroundingDinoProcessor",

@ -2540,14 +2539,6 @@ else:
        "GraniteMoePreTrainedModel",
    ]
)
_import_structure["models.granitemoeshared"].extend(
    [
        "GraniteMoeSharedForCausalLM",
        "GraniteMoeSharedModel",
        "GraniteMoeSharedPreTrainedModel",
    ]
)

_import_structure["models.grounding_dino"].extend(
    [
        "GroundingDinoForObjectDetection",

@ -5614,7 +5605,6 @@ if TYPE_CHECKING:
    from .models.gptj import GPTJConfig
    from .models.granite import GraniteConfig
    from .models.granitemoe import GraniteMoeConfig
    from .models.granitemoeshared import GraniteMoeSharedConfig
    from .models.grounding_dino import (
        GroundingDinoConfig,
        GroundingDinoProcessor,

@ -7489,11 +7479,6 @@ if TYPE_CHECKING:
        GraniteMoeModel,
        GraniteMoePreTrainedModel,
    )
    from .models.granitemoeshared import (
        GraniteMoeSharedForCausalLM,
        GraniteMoeSharedModel,
        GraniteMoeSharedPreTrainedModel,
    )
    from .models.grounding_dino import (
        GroundingDinoForObjectDetection,
        GroundingDinoModel,
@ -390,7 +390,6 @@ def spectrogram(
    center: bool = True,
    pad_mode: str = "reflect",
    onesided: bool = True,
    dither: float = 0.0,
    preemphasis: Optional[float] = None,
    mel_filters: Optional[np.ndarray] = None,
    mel_floor: float = 1e-10,

@ -461,12 +460,6 @@ def spectrogram(
    onesided (`bool`, *optional*, defaults to `True`):
        If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1`
        frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins.
    dither (`float`, *optional*, defaults to 0.0):
        Adds dithering. In other words, adds a small Gaussian noise to each frame.
        E.g. use 4.0 to add dithering with a normal distribution centered
        around 0.0 with standard deviation 4.0, 0.0 means no dithering.
        Dithering has similar effect as `mel_floor`. It reduces the high log_mel_fbank
        values for signals with hard-zero sections, when VAD cutoff is present in the signal.
    preemphasis (`float`, *optional*)
        Coefficient for a low-pass filter that applies pre-emphasis before the DFT.
    mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*):

@ -547,9 +540,6 @@ def spectrogram(
    for frame_idx in range(num_frames):
        buffer[:frame_length] = waveform[timestep : timestep + frame_length]

        if dither != 0.0:
            buffer[:frame_length] += dither * np.random.randn(frame_length)

        if remove_dc_offset:
            buffer[:frame_length] = buffer[:frame_length] - buffer[:frame_length].mean()

@ -601,7 +591,6 @@ def spectrogram_batch(
    center: bool = True,
    pad_mode: str = "reflect",
    onesided: bool = True,
    dither: float = 0.0,
    preemphasis: Optional[float] = None,
    mel_filters: Optional[np.ndarray] = None,
    mel_floor: float = 1e-10,

@ -664,10 +653,6 @@ def spectrogram_batch(
        The padding strategy when `center` is `True`.
    onesided (`bool`, *optional*, defaults to `True`):
        If True, returns a one-sided spectrogram for real input signals.
    dither (`float`, *optional*, defaults to 0.0):
        Adds dithering. In other words, adds a small Gaussian noise to each frame.
        E.g. use 4.0 to add dithering with a normal distribution centered
        around 0.0 with standard deviation 4.0, 0.0 means no dithering.
    preemphasis (`float`, *optional*):
        Applies a pre-emphasis filter to each frame.
    mel_filters (`np.ndarray`, *optional*):

@ -756,9 +741,6 @@ def spectrogram_batch(
        timestep = frame_idx * hop_length
        buffer[:, :frame_length] = padded_waveform_batch[:, timestep : timestep + frame_length]

        if dither != 0.0:
            buffer[:, :frame_length] += dither * np.random.randn(*buffer[:, :frame_length].shape)

        if remove_dc_offset:
            buffer[:, :frame_length] -= buffer[:, :frame_length].mean(axis=1, keepdims=True)
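The removed `dither` option above simply adds small Gaussian noise to each analysis frame before the DFT. A minimal numpy sketch (not part of the diff; the frame length is illustrative):

import numpy as np

frame = np.zeros(400, dtype=np.float64)  # a hard-zero frame, e.g. after a VAD cutoff
dither = 4.0
frame += dither * np.random.randn(frame.shape[0])  # noise drawn from N(0, dither**2)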
@ -9,7 +9,12 @@ import torch
from packaging import version

from .configuration_utils import PretrainedConfig
from .utils import is_hqq_available, is_optimum_quanto_available, logging
from .utils import (
    is_hqq_available,
    is_optimum_quanto_available,
    is_torchdynamo_compiling,
    logging,
)
from .utils.deprecation import deprecate_kwarg

@ -19,7 +24,7 @@ if is_hqq_available():
logger = logging.get_logger(__name__)

class Cache:
class Cache(torch.nn.Module):
    """
    Base, abstract class for all caches. The actual data structure is specific to each subclass.
    """

@ -358,7 +363,8 @@ class DynamicCache(Cache):
    ```
    """

    def __init__(self) -> None:
    @deprecate_kwarg("num_hidden_layers", version="4.47.0")
    def __init__(self, num_hidden_layers: Optional[int] = None) -> None:
        super().__init__()
        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen
        self.key_cache: List[torch.Tensor] = []

@ -460,7 +466,10 @@ class DynamicCache(Cache):
        return legacy_cache

    @classmethod
    def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
    @deprecate_kwarg("num_hidden_layers", version="4.47.0")
    def from_legacy_cache(
        cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, num_hidden_layers: int = None
    ) -> "DynamicCache":
        """Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for
        backward compatibility."""
        cache = cls()

@ -486,7 +495,10 @@ class DynamicCache(Cache):
        self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
        self.value_cache[idx] = self.value_cache[idx][..., :max_length, :]

    def batch_split(self, full_batch_size: int, split_size: int) -> List["DynamicCache"]:
    @deprecate_kwarg("num_hidden_layers", version="4.47.0")
    def batch_split(
        self, full_batch_size: int, split_size: int, num_hidden_layers: int = None
    ) -> List["DynamicCache"]:
        """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
        `_split_model_inputs()` in `generation.utils`"""
        out = []

@ -499,7 +511,8 @@ class DynamicCache(Cache):
        return out

    @classmethod
    def from_batch_splits(cls, splits: List["DynamicCache"]) -> "DynamicCache":
    @deprecate_kwarg("num_hidden_layers", version="4.47.0")
    def from_batch_splits(cls, splits: List["DynamicCache"], num_hidden_layers: int = None) -> "DynamicCache":
        """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
        `generation.utils`"""
        cache = cls()

@ -1135,10 +1148,18 @@ class StaticCache(Cache):
        layer_device = self.device
        new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
        new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
        # Note: `mark_static_address` is used to tag the cache as a fixed data pointer,
        # preventing compiled graph breaks when updating the cache.
        torch._dynamo.mark_static_address(new_layer_key_cache)
        torch._dynamo.mark_static_address(new_layer_value_cache)
        # Notes:
        # 1. `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
        # breaks when updating the cache. It can't be used if the cache code is being compiled (but in that case
        # it is not needed anyway)
        # 2. `torch.export()` requires mutations to be registered as buffers.
        if not is_torchdynamo_compiling():
            self.register_buffer(f"key_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=layer_device))
            self.register_buffer(f"value_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=layer_device))
            new_layer_key_cache = getattr(self, f"key_cache_{idx}")
            new_layer_value_cache = getattr(self, f"value_cache_{idx}")
            torch._dynamo.mark_static_address(new_layer_key_cache)
            torch._dynamo.mark_static_address(new_layer_value_cache)
        self.key_cache.append(new_layer_key_cache)
        self.value_cache.append(new_layer_value_cache)

@ -1506,7 +1527,10 @@ class EncoderDecoderCache(Cache):
        self.check_dynamic_cache(self.crop.__name__)
        self.self_attention_cache.crop(maximum_length)

    def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]":
    @deprecate_kwarg("num_hidden_layers", version="4.47.0")
    def batch_split(
        self, full_batch_size: int, split_size: int, num_hidden_layers: int = None
    ) -> "List[EncoderDecoderCache]":
        """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
        `_split_model_inputs()` in `generation.utils`"""
        self.check_dynamic_cache(self.batch_split.__name__)

@ -1519,7 +1543,10 @@ class EncoderDecoderCache(Cache):
        return out

    @classmethod
    def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache":
    @deprecate_kwarg("num_hidden_layers", version="4.47.0")
    def from_batch_splits(
        cls, splits: List["EncoderDecoderCache"], num_hidden_layers: int = None
    ) -> "EncoderDecoderCache":
        """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
        `generation.utils`"""
        self_attention_cache = DynamicCache()
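A hedged usage sketch (not part of the diff; shapes are illustrative) of the cache batch split / merge round trip touched above. In the version shown in these hunks, batch_split and from_batch_splits also accept a deprecated num_hidden_layers argument, which is omitted here.

import torch
from transformers import DynamicCache

cache = DynamicCache()
# one layer, batch 4, 2 heads, 3 cached positions, head_dim 8
cache.update(torch.randn(4, 2, 3, 8), torch.randn(4, 2, 3, 8), layer_idx=0)

splits = cache.batch_split(full_batch_size=4, split_size=2)   # two caches of batch size 2
merged = DynamicCache.from_batch_splits(splits)               # re-concatenated along the batch dim
assert merged.key_cache[0].shape == (4, 2, 3, 8)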
@ -420,7 +420,6 @@ class GenerationMixin:
        model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format)

        # 4. Create missing `position_ids` on the fly
        encoder_attention_mask = attention_mask if self.config.is_encoder_decoder else None
        attention_mask = (
            kwargs.pop("decoder_attention_mask", None) if self.config.is_encoder_decoder else attention_mask
        )

@ -491,9 +490,6 @@ class GenerationMixin:
        if attention_mask is not None:
            model_inputs[attention_mask_key] = attention_mask

        if encoder_attention_mask is not None:
            model_inputs["attention_mask"] = encoder_attention_mask

        # 7. Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
        for key, value in kwargs.items():
            if key not in model_inputs:

@ -4524,7 +4520,7 @@ def _ranking_fast(
    return selected_idx


def _split(data, full_batch_size: int, split_size: int = None):
def _split(data, full_batch_size: int, num_hidden_layers: int, split_size: int = None):
    """
    Takes care of three cases:
    1. data is a tensor: e.g. last_hidden_state, pooler_output etc. split them on the batch_size dim

@ -4542,7 +4538,7 @@ def _split(data, full_batch_size: int, split_size: int = None):
    elif isinstance(data, DynamicCache) or (
        isinstance(data, EncoderDecoderCache) and isinstance(data.self_attention_cache, DynamicCache)
    ):
        return data.batch_split(full_batch_size, split_size)
        return data.batch_split(full_batch_size, split_size, num_hidden_layers)
    elif isinstance(data, tuple):
        # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
        if isinstance(data[0], tuple):

@ -4595,9 +4591,11 @@ def _split_model_inputs(
    keys_to_ignore = ["cache_position", "encoder_outputs", "logits_to_keep"]
    non_bool_keys = [k for k in keys if not isinstance(model_input[k], bool) and k not in keys_to_ignore]

    num_hidden_layers = config.get_text_config().num_hidden_layers

    # we split the tensors and tuples of tensors
    data_split_list = [
        {k: _split(model_input[k], full_batch_size, split_size)[i] for k in non_bool_keys}
        {k: _split(model_input[k], full_batch_size, num_hidden_layers, split_size)[i] for k in non_bool_keys}
        for i in range(full_batch_size // split_size)
    ]
    # bool values are the same and replicated for each split

@ -4634,6 +4632,7 @@ def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConf

    # Infer the class from the first object in the list
    model_output_cls = type(model_outputs[0])
    num_hidden_layers = config.get_text_config().num_hidden_layers

    # Ensure all objects are of the same type
    if not all(isinstance(obj, model_output_cls) for obj in model_outputs):

@ -4650,9 +4649,9 @@ def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConf
        return torch.cat(data, dim=0)
    # New cache format
    elif isinstance(data[0], DynamicCache):
        return DynamicCache.from_batch_splits(data)
        return DynamicCache.from_batch_splits(data, num_hidden_layers=num_hidden_layers)
    elif isinstance(data[0], EncoderDecoderCache):
        return EncoderDecoderCache.from_batch_splits(data)
        return EncoderDecoderCache.from_batch_splits(data, num_hidden_layers=num_hidden_layers)
    elif isinstance(data[0], tuple):
        # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
        if isinstance(data[0][0], tuple):
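For the tensor case handled by _split above, the behaviour is plain slicing along the batch dimension. A minimal sketch (not part of the diff):

import torch

data = torch.arange(8).reshape(4, 2)                   # full_batch_size = 4
chunks = [data[i : i + 2] for i in range(0, 4, 2)]     # split_size = 2
assert len(chunks) == 2 and chunks[0].shape == (2, 2)  # two batches of size 2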
@ -16,7 +16,10 @@ from ..utils.import_utils import is_torch_available

if is_torch_available():
    from transformers import PreTrainedModel, StaticCache
    from transformers import (
        PreTrainedModel,
        StaticCache,
    )
    from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_3

@ -69,13 +72,9 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
            config=self.model.config,
            batch_size=self.model.generation_config.cache_config.batch_size,
            max_cache_len=self.model.generation_config.cache_config.max_cache_len,
            device=self.model.generation_config.cache_config.device,
            dtype=self.model.dtype,
            device=self.model.generation_config.cache_config.device,
        )
        for i in range(len(self.static_cache.key_cache)):
            self.register_buffer(f"key_cache_{i}", self.static_cache.key_cache[i], persistent=False)
            self.register_buffer(f"value_cache_{i}", self.static_cache.value_cache[i], persistent=False)

        self.is_causal = any("CausalLM" in arch for arch in self.model.config.architectures)
        if self.is_causal:
            causal_mask = torch.tril(

@ -110,15 +109,12 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
        """
        _, seqlen = input_ids.shape
        attn_mask = self.mask[cache_position, :seqlen] if self.is_causal else None
        position_ids = cache_position.unsqueeze(0)
        past_key_values = self.static_cache

        outs = self.model(
            input_ids=input_ids,
            attention_mask=attn_mask,
            position_ids=position_ids,
            position_ids=cache_position.unsqueeze(0),
            cache_position=cache_position,
            past_key_values=past_key_values,
            past_key_values=self.static_cache,
            use_cache=True,
        )
        return outs.logits

@ -147,7 +143,7 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
        prompt_token_len = prompt_token_ids.shape[-1]
        max_generation_length = prompt_token_len + max_new_tokens
        for buffer_name, buffer in exported_program.named_buffers():
            if buffer_name.startswith("key_cache"):
            if buffer_name.startswith("static_cache.key_cache"):
                max_cache_len = buffer.shape[2]
                max_generation_length = min(max_generation_length, max_cache_len)
                break
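The change to startswith("static_cache.key_cache") above reflects how PyTorch names buffers owned by a submodule. A minimal sketch (not part of the diff) of that prefixing:

import torch

class Inner(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("key_cache_0", torch.zeros(1, 2, 4, 8))

class Wrapper(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.static_cache = Inner()  # buffers registered on this submodule get a "static_cache." prefix

print([name for name, _ in Wrapper().named_buffers()])  # ['static_cache.key_cache_0']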
@ -28,12 +28,15 @@ if is_torch_available():

if is_flute_available():
    from flute.integrations.higgs import prepare_data_transposed
    from flute.tune import TuneMetaData, qgemm_v2
    import flute.utils

if is_hadamard_available():
    from fast_hadamard_transform import hadamard_transform

if is_flute_available():
    import flute.utils
    from flute.integrations.higgs import prepare_data_transposed


def pad_to_block(tensor, dims, had_block_size, value=0):
    pad_dims = [0 for _ in range(2 * len(tensor.shape))]

@ -461,14 +464,14 @@ def quantize_with_higgs(weight, bits: int = 4, p: int = 2, group_size: int = 256

    # Quantize
    codes = torch.empty(weight.shape[:-1], device=device, dtype=torch.uint8)
    for i in range(0, weight.shape[0], 16):
        codes[i : i + 16] = torch.argmax(2 * weight[i : i + 16] @ grid.T - grid_norm_2, dim=-1).to(torch.uint8)
    for i in range(0, weight.shape[0], 64):
        codes[i : i + 64] = torch.argmax(2 * weight[i : i + 64] @ grid.T - grid_norm_2, dim=-1).to(torch.uint8)
    del weight

    codes = codes.reshape(codes.shape[0], -1)
    scales = scales / sqrt(hadamard_size)

    weight, scales, tables, tables2, tune_metadata = prepare_data_transposed(
    weight, scales, tables, tables2 = prepare_data_transposed(
        codes,
        torch.repeat_interleave(scales.to(dtype), hadamard_size // group_size, dim=1),
        grid.to(dtype),

@ -477,7 +480,6 @@ def quantize_with_higgs(weight, bits: int = 4, p: int = 2, group_size: int = 256
        vector_size=p,
        dtype=dtype,
        device=device,
        check_correctness=False,
    )

    return {

@ -485,7 +487,6 @@ def quantize_with_higgs(weight, bits: int = 4, p: int = 2, group_size: int = 256
        "scales": scales,
        "tables": tables,
        "tables2": tables2.view(dtype=torch.float16),
        "tune_metadata": tune_metadata,
    }

@ -507,6 +508,7 @@ class HiggsLinear(torch.nn.Module):
        self.num_bits = num_bits
        self.group_size = group_size
        self.hadamard_size = hadamard_size
        self.num_sms_packed = nn.Parameter(torch.tensor(-1, dtype=torch.int32, device=device), requires_grad=False)

        assert in_features % group_size == 0
        assert num_bits in [2, 3, 4]

@ -529,7 +531,6 @@ class HiggsLinear(torch.nn.Module):
            self.register_parameter("bias", None)

        self.workspace = None  # must be set externally to be reused among layers
        self.tune_metadata: TuneMetaData = None  # must be set externally because architecture dependent

    def forward(self, x):
        x = pad_to_block(x, [-1], self.hadamard_size)

@ -537,15 +538,16 @@ class HiggsLinear(torch.nn.Module):
        if self.workspace is None:
            raise Exception("Workspace must be set before calling forward")

        return qgemm_v2(
        return flute.qgemm_hadamard(
            x,
            self.weight,
            self.scales,
            self.tables,
            self.tables2.view(dtype=torch.float32),
            self.workspace,
            self.tune_metadata,
            hadamard_size=self.hadamard_size,
            self.num_bits,
            self.group_size,
            self.hadamard_size,
        )
@ -787,7 +787,6 @@ def _load_state_dict_into_meta_model(
    keep_in_fp32_modules=None,
    unexpected_keys=None,  # passing `unexpected` for cleanup from quantization items
    pretrained_model_name_or_path=None,  # for flagging the user when the model contains renamed keys
    device_mesh=None,
):
    """
    This is somewhat similar to `_load_state_dict_into_model`, but deals with a model that has some or all of its

@ -797,8 +796,6 @@ def _load_state_dict_into_meta_model(
    `start_prefix` is used for models which insert their name into model keys, e.g. `bert` in
    `bert.pooler.dense.weight`

    It also initialize tensor parallelism for each module if needed.

    """

    # XXX: remaining features to implement to be fully compatible with _load_state_dict_into_model

@ -812,12 +809,6 @@ def _load_state_dict_into_meta_model(

    is_torch_e4m3fn_available = hasattr(torch, "float8_e4m3fn")

    # we need this later to initialize tensor parallelism
    if device_mesh is not None:
        full_tp_plan = model.config.base_model_tp_plan
        for submodule in model.modules():
            full_tp_plan.update(getattr(submodule, "_tp_plan", {}))

    for param_name, param in state_dict.items():
        if param_name not in expected_keys:
            continue

@ -921,37 +912,6 @@ def _load_state_dict_into_meta_model(
            setattr(module, tensor_name, value)
            # TODO: consider removing used param_parts from state_dict before return

        # In this case, let's parallelize the modules!
        if device_mesh is not None:
            # Immediate parent
            split_parent_module_name = param_name.split(".")[:-1]
            parent_module_name = ".".join(split_parent_module_name)
            parent_module = model
            for name in split_parent_module_name:
                parent_module = getattr(parent_module, name)

            # Check if we are part of the tp_plan
            current_module_plan = None
            for param, plan in full_tp_plan.items():
                # "*" are a placeholder for layer indices, so we replace them by "[0-9]+" in the regex pattern
                pattern = param.replace("*", "[0-9]+")
                if re.search(pattern, parent_module_name):
                    current_module_plan = plan
                    break

            # We can only apply the tp_plan after all parameters of the current module have been correctly initialized (e.g.
            # if we have bias, we need both `weights` and `bias` of a nn.Linear to be initialized)
            process_device = list(device_map.values())[0]
            all_module_parameters_initialized = all(
                m.device == process_device for m in parent_module.parameters(recurse=False)
            ) and all(m.device == process_device for m in parent_module.buffers(recurse=False))
            if current_module_plan is not None and all_module_parameters_initialized:
                torch.distributed.tensor.parallel.parallelize_module(
                    parent_module,
                    device_mesh=device_mesh,
                    parallelize_plan=translate_to_torch_parallel_style(current_module_plan),
                )

    return error_msgs, offload_index, state_dict_index
@ -3529,11 +3489,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
        )

        # We need to correctly dispatch the model on the current process device. The easiest way for this is to use a simple
        # `device_map` pointing to the correct device
        device_mesh = None
        # `device_map` pointing to the correct device. If we don't, torch will use the default device (index 0) for all
        # childs processes at parallelization time, resulting in excessive memory usage on device 0 and OOMs.
        # And temporarily setting the default device to current process rank result in the following error
        # `torch.distributed.DistBackendError: Attempt to perform collective on tensor not on device passed to init_process_group`
        tp_device = None
        if tp_plan is not None:
            if not is_torch_greater_or_equal("2.5"):
                raise EnvironmentError("tensor parallel is only supported for `torch>=2.5`.")
            if not torch.distributed.is_initialized():
                raise ValueError("Tensor Parallel requires torch.distributed to be initialized first.")

@ -3545,10 +3506,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
            # This is the easiest way to dispatch to the current process device
            device_map = tp_device

            # Assuming sharding the model onto the world
            world_size = torch.distributed.get_world_size()
            device_mesh = torch.distributed.init_device_mesh(tp_device.type, (world_size,))

        if is_fsdp_enabled():
            low_cpu_mem_usage = True

@ -3643,7 +3600,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
            if low_cpu_mem_usage is None:
                low_cpu_mem_usage = True
            elif not low_cpu_mem_usage:
                raise ValueError("Passing along a `device_map` or a `tp_plan` requires `low_cpu_mem_usage=True`")
                raise ValueError("Passing along a `device_map` requires `low_cpu_mem_usage=True`")

        if low_cpu_mem_usage:
            if is_deepspeed_zero3_enabled():

@ -3652,7 +3609,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                )
            elif not is_accelerate_available():
                raise ImportError(
                    f"Using `low_cpu_mem_usage=True`, a `device_map` or a `tp_plan` requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`"
                    f"Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`"
                )

        # handling bnb config from kwargs, remove after `load_in_{4/8}bit` deprecation.

@ -3749,10 +3706,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
            device_map = hf_quantizer.update_device_map(device_map)

            # In order to ensure popular quantization methods are supported. Can be disable with `disable_telemetry`
            if hasattr(hf_quantizer.quantization_config.quant_method, "value"):
                user_agent["quant"] = hf_quantizer.quantization_config.quant_method.value
            else:
                user_agent["quant"] = hf_quantizer.quantization_config.quant_method
            user_agent["quant"] = hf_quantizer.quantization_config.quant_method.value

            # Force-set to `True` for more mem efficiency
            if low_cpu_mem_usage is None:
                low_cpu_mem_usage = True

@ -4229,9 +4184,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
            # Let's make sure we don't run the init function of buffer modules
            model = cls(config, *model_args, **model_kwargs)

        if device_mesh is not None and not model.supports_tp_plan:
            raise NotImplementedError("This model does not have a tensor parallel plan.")

        # make sure we use the model's config since the __init__ call might have copied it
        config = model.config

@ -4315,12 +4267,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
            # check if we don't have tied param in different devices
            check_tied_parameters_on_same_device(tied_params, device_map)

        if gguf_path and device_map is not None and "disk" in device_map.values():
            raise RuntimeError(
                "One or more modules is configured to be mapped to disk. Disk offload is not supported for models "
                "loaded from GGUF files."
            )

        if from_tf:
            if resolved_archive_file.endswith(".index"):
                # Load from a TensorFlow 1.X checkpoint - provided by original authors

@ -4382,7 +4328,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                keep_in_fp32_modules=keep_in_fp32_modules,
                gguf_path=gguf_path,
                weights_only=weights_only,
                device_mesh=device_mesh,
            )

        # make sure token embedding weights are still tied if needed

@ -4417,9 +4362,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                )
                pass

        # Dispatch model with hooks on all devices if necessary (not needed with a tp_plan, so we skip it as it slightly
        # harm performances)
        if device_map is not None and device_mesh is None:
        # Dispatch model with hooks on all devices if necessary
        if device_map is not None:
            device_map_kwargs = {
                "device_map": device_map,
                "offload_dir": offload_folder,

@ -4446,13 +4390,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
            if not is_fsdp_enabled() and not is_deepspeed_zero3_enabled():
                dispatch_model(model, **device_map_kwargs)

        # This is needed for the RotaryEmbedding, which was not initialized on the correct device as it is
        # not part of the state_dict (persistent=False)
        if device_mesh is not None:
            for buffer in model.buffers():
                if buffer.device != tp_device:
                    buffer.data = buffer.to(tp_device)

        if hf_quantizer is not None:
            hf_quantizer.postprocess_model(model, config=config)
            model.hf_quantizer = hf_quantizer

@ -4475,6 +4412,16 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
            }
            return model, loading_info

        if tp_plan is not None:
            assert tp_device is not None, "tp_device not set!"
            if not model.supports_tp_plan:
                raise NotImplementedError("This model does not have a tensor parallel plan.")
            # Assuming sharding the model onto the world
            world_size = torch.distributed.get_world_size()
            device_mesh = torch.distributed.init_device_mesh(tp_device.type, (world_size,))
            # Apply Tensor Parallelism
            model.tensor_parallel(device_mesh)

        return model

    @staticmethod
@ -4568,7 +4515,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
        keep_in_fp32_modules=None,
        gguf_path=None,
        weights_only=True,
        device_mesh=None,
    ):
        is_safetensors = False
        is_quantized = hf_quantizer is not None

@ -4579,7 +4525,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
            archive_file = (
                resolved_archive_file[0] if isinstance(resolved_archive_file, (list, tuple)) else resolved_archive_file
            )
            is_safetensors = archive_file is not None and archive_file.endswith(".safetensors")
            is_safetensors = archive_file.endswith(".safetensors")
            if offload_folder is None and not is_safetensors:
                raise ValueError(
                    "The current `device_map` had weights offloaded to the disk. Please provide an `offload_folder`"

@ -4868,7 +4814,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                    is_safetensors=is_safetensors,
                    keep_in_fp32_modules=keep_in_fp32_modules,
                    unexpected_keys=unexpected_keys,
                    device_mesh=device_mesh,
                )
            else:
                # Sharded checkpoint or whole but low_cpu_mem_usage==True

@ -4958,7 +4903,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                        is_safetensors=is_safetensors,
                        keep_in_fp32_modules=keep_in_fp32_modules,
                        unexpected_keys=unexpected_keys,
                        device_mesh=device_mesh,
                    )
                    error_msgs += new_error_msgs
                else:

@ -5236,12 +5180,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix

    def tensor_parallel(self, device_mesh):
        """
        Tensor parallelize the model across the given device mesh. This function is a helper to be called after the model
        was already loaded in memory, note however that this means that each process will first initialize the whole model,
        then parallelize it accross devices. Thus there is a huge waste of GPU memory, and this can lead to OOM at loading time.

        Calling `from_pretrained(..., tp_plan="auto")` is prefered, and will parallelize module-by-module during initialization,
        so that the expected per-device memory spike at loading time is not larger than the final model size on each device.
        Tensor parallelize the model across the given device mesh.

        Args:
            device_mesh (`torch.distributed.DeviceMesh`):
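A hedged usage sketch of the tp_plan="auto" path mentioned in the docstring above (not part of the diff; the model id is illustrative, and it assumes torch>=2.5, a model that ships a tensor parallel plan, and a launch such as torchrun --nproc-per-node 4 script.py):

import torch
from transformers import AutoModelForCausalLM

# the hunks above require torch.distributed to be initialized before loading
torch.distributed.init_process_group("nccl")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", tp_plan="auto")
# each rank now holds only its shard of the weights; forward/generate work as usual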
@ -118,7 +118,6 @@ from . import (
    gptj,
    granite,
    granitemoe,
    granitemoeshared,
    grounding_dino,
    groupvit,
    helium,
@ -193,7 +193,7 @@ class ASTFeatureExtractor(SequenceFeatureExtractor):
            )
        else:
            logger.warning(
                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )
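The warning above fires when sampling_rate is omitted; a hedged sketch of the recommended call (not part of the diff; the checkpoint and the synthetic audio are illustrative):

import numpy as np
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
raw_audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
inputs = feature_extractor(raw_audio, sampling_rate=16000, return_tensors="pt")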
@ -137,7 +137,6 @@ CONFIG_MAPPING_NAMES = OrderedDict(
        ("gptsan-japanese", "GPTSanJapaneseConfig"),
        ("granite", "GraniteConfig"),
        ("granitemoe", "GraniteMoeConfig"),
        ("granitemoeshared", "GraniteMoeSharedConfig"),
        ("granitevision", "LlavaNextConfig"),
        ("graphormer", "GraphormerConfig"),
        ("grounding-dino", "GroundingDinoConfig"),

@ -468,7 +467,6 @@ MODEL_NAMES_MAPPING = OrderedDict(
        ("gptsan-japanese", "GPTSAN-japanese"),
        ("granite", "Granite"),
        ("granitemoe", "GraniteMoeMoe"),
        ("granitemoeshared", "GraniteMoeSharedMoe"),
        ("granitevision", "LLaVA-NeXT"),
        ("graphormer", "Graphormer"),
        ("grounding-dino", "Grounding DINO"),

@ -132,7 +132,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
        ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
        ("granite", "GraniteModel"),
        ("granitemoe", "GraniteMoeModel"),
        ("granitemoeshared", "GraniteMoeSharedModel"),
        ("graphormer", "GraphormerModel"),
        ("grounding-dino", "GroundingDinoModel"),
        ("groupvit", "GroupViTModel"),

@ -527,7 +526,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
        ("gptj", "GPTJForCausalLM"),
        ("granite", "GraniteForCausalLM"),
        ("granitemoe", "GraniteMoeForCausalLM"),
        ("granitemoeshared", "GraniteMoeSharedForCausalLM"),
        ("helium", "HeliumForCausalLM"),
        ("jamba", "JambaForCausalLM"),
        ("jetmoe", "JetMoeForCausalLM"),
@ -2016,9 +2016,6 @@ class Blip2VisionModelWithProjection(Blip2PreTrainedModel):
class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
    config_class = Blip2Config
    main_input_name = "pixel_values"
    _supports_cache_class = True
    _supports_static_cache = True
    _supports_quantized_cache = False  # not all LM bacbones support (e.g. T5)

    def __init__(self, config: Blip2Config):
        super().__init__(config)
@ -1284,13 +1284,13 @@ class ChameleonModel(ChameleonPreTrainedModel):

        if pixel_values is not None:
            image_tokens = self.get_image_tokens(pixel_values)
            special_image_mask = input_ids == self.vocabulary_mapping.image_token_id
            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_tokens.numel():
                n_image_tokens_in_text = (input_ids == self.vocabulary_mapping.image_token_id).sum()
                n_image_features = image_tokens.shape[0] * image_tokens.shape[1]
            n_image_tokens_in_text = (input_ids == self.vocabulary_mapping.image_token_id).sum().item()
            n_image_features = image_tokens.shape[0] * image_tokens.shape[1]
            if n_image_tokens_in_text != n_image_features:
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens_in_text}, features {n_image_features}"
                )
            special_image_mask = input_ids == self.vocabulary_mapping.image_token_id
            image_tokens = image_tokens.to(input_ids.device, input_ids.dtype)
            input_ids = input_ids.masked_scatter(special_image_mask, image_tokens)
@ -308,7 +308,7 @@ class ClapFeatureExtractor(SequenceFeatureExtractor):
            )
        else:
            logger.warning(
                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )
Some files were not shown because too many files have changed in this diff.