mirror of
https://github.com/huggingface/transformers.git
synced 2025-10-22 10:19:00 +08:00
Compare commits
51 Commits
check_fa2
...
clean-mode
Author | SHA1 | Date | |
---|---|---|---|
a304370d21 | |||
e155ae8510 | |||
cfa8c40779 | |||
81b2678251 | |||
6c34e8a543 | |||
eb727c50c8 | |||
b1351120d1 | |||
f06e39ed69 | |||
a4ec22ae4d | |||
54242e21cf | |||
f3341519e2 | |||
2d71cb33ac | |||
f1672862c1 | |||
ce6092585f | |||
6099f095a5 | |||
2dbf9a1757 | |||
675cd88e1a | |||
2ab361bd1d | |||
4853171a0a | |||
168db36cd8 | |||
531dc0c286 | |||
2de1fcd039 | |||
7d0ab1a129 | |||
39322ed145 | |||
8fcc791590 | |||
19ef67ad7b | |||
b7ae7d92a7 | |||
46b1b90b47 | |||
71838e4a8f | |||
4e35168f46 | |||
4affc579d8 | |||
949f149883 | |||
91fc267959 | |||
d0035f754f | |||
b76a44d2e2 | |||
5b95608bf1 | |||
b5c6501f7c | |||
081ebcc01c | |||
14289072e5 | |||
da51096bca | |||
8f45617781 | |||
818cef60ea | |||
afecb000aa | |||
3c791b6f39 | |||
5abbe87f57 | |||
fbad09c9c0 | |||
388ee4b923 | |||
d70e1c1aa5 | |||
a6cfc4b6c5 | |||
59a90ee202 | |||
cd26c0d99c |
@ -109,9 +109,7 @@ class CircleCIJob:
|
||||
self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
|
||||
print(f"Using {self.docker_image} docker image")
|
||||
if self.install_steps is None:
|
||||
self.install_steps = ["uv pip install ."]
|
||||
# Use a custom patched pytest to force exit the process at the end, to avoid `Too long with no output (exceeded 10m0s): context deadline exceeded`
|
||||
self.install_steps.append("uv pip install git+https://github.com/ydshieh/pytest.git@8.4.1-ydshieh")
|
||||
self.install_steps = ["uv venv && uv pip install ."]
|
||||
if self.pytest_options is None:
|
||||
self.pytest_options = {}
|
||||
if isinstance(self.tests_to_run, str):
|
||||
@ -215,7 +213,7 @@ generate_job = CircleCIJob(
|
||||
docker_image=[{"image": "huggingface/transformers-torch-light"}],
|
||||
# networkx==3.3 (after #36957) cause some issues
|
||||
# TODO: remove this once it works directly
|
||||
install_steps=["uv pip install ."],
|
||||
install_steps=["uv venv && uv pip install ."],
|
||||
marker="generate",
|
||||
parallelism=6,
|
||||
)
|
||||
@ -252,7 +250,7 @@ examples_torch_job = CircleCIJob(
|
||||
additional_env={"OMP_NUM_THREADS": 8},
|
||||
docker_image=[{"image":"huggingface/transformers-examples-torch"}],
|
||||
# TODO @ArthurZucker remove this once docker is easier to build
|
||||
install_steps=["uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"],
|
||||
install_steps=["uv venv && uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"],
|
||||
pytest_num_workers=4,
|
||||
)
|
||||
|
||||
@ -261,7 +259,7 @@ hub_job = CircleCIJob(
|
||||
additional_env={"HUGGINGFACE_CO_STAGING": True},
|
||||
docker_image=[{"image":"huggingface/transformers-torch-light"}],
|
||||
install_steps=[
|
||||
'uv pip install .',
|
||||
'uv venv && uv pip install .',
|
||||
'git config --global user.email "ci@dummy.com"',
|
||||
'git config --global user.name "ci"',
|
||||
],
|
||||
@ -275,6 +273,7 @@ onnx_job = CircleCIJob(
|
||||
"onnx",
|
||||
docker_image=[{"image":"huggingface/transformers-torch-tf-light"}],
|
||||
install_steps=[
|
||||
"uv venv",
|
||||
"uv pip install .[testing,sentencepiece,onnxruntime,vision,rjieba]",
|
||||
],
|
||||
pytest_options={"k onnx": None},
|
||||
@ -304,7 +303,7 @@ non_model_job = CircleCIJob(
|
||||
docker_image=[{"image": "huggingface/transformers-torch-light"}],
|
||||
# networkx==3.3 (after #36957) cause some issues
|
||||
# TODO: remove this once it works directly
|
||||
install_steps=["uv pip install .[serving]"],
|
||||
install_steps=["uv venv && uv pip install ."],
|
||||
marker="not generate",
|
||||
parallelism=6,
|
||||
)
|
||||
@ -322,7 +321,7 @@ doc_test_job = CircleCIJob(
|
||||
additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"},
|
||||
install_steps=[
|
||||
# Add an empty file to keep the test step running correctly even no file is selected to be tested.
|
||||
"uv pip install .",
|
||||
"uv venv && pip install .",
|
||||
"touch dummy.py",
|
||||
command,
|
||||
"cat pr_documentation_tests_temp.txt",
|
||||
|
2
.github/workflows/benchmark.yml
vendored
2
.github/workflows/benchmark.yml
vendored
@ -48,7 +48,7 @@ jobs:
|
||||
|
||||
- name: Run database init script
|
||||
run: |
|
||||
psql -f benchmark/utils/init_db.sql
|
||||
psql -f benchmark/init_db.sql
|
||||
env:
|
||||
PGDATABASE: metrics
|
||||
PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
|
||||
|
2
.github/workflows/check_failed_tests.yml
vendored
2
.github/workflows/check_failed_tests.yml
vendored
@ -41,7 +41,7 @@ jobs:
|
||||
check_new_failures:
|
||||
name: " "
|
||||
runs-on:
|
||||
group: aws-g5-4xlarge-cache
|
||||
group: aws-g4dn-4xlarge-cache
|
||||
container:
|
||||
image: ${{ inputs.docker }}
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
|
49
.github/workflows/collated-reports.yml
vendored
49
.github/workflows/collated-reports.yml
vendored
@ -1,49 +0,0 @@
|
||||
name: CI collated reports
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
job:
|
||||
required: true
|
||||
type: string
|
||||
report_repo_id:
|
||||
required: true
|
||||
type: string
|
||||
machine_type:
|
||||
required: true
|
||||
type: string
|
||||
gpu_name:
|
||||
description: Name of the GPU used for the job. Its enough that the value contains the name of the GPU, e.g. "noise-h100-more-noise". Case insensitive.
|
||||
required: true
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
collated_reports:
|
||||
name: Collated reports
|
||||
runs-on: ubuntu-22.04
|
||||
if: always()
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/download-artifact@v4
|
||||
|
||||
- name: Collated reports
|
||||
shell: bash
|
||||
env:
|
||||
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
CI_SHA: ${{ github.sha }}
|
||||
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
|
||||
run: |
|
||||
pip install huggingface_hub
|
||||
python3 utils/collated_reports.py \
|
||||
--path /transformers/reports/ \
|
||||
--machine-type ${{ inputs.machine_type }} \
|
||||
--commit-hash ${{ env.CI_SHA }} \
|
||||
--job ${{ inputs.job }} \
|
||||
--report-repo-id ${{ inputs.report_repo_id }} \
|
||||
--gpu-name ${{ inputs.gpu_name }}
|
||||
|
||||
- name: Upload collated reports
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: collated_reports_${{ env.CI_SHA }}.json
|
||||
path: collated_reports_${{ env.CI_SHA }}.json
|
4
.github/workflows/doctest_job.yml
vendored
4
.github/workflows/doctest_job.yml
vendored
@ -28,10 +28,10 @@ jobs:
|
||||
matrix:
|
||||
split_keys: ${{ fromJson(inputs.split_keys) }}
|
||||
runs-on:
|
||||
group: aws-g5-4xlarge-cache
|
||||
group: aws-g4dn-4xlarge-cache
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
|
4
.github/workflows/doctests.yml
vendored
4
.github/workflows/doctests.yml
vendored
@ -15,10 +15,10 @@ jobs:
|
||||
setup:
|
||||
name: Setup
|
||||
runs-on:
|
||||
group: aws-g5-4xlarge-cache
|
||||
group: aws-g4dn-4xlarge-cache
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
outputs:
|
||||
job_splits: ${{ steps.set-matrix.outputs.job_splits }}
|
||||
split_keys: ${{ steps.set-matrix.outputs.split_keys }}
|
||||
|
157
.github/workflows/get-pr-info.yml
vendored
157
.github/workflows/get-pr-info.yml
vendored
@ -1,157 +0,0 @@
|
||||
name: Get PR commit SHA
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
pr_number:
|
||||
required: true
|
||||
type: string
|
||||
outputs:
|
||||
PR_HEAD_REPO_FULL_NAME:
|
||||
description: "The full name of the repository from which the pull request is created"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_FULL_NAME }}
|
||||
PR_BASE_REPO_FULL_NAME:
|
||||
description: "The full name of the repository to which the pull request is created"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_FULL_NAME }}
|
||||
PR_HEAD_REPO_OWNER:
|
||||
description: "The owner of the repository from which the pull request is created"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}
|
||||
PR_BASE_REPO_OWNER:
|
||||
description: "The owner of the repository to which the pull request is created"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_OWNER }}
|
||||
PR_HEAD_REPO_NAME:
|
||||
description: "The name of the repository from which the pull request is created"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}
|
||||
PR_BASE_REPO_NAME:
|
||||
description: "The name of the repository to which the pull request is created"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_NAME }}
|
||||
PR_HEAD_REF:
|
||||
description: "The branch name of the pull request in the head repository"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REF }}
|
||||
PR_BASE_REF:
|
||||
description: "The branch name in the base repository (to merge into)"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_BASE_REF }}
|
||||
PR_HEAD_SHA:
|
||||
description: "The head sha of the pull request branch in the head repository"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_HEAD_SHA }}
|
||||
PR_BASE_SHA:
|
||||
description: "The head sha of the target branch in the base repository"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_BASE_SHA }}
|
||||
PR_MERGE_COMMIT_SHA:
|
||||
description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
|
||||
PR_HEAD_COMMIT_DATE:
|
||||
description: "The date of the head sha of the pull request branch in the head repository"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }}
|
||||
PR_MERGE_COMMIT_DATE:
|
||||
description: "The date of the merge commit for the pull request (created by GitHub) in the base repository"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
|
||||
PR_HEAD_COMMIT_TIMESTAMP:
|
||||
description: "The timestamp of the head sha of the pull request branch in the head repository"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_TIMESTAMP }}
|
||||
PR_MERGE_COMMIT_TIMESTAMP:
|
||||
description: "The timestamp of the merge commit for the pull request (created by GitHub) in the base repository"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
|
||||
PR:
|
||||
description: "The PR"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR }}
|
||||
PR_FILES:
|
||||
description: "The files touched in the PR"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_FILES }}
|
||||
|
||||
|
||||
jobs:
|
||||
get-pr-info:
|
||||
runs-on: ubuntu-22.04
|
||||
name: Get PR commit SHA better
|
||||
outputs:
|
||||
PR_HEAD_REPO_FULL_NAME: ${{ steps.pr_info.outputs.head_repo_full_name }}
|
||||
PR_BASE_REPO_FULL_NAME: ${{ steps.pr_info.outputs.base_repo_full_name }}
|
||||
PR_HEAD_REPO_OWNER: ${{ steps.pr_info.outputs.head_repo_owner }}
|
||||
PR_BASE_REPO_OWNER: ${{ steps.pr_info.outputs.base_repo_owner }}
|
||||
PR_HEAD_REPO_NAME: ${{ steps.pr_info.outputs.head_repo_name }}
|
||||
PR_BASE_REPO_NAME: ${{ steps.pr_info.outputs.base_repo_name }}
|
||||
PR_HEAD_REF: ${{ steps.pr_info.outputs.head_ref }}
|
||||
PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }}
|
||||
PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }}
|
||||
PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }}
|
||||
PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }}
|
||||
PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
|
||||
PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}
|
||||
PR_HEAD_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.head_commit_timestamp }}
|
||||
PR_MERGE_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.merge_commit_timestamp }}
|
||||
PR: ${{ steps.pr_info.outputs.pr }}
|
||||
PR_FILES: ${{ steps.pr_info.outputs.files }}
|
||||
if: ${{ inputs.pr_number != '' }}
|
||||
steps:
|
||||
- name: Extract PR details
|
||||
id: pr_info
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
script: |
|
||||
const { data: pr } = await github.rest.pulls.get({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
pull_number: ${{ inputs.pr_number }}
|
||||
});
|
||||
|
||||
const { data: head_commit } = await github.rest.repos.getCommit({
|
||||
owner: pr.head.repo.owner.login,
|
||||
repo: pr.head.repo.name,
|
||||
ref: pr.head.ref
|
||||
});
|
||||
|
||||
const { data: merge_commit } = await github.rest.repos.getCommit({
|
||||
owner: pr.base.repo.owner.login,
|
||||
repo: pr.base.repo.name,
|
||||
ref: pr.merge_commit_sha,
|
||||
});
|
||||
|
||||
const { data: files } = await github.rest.pulls.listFiles({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
pull_number: ${{ inputs.pr_number }}
|
||||
});
|
||||
|
||||
core.setOutput('head_repo_full_name', pr.head.repo.full_name);
|
||||
core.setOutput('base_repo_full_name', pr.base.repo.full_name);
|
||||
core.setOutput('head_repo_owner', pr.head.repo.owner.login);
|
||||
core.setOutput('base_repo_owner', pr.base.repo.owner.login);
|
||||
core.setOutput('head_repo_name', pr.head.repo.name);
|
||||
core.setOutput('base_repo_name', pr.base.repo.name);
|
||||
core.setOutput('head_ref', pr.head.ref);
|
||||
core.setOutput('base_ref', pr.base.ref);
|
||||
core.setOutput('head_sha', pr.head.sha);
|
||||
core.setOutput('base_sha', pr.base.sha);
|
||||
core.setOutput('merge_commit_sha', pr.merge_commit_sha);
|
||||
core.setOutput('pr', pr);
|
||||
|
||||
core.setOutput('head_commit_date', head_commit.commit.committer.date);
|
||||
core.setOutput('merge_commit_date', merge_commit.commit.committer.date);
|
||||
|
||||
core.setOutput('files', files);
|
||||
|
||||
console.log('PR head commit:', {
|
||||
head_commit: head_commit,
|
||||
commit: head_commit.commit,
|
||||
date: head_commit.commit.committer.date
|
||||
});
|
||||
|
||||
console.log('PR merge commit:', {
|
||||
merge_commit: merge_commit,
|
||||
commit: merge_commit.commit,
|
||||
date: merge_commit.commit.committer.date
|
||||
});
|
||||
|
||||
- name: Convert dates to timestamps
|
||||
id: get_timestamps
|
||||
run: |
|
||||
head_commit_date=${{ steps.pr_info.outputs.head_commit_date }}
|
||||
merge_commit_date=${{ steps.pr_info.outputs.merge_commit_date }}
|
||||
echo $head_commit_date
|
||||
echo $merge_commit_date
|
||||
head_commit_timestamp=$(date -d "$head_commit_date" +%s)
|
||||
merge_commit_timestamp=$(date -d "$merge_commit_date" +%s)
|
||||
echo $head_commit_timestamp
|
||||
echo $merge_commit_timestamp
|
||||
echo "head_commit_timestamp=$head_commit_timestamp" >> $GITHUB_OUTPUT
|
||||
echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
|
36
.github/workflows/get-pr-number.yml
vendored
36
.github/workflows/get-pr-number.yml
vendored
@ -1,36 +0,0 @@
|
||||
name: Get PR number
|
||||
on:
|
||||
workflow_call:
|
||||
outputs:
|
||||
PR_NUMBER:
|
||||
description: "The extracted PR number"
|
||||
value: ${{ jobs.get-pr-number.outputs.PR_NUMBER }}
|
||||
|
||||
jobs:
|
||||
get-pr-number:
|
||||
runs-on: ubuntu-22.04
|
||||
name: Get PR number
|
||||
outputs:
|
||||
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
|
||||
steps:
|
||||
- name: Get PR number
|
||||
shell: bash
|
||||
run: |
|
||||
if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
|
||||
echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
|
||||
elif [[ "${{ github.event.pull_request.number }}" != "" ]]; then
|
||||
echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV
|
||||
elif [[ "${{ github.event.pull_request }}" != "" ]]; then
|
||||
echo "PR_NUMBER=${{ github.event.number }}" >> $GITHUB_ENV
|
||||
else
|
||||
echo "PR_NUMBER=" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
- name: Check PR number
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ env.PR_NUMBER }}"
|
||||
|
||||
- name: Set PR number
|
||||
id: set_pr_number
|
||||
run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
|
123
.github/workflows/model_jobs.yml
vendored
123
.github/workflows/model_jobs.yml
vendored
@ -12,8 +12,8 @@ on:
|
||||
slice_id:
|
||||
required: true
|
||||
type: number
|
||||
runner_map:
|
||||
required: false
|
||||
runner:
|
||||
required: true
|
||||
type: string
|
||||
docker:
|
||||
required: true
|
||||
@ -45,7 +45,7 @@ jobs:
|
||||
matrix:
|
||||
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
|
||||
runs-on:
|
||||
group: ${{ fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type] }}
|
||||
group: '${{ inputs.machine_type }}'
|
||||
container:
|
||||
image: ${{ inputs.docker }}
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
@ -107,9 +107,9 @@ jobs:
|
||||
run: |
|
||||
echo "${{ inputs.machine_type }}"
|
||||
|
||||
if [ "${{ inputs.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ inputs.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ inputs.machine_type }}
|
||||
@ -120,106 +120,23 @@ jobs:
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
python3 -m pip uninstall -y natten
|
||||
python3 -m pip uninstall -y ninja && python3 -m pip install ninja && python3 -m pip install flash-attn --no-build-isolation
|
||||
# START=0 END=400 python3 run.py
|
||||
#
|
||||
# - name: "Upload"
|
||||
# if: ${{ always() }}
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: summary_short_1
|
||||
# path: /transformers/summary_short.txt
|
||||
#
|
||||
# - name: Run all tests on GPU
|
||||
# working-directory: /transformers
|
||||
# if: ${{ always() }}
|
||||
# run: |
|
||||
# START=400 END=800 python3 run.py
|
||||
#
|
||||
# - name: "Upload"
|
||||
# if: ${{ always() }}
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: summary_short_2
|
||||
# path: /transformers/summary_short.txt
|
||||
#
|
||||
# - name: Run all tests on GPU
|
||||
# working-directory: /transformers
|
||||
# if: ${{ always() }}
|
||||
# run: |
|
||||
# START=800 END=1200 python3 run.py
|
||||
#
|
||||
# - name: "Upload"
|
||||
# if: ${{ always() }}
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: summary_short_3
|
||||
# path: /transformers/summary_short.txt
|
||||
#
|
||||
# - name: Run all tests on GPU
|
||||
# working-directory: /transformers
|
||||
# if: ${{ always() }}
|
||||
# run: |
|
||||
# START=1200 END=1600 python3 run.py
|
||||
#
|
||||
# - name: "Upload"
|
||||
# if: ${{ always() }}
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: summary_short_4
|
||||
# path: /transformers/summary_short.txt
|
||||
run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
START=1600 END=2000 python3 run.py
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
|
||||
- name: "Upload"
|
||||
- name: Run test
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
|
||||
echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
|
||||
echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: summary_short_5
|
||||
path: /transformers/summary_short.txt
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
START=2000 END=2400 python3 run.py
|
||||
|
||||
- name: "Upload"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: summary_short_6
|
||||
path: /transformers/summary_short.txt
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
START=2400 END=2800 python3 run.py
|
||||
|
||||
- name: "Upload"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: summary_short_7
|
||||
path: /transformers/summary_short.txt
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
START=2800 END=3000 python3 run.py
|
||||
|
||||
- name: "Upload"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: summary_short_8
|
||||
path: /transformers/summary_short.txt
|
||||
name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
|
||||
|
128
.github/workflows/model_jobs_amd.yml
vendored
Normal file
128
.github/workflows/model_jobs_amd.yml
vendored
Normal file
@ -0,0 +1,128 @@
|
||||
name: model jobs
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
folder_slices:
|
||||
required: true
|
||||
type: string
|
||||
machine_type:
|
||||
required: true
|
||||
type: string
|
||||
slice_id:
|
||||
required: true
|
||||
type: number
|
||||
runner:
|
||||
required: true
|
||||
type: string
|
||||
docker:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
RUN_SLOW: yes
|
||||
# For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
|
||||
# This token is created under the bot `hf-transformers-bot`.
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
|
||||
jobs:
|
||||
run_models_gpu:
|
||||
name: " "
|
||||
strategy:
|
||||
max-parallel: 1 # For now, not to parallelize. Can change later if it works well.
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
|
||||
runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
|
||||
container:
|
||||
image: ${{ inputs.docker }}
|
||||
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Echo input and matrix info
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ inputs.folder_slices }}"
|
||||
echo "${{ matrix.folders }}"
|
||||
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
|
||||
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: Update / Install some packages (for Past CI)
|
||||
if: ${{ contains(inputs.docker, '-past-') }}
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pip install -U datasets
|
||||
|
||||
- name: Update / Install some packages (for Past CI)
|
||||
if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
|
||||
- name: ROCM-SMI
|
||||
run: |
|
||||
rocm-smi
|
||||
|
||||
- name: ROCM-INFO
|
||||
run: |
|
||||
rocminfo | grep "Agent" -A 14
|
||||
|
||||
- name: Show ROCR environment
|
||||
run: |
|
||||
echo "ROCR: $ROCR_VISIBLE_DEVICES"
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
|
||||
- name: Run test
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
||||
echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
|
||||
echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
121
.github/workflows/model_jobs_intel_gaudi.yml
vendored
121
.github/workflows/model_jobs_intel_gaudi.yml
vendored
@ -1,121 +0,0 @@
|
||||
name: model jobs
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
folder_slices:
|
||||
required: true
|
||||
type: string
|
||||
slice_id:
|
||||
required: true
|
||||
type: number
|
||||
runner:
|
||||
required: true
|
||||
type: string
|
||||
machine_type:
|
||||
required: true
|
||||
type: string
|
||||
report_name_prefix:
|
||||
required: false
|
||||
default: run_models_gpu
|
||||
type: string
|
||||
|
||||
env:
|
||||
RUN_SLOW: yes
|
||||
PT_HPU_LAZY_MODE: 0
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
PT_ENABLE_INT64_SUPPORT: 1
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
HF_HOME: /mnt/cache/.cache/huggingface
|
||||
|
||||
jobs:
|
||||
run_models_gpu:
|
||||
name: " "
|
||||
strategy:
|
||||
max-parallel: 8
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
|
||||
runs-on:
|
||||
group: ${{ inputs.runner }}
|
||||
container:
|
||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
options: --runtime=habana
|
||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
||||
--env HABANA_VISIBLE_DEVICES
|
||||
--env HABANA_VISIBLE_MODULES
|
||||
--cap-add=sys_nice
|
||||
--shm-size=64G
|
||||
steps:
|
||||
- name: Echo input and matrix info
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ inputs.folder_slices }}"
|
||||
echo "${{ matrix.folders }}"
|
||||
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
|
||||
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn
|
||||
|
||||
- name: HL-SMI
|
||||
run: |
|
||||
hl-smi
|
||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
||||
|
||||
- name: Environment
|
||||
run: python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ inputs.machine_type }}" = "1gaudi" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ inputs.machine_type }}" = "2gaudi" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ inputs.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run all tests on Gaudi
|
||||
run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
|
||||
- name: Run test
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
|
||||
echo "hello" > reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
|
||||
echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
|
||||
path: reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
|
134
.github/workflows/pr_build_doc_with_comment.yml
vendored
134
.github/workflows/pr_build_doc_with_comment.yml
vendored
@ -1,134 +0,0 @@
|
||||
name: PR - build doc via comment
|
||||
on:
|
||||
issue_comment:
|
||||
types:
|
||||
- created
|
||||
branches-ignore:
|
||||
- main
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.issue.number }}-${{ startsWith(github.event.comment.body, 'build-doc') }}
|
||||
cancel-in-progress: true
|
||||
permissions: {}
|
||||
|
||||
|
||||
jobs:
|
||||
get-pr-number:
|
||||
name: Get PR number
|
||||
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }}
|
||||
uses: ./.github/workflows/get-pr-number.yml
|
||||
|
||||
get-pr-info:
|
||||
name: Get PR commit SHA
|
||||
needs: get-pr-number
|
||||
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
|
||||
uses: ./.github/workflows/get-pr-info.yml
|
||||
with:
|
||||
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
|
||||
|
||||
verity_pr_commit:
|
||||
name: Verity PR commit corresponds to a specific event by comparing timestamps
|
||||
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
|
||||
runs-on: ubuntu-22.04
|
||||
needs: get-pr-info
|
||||
env:
|
||||
COMMENT_DATE: ${{ github.event.comment.created_at }}
|
||||
PR_MERGE_COMMIT_DATE: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
|
||||
PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
|
||||
steps:
|
||||
- run: |
|
||||
COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
|
||||
echo "COMMENT_DATE: $COMMENT_DATE"
|
||||
echo "PR_MERGE_COMMIT_DATE: $PR_MERGE_COMMIT_DATE"
|
||||
echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
|
||||
echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
|
||||
if [ $COMMENT_TIMESTAMP -le $PR_MERGE_COMMIT_TIMESTAMP ]; then
|
||||
echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!";
|
||||
exit -1;
|
||||
fi
|
||||
|
||||
create_run:
|
||||
name: Create run
|
||||
needs: [get-pr-number, get-pr-info]
|
||||
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != '' }}
|
||||
permissions:
|
||||
statuses: write
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Create Run
|
||||
id: create_run
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
# Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`.
|
||||
# See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
|
||||
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
run: |
|
||||
gh api \
|
||||
--method POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
repos/${{ github.repository }}/statuses/${{ needs.get-pr-info.outputs.PR_HEAD_SHA }} \
|
||||
-f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Custom doc building job" -f "context=custom-doc-build"
|
||||
|
||||
reply_to_comment:
|
||||
name: Reply to the comment
|
||||
if: ${{ needs.create_run.result == 'success' }}
|
||||
needs: [get-pr-number, create_run]
|
||||
permissions:
|
||||
pull-requests: write
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Reply to the comment
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
run: |
|
||||
gh api \
|
||||
--method POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
|
||||
-f "body=[Building docs for all languages...](${{ env.GITHUB_RUN_URL }})"
|
||||
|
||||
build-doc:
|
||||
name: Build doc
|
||||
needs: [get-pr-number, get-pr-info]
|
||||
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != '' }}
|
||||
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
|
||||
with:
|
||||
commit_sha: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}
|
||||
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
|
||||
package: transformers
|
||||
languages: ar de en es fr hi it ko pt tr zh ja te
|
||||
|
||||
update_run_status:
|
||||
name: Update Check Run Status
|
||||
needs: [ get-pr-info, create_run, build-doc ]
|
||||
permissions:
|
||||
statuses: write
|
||||
if: ${{ always() && needs.create_run.result == 'success' }}
|
||||
runs-on: ubuntu-22.04
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.create_run.result) }}
|
||||
steps:
|
||||
- name: Get `build-doc` job status
|
||||
run: |
|
||||
echo "${{ needs.build-doc.result }}"
|
||||
echo $STATUS_OK
|
||||
if [ "$STATUS_OK" = "true" ]; then
|
||||
echo "STATUS=success" >> $GITHUB_ENV
|
||||
else
|
||||
echo "STATUS=failure" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
- name: Update PR commit statuses
|
||||
run: |
|
||||
echo "${{ needs.build-doc.result }}"
|
||||
echo "${{ env.STATUS }}"
|
||||
gh api \
|
||||
--method POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
repos/${{ github.repository }}/statuses/${{ needs.get-pr-info.outputs.PR_HEAD_SHA }} \
|
||||
-f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Custom doc building job" -f "context=custom-doc-build"
|
177
.github/workflows/pr_run_slow_ci.yml
vendored
177
.github/workflows/pr_run_slow_ci.yml
vendored
@ -1,177 +0,0 @@
|
||||
name: PR slow CI
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, synchronize, reopened]
|
||||
|
||||
jobs:
|
||||
get-pr-number:
|
||||
name: Get PR number
|
||||
uses: ./.github/workflows/get-pr-number.yml
|
||||
|
||||
get-pr-info:
|
||||
name: Get PR commit SHA
|
||||
needs: get-pr-number
|
||||
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
|
||||
uses: ./.github/workflows/get-pr-info.yml
|
||||
with:
|
||||
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
|
||||
|
||||
get-jobs:
|
||||
name: Get test files to run
|
||||
runs-on: ubuntu-22.04
|
||||
needs: [get-pr-number, get-pr-info]
|
||||
outputs:
|
||||
jobs: ${{ steps.get_jobs.outputs.jobs_to_run }}
|
||||
steps:
|
||||
- name: Get repository content
|
||||
id: repo_content
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
script: |
|
||||
const { data: tests_dir } = await github.rest.repos.getContent({
|
||||
owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
|
||||
repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
|
||||
path: 'tests',
|
||||
ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
|
||||
});
|
||||
|
||||
const { data: tests_models_dir } = await github.rest.repos.getContent({
|
||||
owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
|
||||
repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
|
||||
path: 'tests/models',
|
||||
ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
|
||||
});
|
||||
|
||||
const { data: tests_quantization_dir } = await github.rest.repos.getContent({
|
||||
owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
|
||||
repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
|
||||
path: 'tests/quantization',
|
||||
ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
|
||||
});
|
||||
|
||||
core.setOutput('tests_dir', tests_dir);
|
||||
core.setOutput('tests_models_dir', tests_models_dir);
|
||||
core.setOutput('tests_quantization_dir', tests_quantization_dir);
|
||||
|
||||
# This checkout to the main branch
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: "0"
|
||||
|
||||
- name: Write pr_files file
|
||||
run: |
|
||||
cat > pr_files.txt << 'EOF'
|
||||
${{ needs.get-pr-info.outputs.PR_FILES }}
|
||||
EOF
|
||||
|
||||
- name: Write tests_dir file
|
||||
run: |
|
||||
cat > tests_dir.txt << 'EOF'
|
||||
${{ steps.repo_content.outputs.tests_dir }}
|
||||
EOF
|
||||
|
||||
- name: Write tests_models_dir file
|
||||
run: |
|
||||
cat > tests_models_dir.txt << 'EOF'
|
||||
${{ steps.repo_content.outputs.tests_models_dir }}
|
||||
EOF
|
||||
|
||||
- name: Write tests_quantization_dir file
|
||||
run: |
|
||||
cat > tests_quantization_dir.txt << 'EOF'
|
||||
${{ steps.repo_content.outputs.tests_quantization_dir }}
|
||||
EOF
|
||||
|
||||
- name: Run script to get jobs to run
|
||||
id: get_jobs
|
||||
run: |
|
||||
python utils/get_pr_run_slow_jobs.py | tee output.txt
|
||||
echo "jobs_to_run: $(tail -n 1 output.txt)"
|
||||
echo "jobs_to_run=$(tail -n 1 output.txt)" >> $GITHUB_OUTPUT
|
||||
|
||||
send_comment:
|
||||
# Will delete the previous comment and send a new one if:
|
||||
# - either the content is changed
|
||||
# - or the previous comment is 30 minutes or more old
|
||||
name: Send a comment to suggest jobs to run
|
||||
if: ${{ needs.get-jobs.outputs.jobs != '' }}
|
||||
needs: [get-pr-number, get-jobs]
|
||||
permissions:
|
||||
pull-requests: write
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Check and update comment if needed
|
||||
uses: actions/github-script@v7
|
||||
env:
|
||||
BODY: "\n\nrun-slow: ${{ needs.get-jobs.outputs.jobs }}"
|
||||
with:
|
||||
script: |
|
||||
const prNumber = ${{ needs.get-pr-number.outputs.PR_NUMBER }};
|
||||
const commentPrefix = "**[For maintainers]** Suggested jobs to run (before merge)";
|
||||
const thirtyMinutesAgo = new Date(Date.now() - 30 * 60 * 1000); // 30 minutes ago
|
||||
const newBody = `${commentPrefix}${process.env.BODY}`;
|
||||
|
||||
// Get all comments on the PR
|
||||
const { data: comments } = await github.rest.issues.listComments({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: prNumber
|
||||
});
|
||||
|
||||
// Find existing comments that start with our prefix
|
||||
const existingComments = comments.filter(comment =>
|
||||
comment.user.login === 'github-actions[bot]' &&
|
||||
comment.body.startsWith(commentPrefix)
|
||||
);
|
||||
|
||||
let shouldCreateNewComment = true;
|
||||
let commentsToDelete = [];
|
||||
|
||||
if (existingComments.length > 0) {
|
||||
// Get the most recent comment
|
||||
const mostRecentComment = existingComments
|
||||
.sort((a, b) => new Date(b.created_at) - new Date(a.created_at))[0];
|
||||
|
||||
const commentDate = new Date(mostRecentComment.created_at);
|
||||
const isOld = commentDate < thirtyMinutesAgo;
|
||||
const isDifferentContent = mostRecentComment.body !== newBody;
|
||||
|
||||
console.log(`Most recent comment created: ${mostRecentComment.created_at}`);
|
||||
console.log(`Is older than 30 minutes: ${isOld}`);
|
||||
console.log(`Has different content: ${isDifferentContent}`);
|
||||
|
||||
if (isOld || isDifferentContent) {
|
||||
// Delete all existing comments and create new one
|
||||
commentsToDelete = existingComments;
|
||||
console.log(`Will delete ${commentsToDelete.length} existing comment(s) and create new one`);
|
||||
} else {
|
||||
// Content is same and comment is recent, skip
|
||||
shouldCreateNewComment = false;
|
||||
console.log('Comment is recent and content unchanged, skipping update');
|
||||
}
|
||||
} else {
|
||||
console.log('No existing comments found, will create new one');
|
||||
}
|
||||
|
||||
// Delete old comments if needed
|
||||
for (const comment of commentsToDelete) {
|
||||
console.log(`Deleting comment #${comment.id} (created: ${comment.created_at})`);
|
||||
await github.rest.issues.deleteComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
comment_id: comment.id
|
||||
});
|
||||
}
|
||||
|
||||
// Create new comment if needed
|
||||
if (shouldCreateNewComment) {
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: prNumber,
|
||||
body: newBody
|
||||
});
|
||||
console.log('✅ New comment created');
|
||||
} else {
|
||||
console.log('ℹ️ No comment update needed');
|
||||
}
|
14
.github/workflows/self-comment-ci.yml
vendored
14
.github/workflows/self-comment-ci.yml
vendored
@ -29,7 +29,7 @@ jobs:
|
||||
runs-on: ubuntu-22.04
|
||||
name: Get PR number
|
||||
# For security: only allow team members to run
|
||||
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
|
||||
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
|
||||
outputs:
|
||||
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
|
||||
steps:
|
||||
@ -185,7 +185,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.get-tests.outputs.models) }}
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
@ -239,9 +239,9 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
@ -292,7 +292,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
@ -338,9 +338,9 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
|
25
.github/workflows/self-push-amd-mi300-caller.yml
vendored
Normal file
25
.github/workflows/self-push-amd-mi300-caller.yml
vendored
Normal file
@ -0,0 +1,25 @@
|
||||
name: Self-hosted runner (AMD mi300 CI caller)
|
||||
|
||||
on:
|
||||
#workflow_run:
|
||||
# workflows: ["Self-hosted runner (push-caller)"]
|
||||
# branches: ["main"]
|
||||
# types: [completed]
|
||||
push:
|
||||
branches:
|
||||
- run_amd_push_ci_caller*
|
||||
paths:
|
||||
- "src/**"
|
||||
- "tests/**"
|
||||
- ".github/**"
|
||||
- "templates/**"
|
||||
- "utils/**"
|
||||
|
||||
jobs:
|
||||
run_amd_ci:
|
||||
name: AMD mi300
|
||||
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci'))))
|
||||
uses: ./.github/workflows/self-push-amd.yml
|
||||
with:
|
||||
gpu_flavor: mi300
|
||||
secrets: inherit
|
32
.github/workflows/self-push.yml
vendored
32
.github/workflows/self-push.yml
vendored
@ -31,12 +31,12 @@ jobs:
|
||||
name: Setup
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu-push-ci
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
test_map: ${{ steps.set-matrix.outputs.test_map }}
|
||||
@ -131,12 +131,12 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
machine_type: [aws-g4dn-2xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu-push-ci
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
env:
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||
@ -169,9 +169,9 @@ jobs:
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
@ -244,7 +244,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
||||
machine_type: [aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g4dn-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
@ -282,9 +282,9 @@ jobs:
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
@ -357,12 +357,12 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
machine_type: [aws-g4dn-2xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
env:
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||
@ -395,9 +395,9 @@ jobs:
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
@ -467,7 +467,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g4dn-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
@ -505,9 +505,9 @@ jobs:
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
|
@ -1,8 +1,8 @@
|
||||
name: Self-hosted runner scale set (AMD mi355 scheduled CI caller)
|
||||
name: Self-hosted runner scale set (AMD mi300 scheduled CI caller)
|
||||
|
||||
# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
|
||||
# For example, 1gpu : amd-mi355-ci-1gpu
|
||||
# 2gpu : amd-mi355-ci-2gpu
|
||||
# For example, 1gpu scale set: amd-mi300-ci-1gpu
|
||||
# 2gpu scale set: amd-mi300-ci-2gpu
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
@ -20,9 +20,9 @@ jobs:
|
||||
with:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi355-ci
|
||||
runner_scale_set: amd-mi300-ci
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi355
|
||||
ci_event: Scheduled CI (AMD) - mi300
|
||||
report_repo_id: optimum-amd/transformers_daily_ci
|
||||
secrets: inherit
|
||||
|
||||
@ -32,9 +32,9 @@ jobs:
|
||||
with:
|
||||
job: run_pipelines_torch_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi355-ci
|
||||
runner_scale_set: amd-mi300-ci
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi355
|
||||
ci_event: Scheduled CI (AMD) - mi300
|
||||
report_repo_id: optimum-amd/transformers_daily_ci
|
||||
secrets: inherit
|
||||
|
||||
@ -44,9 +44,9 @@ jobs:
|
||||
with:
|
||||
job: run_examples_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi355-ci
|
||||
runner_scale_set: amd-mi300-ci
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi355
|
||||
ci_event: Scheduled CI (AMD) - mi300
|
||||
report_repo_id: optimum-amd/transformers_daily_ci
|
||||
secrets: inherit
|
||||
|
||||
@ -56,8 +56,8 @@ jobs:
|
||||
with:
|
||||
job: run_torch_cuda_extensions_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi355-ci
|
||||
runner_scale_set: amd-mi300-ci
|
||||
docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi355
|
||||
ci_event: Scheduled CI (AMD) - mi300
|
||||
report_repo_id: optimum-amd/transformers_daily_ci
|
||||
secrets: inherit
|
@ -1,67 +0,0 @@
|
||||
name: Self-hosted runner scale set (AMD mi325 scheduled CI caller)
|
||||
|
||||
# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
|
||||
# For example, 1gpu scale set: amd-mi325-ci-1gpu
|
||||
# 2gpu scale set: amd-mi325-ci-2gpu
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
|
||||
branches: ["main"]
|
||||
types: [completed]
|
||||
push:
|
||||
branches:
|
||||
- run_amd_scheduled_ci_caller*
|
||||
|
||||
jobs:
|
||||
model-ci:
|
||||
name: Model CI
|
||||
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
|
||||
with:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi325-ci
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi325
|
||||
report_repo_id: optimum-amd/transformers_daily_ci
|
||||
env_file: /etc/podinfo/gha-gpu-isolation-settings
|
||||
secrets: inherit
|
||||
|
||||
torch-pipeline:
|
||||
name: Torch pipeline CI
|
||||
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
|
||||
with:
|
||||
job: run_pipelines_torch_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi325-ci
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi325
|
||||
report_repo_id: optimum-amd/transformers_daily_ci
|
||||
env_file: /etc/podinfo/gha-gpu-isolation-settings
|
||||
secrets: inherit
|
||||
|
||||
example-ci:
|
||||
name: Example CI
|
||||
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
|
||||
with:
|
||||
job: run_examples_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi325-ci
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi325
|
||||
report_repo_id: optimum-amd/transformers_daily_ci
|
||||
env_file: /etc/podinfo/gha-gpu-isolation-settings
|
||||
secrets: inherit
|
||||
|
||||
deepspeed-ci:
|
||||
name: DeepSpeed CI
|
||||
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
|
||||
with:
|
||||
job: run_torch_cuda_extensions_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_scale_set: amd-mi325-ci
|
||||
docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi325
|
||||
report_repo_id: optimum-amd/transformers_daily_ci
|
||||
env_file: /etc/podinfo/gha-gpu-isolation-settings
|
||||
secrets: inherit
|
68
.github/workflows/self-scheduled-caller.yml
vendored
68
.github/workflows/self-scheduled-caller.yml
vendored
@ -7,7 +7,7 @@ on:
|
||||
- cron: "17 2 * * *"
|
||||
push:
|
||||
branches:
|
||||
- check_fa2
|
||||
- run_scheduled_ci*
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
prev_workflow_run_id:
|
||||
@ -22,7 +22,7 @@ on:
|
||||
default: ""
|
||||
|
||||
|
||||
# Used for `push` to easily modify the target workflow runs to compare against
|
||||
# Used for `push` to easily modiffy the target workflow runs to compare against
|
||||
env:
|
||||
prev_workflow_run_id: ""
|
||||
other_workflow_run_id: ""
|
||||
@ -50,8 +50,70 @@ jobs:
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#transformers-ci-dummy"
|
||||
slack_report_channel: "#transformers-ci-daily-models"
|
||||
runner: daily-ci
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
secrets: inherit
|
||||
|
||||
torch-pipeline:
|
||||
name: Torch pipeline CI
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_pipelines_torch_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
|
||||
runner: daily-ci
|
||||
docker: huggingface/transformers-pytorch-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
secrets: inherit
|
||||
|
||||
example-ci:
|
||||
name: Example CI
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_examples_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-examples"
|
||||
runner: daily-ci
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
secrets: inherit
|
||||
|
||||
trainer-fsdp-ci:
|
||||
name: Trainer/FSDP CI
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_trainer_and_fsdp_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-training"
|
||||
runner: daily-ci
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
secrets: inherit
|
||||
|
||||
deepspeed-ci:
|
||||
name: DeepSpeed CI
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_torch_cuda_extensions_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-training"
|
||||
runner: daily-ci
|
||||
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||
ci_event: Daily CI
|
||||
working-directory-prefix: /workspace
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
secrets: inherit
|
||||
|
||||
quantization-ci:
|
||||
name: Quantization CI
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_quantization_torch_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-quantization"
|
||||
runner: daily-ci
|
||||
docker: huggingface/transformers-quantization-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
secrets: inherit
|
||||
|
342
.github/workflows/self-scheduled-intel-gaudi.yml
vendored
342
.github/workflows/self-scheduled-intel-gaudi.yml
vendored
@ -1,342 +0,0 @@
|
||||
name: Self-hosted runner (scheduled-intel-gaudi)
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
job:
|
||||
required: true
|
||||
type: string
|
||||
slack_report_channel:
|
||||
required: true
|
||||
type: string
|
||||
runner_scale_set:
|
||||
required: true
|
||||
type: string
|
||||
ci_event:
|
||||
required: true
|
||||
type: string
|
||||
report_repo_id:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
env:
|
||||
NUM_SLICES: 2
|
||||
RUN_SLOW: yes
|
||||
PT_HPU_LAZY_MODE: 0
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
PT_ENABLE_INT64_SUPPORT: 1
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
HF_HOME: /mnt/cache/.cache/huggingface
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
|
||||
name: Setup
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
||||
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
||||
quantization_matrix: ${{ steps.set-matrix.outputs.quantization_matrix }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
|
||||
- id: set-matrix
|
||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
|
||||
name: Identify models to test
|
||||
working-directory: tests
|
||||
run: |
|
||||
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
||||
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- id: set-matrix-quantization
|
||||
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
|
||||
name: Identify quantization method to test
|
||||
working-directory: tests
|
||||
run: |
|
||||
echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
|
||||
|
||||
run_models_gpu:
|
||||
if: ${{ inputs.job == 'run_models_gpu' }}
|
||||
name: " "
|
||||
needs: setup
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi, 2gaudi]
|
||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||
uses: ./.github/workflows/model_jobs_intel_gaudi.yml
|
||||
with:
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
machine_type: ${{ matrix.machine_type }}
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
secrets: inherit
|
||||
|
||||
run_trainer_and_fsdp_gpu:
|
||||
if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
|
||||
name: " "
|
||||
needs: setup
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi, 2gaudi]
|
||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||
uses: ./.github/workflows/model_jobs_intel_gaudi.yml
|
||||
with:
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
machine_type: ${{ matrix.machine_type }}
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
report_name_prefix: run_trainer_and_fsdp_gpu
|
||||
secrets: inherit
|
||||
|
||||
run_pipelines_torch_gpu:
|
||||
if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
|
||||
name: Pipelines
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi, 2gaudi]
|
||||
runs-on:
|
||||
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
container:
|
||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
options: --runtime=habana
|
||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
||||
--env HABANA_VISIBLE_DEVICES
|
||||
--env HABANA_VISIBLE_MODULES
|
||||
--cap-add=sys_nice
|
||||
--shm-size=64G
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
|
||||
|
||||
- name: HL-SMI
|
||||
run: |
|
||||
hl-smi
|
||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
||||
|
||||
- name: Environment
|
||||
run: python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run all pipeline tests on Intel Gaudi
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cat reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
|
||||
|
||||
run_examples_gpu:
|
||||
if: ${{ inputs.job == 'run_examples_gpu' }}
|
||||
name: Examples directory
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi]
|
||||
runs-on:
|
||||
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
container:
|
||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
options: --runtime=habana
|
||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
||||
--env HABANA_VISIBLE_DEVICES
|
||||
--env HABANA_VISIBLE_MODULES
|
||||
--cap-add=sys_nice
|
||||
--shm-size=64G
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
|
||||
|
||||
- name: HL-SMI
|
||||
run: |
|
||||
hl-smi
|
||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: |
|
||||
pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run examples tests on Intel Gaudi
|
||||
run: |
|
||||
pip install -r examples/pytorch/_tests_requirements.txt
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cat reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_examples_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
|
||||
|
||||
run_torch_cuda_extensions_gpu:
|
||||
if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
|
||||
name: Intel Gaudi deepspeed tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [1gaudi, 2gaudi]
|
||||
runs-on:
|
||||
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
container:
|
||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
options: --runtime=habana
|
||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
||||
--env HABANA_VISIBLE_DEVICES
|
||||
--env HABANA_VISIBLE_MODULES
|
||||
--cap-add=sys_nice
|
||||
--shm-size=64G
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
|
||||
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
|
||||
|
||||
- name: HL-SMI
|
||||
run: |
|
||||
hl-smi
|
||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: |
|
||||
pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run all deepspeed tests on intel Gaudi
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cat reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
|
||||
|
||||
send_results:
|
||||
name: Slack Report
|
||||
needs:
|
||||
[
|
||||
setup,
|
||||
run_models_gpu,
|
||||
run_examples_gpu,
|
||||
run_torch_cuda_extensions_gpu,
|
||||
run_pipelines_torch_gpu,
|
||||
run_trainer_and_fsdp_gpu,
|
||||
]
|
||||
if: ${{ always() }}
|
||||
uses: ./.github/workflows/slack-report.yml
|
||||
with:
|
||||
job: ${{ inputs.job }}
|
||||
setup_status: ${{ needs.setup.result }}
|
||||
slack_report_channel: ${{ inputs.slack_report_channel }}
|
||||
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
report_repo_id: ${{ inputs.report_repo_id }}
|
||||
ci_event: ${{ inputs.ci_event }}
|
||||
|
||||
secrets: inherit
|
@ -1,67 +0,0 @@
|
||||
name: Self-hosted runner (Intel Gaudi3 scheduled CI caller)
|
||||
|
||||
on:
|
||||
repository_dispatch:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "17 2 * * *"
|
||||
|
||||
jobs:
|
||||
model-ci:
|
||||
name: Model CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_models_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
|
||||
secrets: inherit
|
||||
|
||||
pipeline-ci:
|
||||
name: Pipeline CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_pipelines_torch_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
|
||||
secrets: inherit
|
||||
|
||||
example-ci:
|
||||
name: Example CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_examples_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
|
||||
secrets: inherit
|
||||
|
||||
deepspeed-ci:
|
||||
name: DeepSpeed CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_torch_cuda_extensions_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
|
||||
secrets: inherit
|
||||
|
||||
trainer-fsdp-ci:
|
||||
name: Trainer/FSDP CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_trainer_and_fsdp_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
||||
secrets: inherit
|
126
.github/workflows/self-scheduled.yml
vendored
126
.github/workflows/self-scheduled.yml
vendored
@ -15,6 +15,9 @@ on:
|
||||
slack_report_channel:
|
||||
required: true
|
||||
type: string
|
||||
runner:
|
||||
required: true
|
||||
type: string
|
||||
docker:
|
||||
required: true
|
||||
type: string
|
||||
@ -50,16 +53,15 @@ jobs:
|
||||
name: Setup
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
outputs:
|
||||
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
||||
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
||||
runner_map: ${{ steps.set-matrix.outputs.runner_map }}
|
||||
quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
|
||||
steps:
|
||||
- name: Update clone
|
||||
@ -86,7 +88,6 @@ jobs:
|
||||
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||
echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT
|
||||
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
||||
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
||||
@ -110,14 +111,14 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [single-gpu]
|
||||
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||
uses: ./.github/workflows/model_jobs.yml
|
||||
with:
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
machine_type: ${{ matrix.machine_type }}
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
runner_map: ${{ needs.setup.outputs.runner_map }}
|
||||
runner: ${{ inputs.runner }}
|
||||
docker: ${{ inputs.docker }}
|
||||
secrets: inherit
|
||||
|
||||
@ -128,14 +129,14 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||
slice_id: [0, 1]
|
||||
uses: ./.github/workflows/model_jobs.yml
|
||||
with:
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
machine_type: ${{ matrix.machine_type }}
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
runner_map: ${{ needs.setup.outputs.runner_map }}
|
||||
runner: ${{ inputs.runner }}
|
||||
docker: ${{ inputs.docker }}
|
||||
report_name_prefix: run_trainer_and_fsdp_gpu
|
||||
secrets: inherit
|
||||
@ -146,7 +147,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
@ -180,9 +181,9 @@ jobs:
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
@ -214,12 +215,12 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
machine_type: [aws-g4dn-4xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@ -248,9 +249,9 @@ jobs:
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
@ -283,7 +284,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
@ -345,9 +346,9 @@ jobs:
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
@ -382,7 +383,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
@ -425,9 +426,9 @@ jobs:
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
@ -452,3 +453,86 @@ jobs:
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
|
||||
|
||||
run_extract_warnings:
|
||||
# Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic.
|
||||
if: ${{ always() && inputs.job == 'run_models_gpu' }}
|
||||
name: Extract warnings in CI artifacts
|
||||
runs-on: ubuntu-22.04
|
||||
needs: [setup, run_models_gpu]
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Install transformers
|
||||
run: pip install transformers
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: pip freeze
|
||||
|
||||
- name: Create output directory
|
||||
run: mkdir warnings_in_ci
|
||||
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: warnings_in_ci
|
||||
|
||||
- name: Show artifacts
|
||||
run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')"
|
||||
working-directory: warnings_in_ci
|
||||
|
||||
- name: Extract warnings in CI artifacts
|
||||
run: |
|
||||
python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
|
||||
echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
|
||||
|
||||
- name: Upload artifact
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: warnings_in_ci
|
||||
path: warnings_in_ci/selected_warnings.json
|
||||
|
||||
send_results:
|
||||
name: Slack Report
|
||||
needs: [
|
||||
setup,
|
||||
run_models_gpu,
|
||||
run_trainer_and_fsdp_gpu,
|
||||
run_pipelines_torch_gpu,
|
||||
run_examples_gpu,
|
||||
run_torch_cuda_extensions_gpu,
|
||||
run_quantization_torch_gpu,
|
||||
run_extract_warnings
|
||||
]
|
||||
if: ${{ always() }}
|
||||
uses: ./.github/workflows/slack-report.yml
|
||||
with:
|
||||
job: ${{ inputs.job }}
|
||||
# This would be `skipped` if `setup` is skipped.
|
||||
setup_status: ${{ needs.setup.result }}
|
||||
slack_report_channel: ${{ inputs.slack_report_channel }}
|
||||
# This would be an empty string if `setup` is skipped.
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
|
||||
ci_event: ${{ inputs.ci_event }}
|
||||
report_repo_id: ${{ inputs.report_repo_id }}
|
||||
|
||||
secrets: inherit
|
||||
|
||||
check_new_failures:
|
||||
if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }}
|
||||
name: Check new failures
|
||||
needs: send_results
|
||||
uses: ./.github/workflows/check_failed_tests.yml
|
||||
with:
|
||||
docker: ${{ inputs.docker }}
|
||||
start_sha: ${{ github.sha }}
|
||||
job: ${{ inputs.job }}
|
||||
slack_report_channel: ${{ inputs.slack_report_channel }}
|
||||
ci_event: ${{ inputs.ci_event }}
|
||||
report_repo_id: ${{ inputs.report_repo_id }}
|
||||
|
||||
secrets: inherit
|
||||
|
3
.gitignore
vendored
3
.gitignore
vendored
@ -167,6 +167,3 @@ tags
|
||||
|
||||
# ruff
|
||||
.ruff_cache
|
||||
|
||||
# modular conversion
|
||||
*.modular_backup
|
||||
|
19
Makefile
19
Makefile
@ -8,19 +8,13 @@ check_dirs := examples tests src utils
|
||||
exclude_folders := ""
|
||||
|
||||
modified_only_fixup:
|
||||
@current_branch=$$(git branch --show-current); \
|
||||
if [ "$$current_branch" = "main" ]; then \
|
||||
echo "On main branch, running 'style' target instead..."; \
|
||||
$(MAKE) style; \
|
||||
$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
|
||||
@if test -n "$(modified_py_files)"; then \
|
||||
echo "Checking/fixing $(modified_py_files)"; \
|
||||
ruff check $(modified_py_files) --fix --exclude $(exclude_folders); \
|
||||
ruff format $(modified_py_files) --exclude $(exclude_folders);\
|
||||
else \
|
||||
modified_py_files=$$(python utils/get_modified_files.py $(check_dirs)); \
|
||||
if [ -n "$$modified_py_files" ]; then \
|
||||
echo "Checking/fixing files: $${modified_py_files}"; \
|
||||
ruff check $${modified_py_files} --fix --exclude $(exclude_folders); \
|
||||
ruff format $${modified_py_files} --exclude $(exclude_folders); \
|
||||
else \
|
||||
echo "No library .py files were modified"; \
|
||||
fi; \
|
||||
echo "No library .py files were modified"; \
|
||||
fi
|
||||
|
||||
# Update src/transformers/dependency_versions_table.py
|
||||
@ -52,7 +46,6 @@ repo-consistency:
|
||||
python utils/check_doctest_list.py
|
||||
python utils/update_metadata.py --check-only
|
||||
python utils/check_docstrings.py
|
||||
python utils/add_dates.py
|
||||
|
||||
# this target runs checks on all files
|
||||
|
||||
|
@ -44,7 +44,7 @@ limitations under the License.
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Рortuguês</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
|
||||
<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
|
||||
@ -242,7 +242,7 @@ pipeline(
|
||||
|
||||
- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
|
||||
- The training API is optimized to work with PyTorch models provided by Transformers. For generic machine learning loops, you should use another library like [Accelerate](https://huggingface.co/docs/accelerate).
|
||||
- The [example scripts](https://github.com/huggingface/transformers/tree/main/examples) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work.
|
||||
- The [example scripts]((https://github.com/huggingface/transformers/tree/main/examples)) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work.
|
||||
|
||||
## 100 projects using Transformers
|
||||
|
||||
@ -280,8 +280,8 @@ Expand each modality below to see a few example models for various use cases.
|
||||
- Automatic mask generation with [SAM](https://huggingface.co/facebook/sam-vit-base)
|
||||
- Depth estimation with [DepthPro](https://huggingface.co/apple/DepthPro-hf)
|
||||
- Image classification with [DINO v2](https://huggingface.co/facebook/dinov2-base)
|
||||
- Keypoint detection with [SuperPoint](https://huggingface.co/magic-leap-community/superpoint)
|
||||
- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor)
|
||||
- Keypoint detection with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor)
|
||||
- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue)
|
||||
- Object detection with [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd)
|
||||
- Pose Estimation with [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple)
|
||||
- Universal segmentation with [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large)
|
||||
|
@ -14,7 +14,7 @@ Models uploaded on the Hugging Face Hub come in different formats. We heavily re
|
||||
models in the [`safetensors`](https://github.com/huggingface/safetensors) format (which is the default prioritized
|
||||
by the transformers library), as developed specifically to prevent arbitrary code execution on your system.
|
||||
|
||||
To avoid loading models from unsafe formats (e.g. [pickle](https://docs.python.org/3/library/pickle.html), you should use the `use_safetensors` parameter. If doing so, in the event that no .safetensors file is present, transformers will error when loading the model.
|
||||
To avoid loading models from unsafe formats(e.g. [pickle](https://docs.python.org/3/library/pickle.html), you should use the `use_safetensors` parameter. If doing so, in the event that no .safetensors file is present, transformers will error when loading the model.
|
||||
|
||||
### Remote code
|
||||
|
||||
|
@ -288,7 +288,7 @@ Keywords: Music understanding, Music generation
|
||||
|
||||
## [dalle-flow](https://github.com/jina-ai/dalle-flow)
|
||||
|
||||
DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. It leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt.
|
||||
DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. Itt leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt.
|
||||
The preferred candidate is fed to GLID-3 XL for diffusion, which often enriches the texture and background. Finally, the candidate is upscaled to 1024x1024 via SwinIR.
|
||||
|
||||
Keywords: High-definition image generation, Stable Diffusion, DALL-E Mega, GLID-3 XL, CLIP, SwinIR
|
||||
@ -526,7 +526,7 @@ Keywords: Model deployment, CLoud, Mobile, Edge
|
||||
|
||||
## [underthesea](https://github.com/undertheseanlp/underthesea)
|
||||
|
||||
[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules data sets and tutorials supporting research and development in Vietnamese Natural Language Processing. We provide extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing.
|
||||
[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules data sets and tutorials supporting research and development in Vietnamese Natural Language Processing. We provides extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing.
|
||||
|
||||
Keywords: Vietnamese, NLP
|
||||
|
||||
|
1
benchmark/.gitignore
vendored
1
benchmark/.gitignore
vendored
@ -1 +0,0 @@
|
||||
benchmark_results/
|
@ -1,345 +0,0 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from logging import Logger
|
||||
import os
|
||||
from threading import Event, Thread
|
||||
from time import perf_counter, sleep
|
||||
from typing import Optional
|
||||
import sys
|
||||
|
||||
# Add the parent directory to Python path to import benchmarks_entrypoint
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from benchmarks_entrypoint import MetricsRecorder
|
||||
|
||||
import gpustat
|
||||
import psutil
|
||||
import psycopg2
|
||||
|
||||
# Optional heavy ML dependencies - only required when actually running the benchmark
|
||||
try:
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
|
||||
TRANSFORMERS_AVAILABLE = True
|
||||
except ImportError:
|
||||
TRANSFORMERS_AVAILABLE = False
|
||||
torch = None
|
||||
AutoModelForCausalLM = None
|
||||
AutoTokenizer = None
|
||||
GenerationConfig = None
|
||||
StaticCache = None
|
||||
|
||||
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "1"
|
||||
|
||||
# Only set torch precision if torch is available
|
||||
if TRANSFORMERS_AVAILABLE:
|
||||
torch.set_float32_matmul_precision("high")
|
||||
|
||||
|
||||
def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
|
||||
p = psutil.Process(os.getpid())
|
||||
while not continue_metric_collection.is_set():
|
||||
with p.oneshot():
|
||||
cpu_util = p.cpu_percent()
|
||||
mem_megabytes = p.memory_info().rss / (1024 * 1024)
|
||||
gpu_stats = gpustat.GPUStatCollection.new_query()
|
||||
gpu_util = gpu_stats[0]["utilization.gpu"]
|
||||
gpu_mem_megabytes = gpu_stats[0]["memory.used"]
|
||||
metrics_recorder.collect_device_measurements(
|
||||
benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes
|
||||
)
|
||||
sleep(0.01)
|
||||
|
||||
|
||||
def run_benchmark(
|
||||
logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, metrics_recorder=None, num_tokens_to_generate=100
|
||||
):
|
||||
# Check if required ML dependencies are available
|
||||
if not TRANSFORMERS_AVAILABLE:
|
||||
logger.error("Transformers and torch are required to run the LLaMA benchmark. Please install them with:")
|
||||
logger.error("pip install torch transformers")
|
||||
logger.error("Skipping LLaMA benchmark due to missing dependencies.")
|
||||
return
|
||||
|
||||
continue_metric_collection = Event()
|
||||
metrics_thread = None
|
||||
model_id = "meta-llama/Llama-2-7b-hf"
|
||||
|
||||
# If no metrics_recorder is provided, create one for backward compatibility
|
||||
if metrics_recorder is None:
|
||||
try:
|
||||
metrics_recorder = MetricsRecorder(
|
||||
psycopg2.connect("dbname=metrics"), logger, repository, branch, commit_id, commit_msg, True
|
||||
)
|
||||
should_close_recorder = True
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create metrics recorder: {e}")
|
||||
return
|
||||
else:
|
||||
should_close_recorder = False
|
||||
try:
|
||||
gpu_stats = gpustat.GPUStatCollection.new_query()
|
||||
gpu_name = gpu_stats[0]["name"]
|
||||
benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id})
|
||||
logger.info(f"running benchmark #{benchmark_id} on {gpu_name} for {model_id}")
|
||||
metrics_thread = Thread(
|
||||
target=collect_metrics,
|
||||
args=[benchmark_id, continue_metric_collection, metrics_recorder],
|
||||
)
|
||||
metrics_thread.start()
|
||||
logger.info("started background thread to fetch device metrics")
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling
|
||||
|
||||
device = "cuda"
|
||||
|
||||
logger.info("downloading weights")
|
||||
# This is to avoid counting download in model load time measurement
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
|
||||
gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1)
|
||||
logger.info("loading model")
|
||||
start = perf_counter()
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id, torch_dtype=torch.float16, generation_config=gen_config
|
||||
).eval()
|
||||
model.to(device)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
model_load_time = end - start
|
||||
logger.info(f"loaded model in: {model_load_time}s")
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
prompt = "Why dogs are so cute?"
|
||||
inputs = tokenizer(prompt, return_tensors="pt").to(device)
|
||||
|
||||
# Specify the max length (including both the prompt and the response)
|
||||
# When calling `generate` with `cache_implementation="static" later, this is also used to create a `StaticCache` object
|
||||
# with sequence length = `max_length`. The longer the more you will re-use it
|
||||
seq_length = inputs["input_ids"].shape[1]
|
||||
model.generation_config.max_length = seq_length + num_tokens_to_generate
|
||||
batch_size = inputs["input_ids"].shape[0]
|
||||
|
||||
# Copied from the gpt-fast repo
|
||||
def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization
|
||||
q = torch.empty_like(probs_sort).exponential_(1)
|
||||
return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
|
||||
|
||||
def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
|
||||
logits = logits / max(temperature, 1e-5)
|
||||
|
||||
if top_k is not None:
|
||||
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
|
||||
pivot = v.select(-1, -1).unsqueeze(-1)
|
||||
logits = torch.where(logits < pivot, -float("Inf"), logits)
|
||||
probs = torch.nn.functional.softmax(logits, dim=-1)
|
||||
return probs
|
||||
|
||||
def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
|
||||
probs = logits_to_probs(logits[0, -1], temperature, top_k)
|
||||
idx_next = multinomial_sample_one_no_sync(probs)
|
||||
return idx_next, probs
|
||||
|
||||
# First eager forward pass
|
||||
logger.info("running first eager forward pass")
|
||||
start = perf_counter()
|
||||
outputs = model(**inputs)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
first_eager_fwd_pass_time = end - start
|
||||
logger.info(f"completed first eager forward pass in: {first_eager_fwd_pass_time}s")
|
||||
|
||||
# Second eager forward pass (should be faster)
|
||||
logger.info("running second eager forward pass")
|
||||
start = perf_counter()
|
||||
outputs = model(**inputs)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
second_eager_fwd_pass_time = end - start
|
||||
logger.info(f"completed second eager forward pass in: {second_eager_fwd_pass_time}s")
|
||||
|
||||
# First eager generation
|
||||
logger.info("running first eager generation")
|
||||
start = perf_counter()
|
||||
output = model.generate(**inputs)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
first_eager_generate_time = end - start
|
||||
logger.info(f"completed first eager generation in: {first_eager_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
# Second eager generation (should be faster)
|
||||
logger.info("running second eager generation")
|
||||
start = perf_counter()
|
||||
output = model.generate(**inputs)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
second_eager_generate_time = end - start
|
||||
logger.info(f"completed second eager generation in: {second_eager_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
logger.info("running generation timing loop")
|
||||
|
||||
input_pos = torch.arange(0, seq_length, device=device)
|
||||
inputs = inputs["input_ids"]
|
||||
|
||||
start = perf_counter()
|
||||
with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
|
||||
logits = model(inputs, position_ids=input_pos).logits
|
||||
next_token, probs = sample(logits, temperature=0.6, top_k=5)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
time_to_first_token = end - start
|
||||
|
||||
input_pos = torch.tensor([seq_length], device=device, dtype=torch.int)
|
||||
next_token = next_token.clone()
|
||||
start = perf_counter()
|
||||
with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
|
||||
logits = model(next_token, position_ids=input_pos).logits
|
||||
next_token, probs = sample(logits, temperature=0.6, top_k=5)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
time_to_second_token = end - start
|
||||
|
||||
input_pos = torch.tensor([seq_length + 1], device=device, dtype=torch.int)
|
||||
next_token = next_token.clone()
|
||||
start = perf_counter()
|
||||
with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
|
||||
logits = model(next_token, position_ids=input_pos).logits
|
||||
next_token, probs = sample(logits, temperature=0.6, top_k=5)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
time_to_third_token = end - start
|
||||
|
||||
logger.info("running longer generation timing loop")
|
||||
|
||||
total_time = 0
|
||||
for i in range(20):
|
||||
input_pos = torch.tensor([seq_length + 2 + i], device=device, dtype=torch.int)
|
||||
next_token = next_token.clone()
|
||||
start = perf_counter()
|
||||
with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
|
||||
logits = model(next_token, position_ids=input_pos).logits
|
||||
next_token, probs = sample(logits, temperature=0.6, top_k=5)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
total_time += end - start
|
||||
|
||||
mean_time_to_next_token = total_time / 20
|
||||
|
||||
logger.info("running compilation benchmarks")
|
||||
|
||||
# Now compile the model
|
||||
model = torch.compile(model, mode="max-autotune", fullgraph=True)
|
||||
|
||||
# StaticCache for generation
|
||||
with torch.device(device):
|
||||
model.setup_caches(max_batch_size=batch_size, max_seq_len=seq_length + num_tokens_to_generate)
|
||||
|
||||
input_pos = torch.arange(0, seq_length, device=device)
|
||||
inputs = tokenizer(prompt, return_tensors="pt").to(device)["input_ids"]
|
||||
|
||||
logger.info("compiling model")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, generation_config=gen_config)
|
||||
model.to(device)
|
||||
model = torch.compile(model, mode="max-autotune", fullgraph=True)
|
||||
|
||||
past_key_values = StaticCache(
|
||||
model.config,
|
||||
max_batch_size=batch_size,
|
||||
device=device,
|
||||
dtype=torch.float16,
|
||||
max_cache_len=seq_length + 128,
|
||||
)
|
||||
# 1st call
|
||||
start = perf_counter()
|
||||
output = model.generate(**inputs, past_key_values=past_key_values)
|
||||
end = perf_counter()
|
||||
first_compile_generate_time = end - start
|
||||
logger.info(f"completed first compile generation in: {first_compile_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
past_key_values = StaticCache(
|
||||
model.config,
|
||||
max_batch_size=batch_size,
|
||||
device=device,
|
||||
dtype=torch.float16,
|
||||
max_cache_len=seq_length + 128,
|
||||
)
|
||||
# 2nd call
|
||||
start = perf_counter()
|
||||
output = model.generate(**inputs, past_key_values=past_key_values)
|
||||
end = perf_counter()
|
||||
second_compile_generate_time = end - start
|
||||
logger.info(f"completed second compile generation in: {second_compile_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
past_key_values = StaticCache(
|
||||
model.config,
|
||||
max_batch_size=batch_size,
|
||||
device=device,
|
||||
dtype=torch.float16,
|
||||
max_cache_len=seq_length + 128,
|
||||
)
|
||||
# 3rd call
|
||||
start = perf_counter()
|
||||
output = model.generate(**inputs, past_key_values=past_key_values)
|
||||
end = perf_counter()
|
||||
third_compile_generate_time = end - start
|
||||
logger.info(f"completed third compile generation in: {third_compile_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
past_key_values = StaticCache(
|
||||
model.config,
|
||||
max_batch_size=batch_size,
|
||||
device=device,
|
||||
dtype=torch.float16,
|
||||
max_cache_len=seq_length + 128,
|
||||
)
|
||||
# 4th call
|
||||
start = perf_counter()
|
||||
output = model.generate(**inputs, past_key_values=past_key_values)
|
||||
end = perf_counter()
|
||||
fourth_compile_generate_time = end - start
|
||||
logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
metrics_recorder.collect_model_measurements(
|
||||
benchmark_id,
|
||||
{
|
||||
"model_load_time": model_load_time,
|
||||
"first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
|
||||
"second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
|
||||
"first_eager_generate_time_secs": first_eager_generate_time,
|
||||
"second_eager_generate_time_secs": second_eager_generate_time,
|
||||
"time_to_first_token_secs": time_to_first_token,
|
||||
"time_to_second_token_secs": time_to_second_token,
|
||||
"time_to_third_token_secs": time_to_third_token,
|
||||
"time_to_next_token_mean_secs": mean_time_to_next_token,
|
||||
"first_compile_generate_time_secs": first_compile_generate_time,
|
||||
"second_compile_generate_time_secs": second_compile_generate_time,
|
||||
"third_compile_generate_time_secs": third_compile_generate_time,
|
||||
"fourth_compile_generate_time_secs": fourth_compile_generate_time,
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Caught exception: {e}")
|
||||
continue_metric_collection.set()
|
||||
if metrics_thread is not None:
|
||||
metrics_thread.join()
|
||||
|
||||
# Only close the recorder if we created it locally
|
||||
if should_close_recorder:
|
||||
metrics_recorder.close()
|
@ -1,35 +1,15 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import importlib.util
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Dict, Tuple, Optional, List
|
||||
from typing import Dict, Tuple
|
||||
|
||||
import pandas as pd
|
||||
from psycopg2.extensions import register_adapter
|
||||
from psycopg2.extras import Json
|
||||
|
||||
try:
|
||||
from psycopg2.extensions import register_adapter
|
||||
from psycopg2.extras import Json
|
||||
register_adapter(dict, Json)
|
||||
PSYCOPG2_AVAILABLE = True
|
||||
except ImportError:
|
||||
PSYCOPG2_AVAILABLE = False
|
||||
|
||||
register_adapter(dict, Json)
|
||||
|
||||
|
||||
class ImportModuleException(Exception):
|
||||
@ -38,239 +18,61 @@ class ImportModuleException(Exception):
|
||||
|
||||
class MetricsRecorder:
|
||||
def __init__(
|
||||
self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str,
|
||||
collect_csv_data: bool = True
|
||||
self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str
|
||||
):
|
||||
self.conn = connection
|
||||
self.use_database = connection is not None
|
||||
if self.use_database:
|
||||
self.conn.autocommit = True
|
||||
self.conn.autocommit = True
|
||||
self.logger = logger
|
||||
self.repository = repository
|
||||
self.branch = branch
|
||||
self.commit_id = commit_id
|
||||
self.commit_msg = commit_msg
|
||||
self.collect_csv_data = collect_csv_data
|
||||
|
||||
# For CSV export - store all data in pandas DataFrames (only if CSV collection is enabled)
|
||||
if self.collect_csv_data:
|
||||
# Initialize empty DataFrames with proper schemas
|
||||
self.benchmarks_df = pd.DataFrame(columns=[
|
||||
'benchmark_id', 'repository', 'branch', 'commit_id', 'commit_message',
|
||||
'metadata', 'created_at'
|
||||
])
|
||||
self.device_measurements_df = pd.DataFrame(columns=[
|
||||
'benchmark_id', 'cpu_util', 'mem_megabytes', 'gpu_util',
|
||||
'gpu_mem_megabytes', 'time'
|
||||
])
|
||||
self.model_measurements_df = pd.DataFrame(columns=[
|
||||
'benchmark_id', 'time', 'model_load_time', 'first_eager_forward_pass_time_secs',
|
||||
'second_eager_forward_pass_time_secs', 'first_eager_generate_time_secs',
|
||||
'second_eager_generate_time_secs', 'time_to_first_token_secs',
|
||||
'time_to_second_token_secs', 'time_to_third_token_secs',
|
||||
'time_to_next_token_mean_secs', 'first_compile_generate_time_secs',
|
||||
'second_compile_generate_time_secs', 'third_compile_generate_time_secs',
|
||||
'fourth_compile_generate_time_secs'
|
||||
])
|
||||
else:
|
||||
self.benchmarks_df = None
|
||||
self.device_measurements_df = None
|
||||
self.model_measurements_df = None
|
||||
|
||||
def initialise_benchmark(self, metadata: dict[str, str]) -> str:
|
||||
def initialise_benchmark(self, metadata: dict[str, str]) -> int:
|
||||
"""
|
||||
Creates a new benchmark, returns the benchmark id (UUID)
|
||||
Creates a new benchmark, returns the benchmark id
|
||||
"""
|
||||
# Generate a unique UUID for this benchmark
|
||||
benchmark_id = str(uuid.uuid4())
|
||||
|
||||
if self.use_database:
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"INSERT INTO benchmarks (benchmark_id, repository, branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s, %s, %s)",
|
||||
(benchmark_id, self.repository, self.branch, self.commit_id, self.commit_msg, metadata),
|
||||
)
|
||||
self.logger.debug(f"initialised benchmark #{benchmark_id}")
|
||||
|
||||
# Store benchmark data for CSV export (if enabled)
|
||||
if self.collect_csv_data:
|
||||
# Add row to pandas DataFrame
|
||||
new_row = pd.DataFrame([{
|
||||
'benchmark_id': benchmark_id,
|
||||
'repository': self.repository,
|
||||
'branch': self.branch,
|
||||
'commit_id': self.commit_id,
|
||||
'commit_message': self.commit_msg,
|
||||
'metadata': json.dumps(metadata),
|
||||
'created_at': datetime.utcnow().isoformat()
|
||||
}])
|
||||
self.benchmarks_df = pd.concat([self.benchmarks_df, new_row], ignore_index=True)
|
||||
|
||||
mode_info = []
|
||||
if self.use_database:
|
||||
mode_info.append("database")
|
||||
if self.collect_csv_data:
|
||||
mode_info.append("CSV")
|
||||
mode_str = " + ".join(mode_info) if mode_info else "no storage"
|
||||
|
||||
self.logger.debug(f"initialised benchmark #{benchmark_id} ({mode_str} mode)")
|
||||
return benchmark_id
|
||||
# gpu_name: str, model_id: str
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"INSERT INTO benchmarks (repository, branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s, %s) RETURNING benchmark_id",
|
||||
(self.repository, self.branch, self.commit_id, self.commit_msg, metadata),
|
||||
)
|
||||
benchmark_id = cur.fetchone()[0]
|
||||
logger.debug(f"initialised benchmark #{benchmark_id}")
|
||||
return benchmark_id
|
||||
|
||||
def collect_device_measurements(self, benchmark_id: str, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes):
|
||||
def collect_device_measurements(self, benchmark_id: int, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes):
|
||||
"""
|
||||
Collect device metrics, such as CPU & GPU usage. These are "static", as in you cannot pass arbitrary arguments to the function.
|
||||
"""
|
||||
# Store device measurements for CSV export (if enabled)
|
||||
if self.collect_csv_data:
|
||||
# Add row to pandas DataFrame
|
||||
new_row = pd.DataFrame([{
|
||||
'benchmark_id': benchmark_id,
|
||||
'cpu_util': cpu_util,
|
||||
'mem_megabytes': mem_megabytes,
|
||||
'gpu_util': gpu_util,
|
||||
'gpu_mem_megabytes': gpu_mem_megabytes,
|
||||
'time': datetime.utcnow().isoformat()
|
||||
}])
|
||||
self.device_measurements_df = pd.concat([self.device_measurements_df, new_row], ignore_index=True)
|
||||
|
||||
# Store in database if available
|
||||
if self.use_database:
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
|
||||
(benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
|
||||
)
|
||||
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
|
||||
(benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
|
||||
)
|
||||
self.logger.debug(
|
||||
f"collected device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
|
||||
f"inserted device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
|
||||
)
|
||||
|
||||
def collect_model_measurements(self, benchmark_id: str, measurements: dict[str, float]):
|
||||
# Store model measurements for CSV export (if enabled)
|
||||
if self.collect_csv_data:
|
||||
# Add row to pandas DataFrame with flattened measurements
|
||||
row_data = {
|
||||
'benchmark_id': benchmark_id,
|
||||
'time': datetime.utcnow().isoformat()
|
||||
}
|
||||
# Flatten the measurements dict into the row
|
||||
row_data.update(measurements)
|
||||
|
||||
new_row = pd.DataFrame([row_data])
|
||||
self.model_measurements_df = pd.concat([self.model_measurements_df, new_row], ignore_index=True)
|
||||
|
||||
# Store in database if available
|
||||
if self.use_database:
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO model_measurements (
|
||||
benchmark_id,
|
||||
measurements
|
||||
) VALUES (%s, %s)
|
||||
""",
|
||||
(
|
||||
benchmark_id,
|
||||
measurements,
|
||||
),
|
||||
)
|
||||
|
||||
self.logger.debug(f"collected model measurements for benchmark #{benchmark_id}: {measurements}")
|
||||
|
||||
def export_to_csv(self, output_dir: str = "benchmark_results"):
|
||||
"""
|
||||
Export all collected data to CSV files using pandas DataFrames
|
||||
"""
|
||||
if not self.collect_csv_data:
|
||||
self.logger.warning("CSV data collection is disabled - no CSV files will be generated")
|
||||
return
|
||||
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
self.logger.info(f"Created output directory: {output_dir}")
|
||||
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
files_created = []
|
||||
|
||||
# Export using pandas DataFrames
|
||||
self._export_pandas_data(output_dir, timestamp, files_created)
|
||||
|
||||
self.logger.info(f"CSV export complete! Created {len(files_created)} files in {output_dir}")
|
||||
|
||||
def _export_pandas_data(self, output_dir: str, timestamp: str, files_created: list):
|
||||
"""
|
||||
Export CSV files using pandas DataFrames
|
||||
"""
|
||||
# Export benchmarks
|
||||
benchmarks_file = os.path.join(output_dir, f"benchmarks_{timestamp}.csv")
|
||||
self.benchmarks_df.to_csv(benchmarks_file, index=False)
|
||||
files_created.append(benchmarks_file)
|
||||
self.logger.info(f"Exported {len(self.benchmarks_df)} benchmark records to {benchmarks_file}")
|
||||
|
||||
# Export device measurements
|
||||
device_file = os.path.join(output_dir, f"device_measurements_{timestamp}.csv")
|
||||
self.device_measurements_df.to_csv(device_file, index=False)
|
||||
files_created.append(device_file)
|
||||
self.logger.info(f"Exported {len(self.device_measurements_df)} device measurement records to {device_file}")
|
||||
|
||||
# Export model measurements (already flattened)
|
||||
model_file = os.path.join(output_dir, f"model_measurements_{timestamp}.csv")
|
||||
self.model_measurements_df.to_csv(model_file, index=False)
|
||||
files_created.append(model_file)
|
||||
self.logger.info(f"Exported {len(self.model_measurements_df)} model measurement records to {model_file}")
|
||||
|
||||
# Create comprehensive summary using pandas operations
|
||||
summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.csv")
|
||||
self._create_summary(summary_file)
|
||||
files_created.append(summary_file)
|
||||
|
||||
def _create_summary(self, summary_file: str):
|
||||
"""
|
||||
Create a comprehensive summary CSV using pandas operations
|
||||
"""
|
||||
if len(self.benchmarks_df) == 0:
|
||||
# Create empty summary file
|
||||
summary_df = pd.DataFrame()
|
||||
summary_df.to_csv(summary_file, index=False)
|
||||
self.logger.info(f"Created empty benchmark summary at {summary_file}")
|
||||
return
|
||||
|
||||
# Start with benchmarks as the base
|
||||
summary_df = self.benchmarks_df.copy()
|
||||
|
||||
# Add model measurements (join on benchmark_id)
|
||||
if len(self.model_measurements_df) > 0:
|
||||
# Drop 'time' column from model measurements to avoid conflicts
|
||||
model_df = self.model_measurements_df.drop(columns=['time'], errors='ignore')
|
||||
summary_df = summary_df.merge(model_df, on='benchmark_id', how='left')
|
||||
|
||||
# Calculate device measurement aggregates using pandas groupby
|
||||
if len(self.device_measurements_df) > 0:
|
||||
device_agg = self.device_measurements_df.groupby('benchmark_id').agg({
|
||||
'cpu_util': ['mean', 'max', 'std', 'count'],
|
||||
'mem_megabytes': ['mean', 'max', 'std'],
|
||||
'gpu_util': ['mean', 'max', 'std'],
|
||||
'gpu_mem_megabytes': ['mean', 'max', 'std']
|
||||
}).round(3)
|
||||
|
||||
# Flatten column names
|
||||
device_agg.columns = [f"{col[0]}_{col[1]}" for col in device_agg.columns]
|
||||
device_agg = device_agg.reset_index()
|
||||
|
||||
# Rename count column to be more descriptive
|
||||
if 'cpu_util_count' in device_agg.columns:
|
||||
device_agg = device_agg.rename(columns={'cpu_util_count': 'device_measurement_count'})
|
||||
|
||||
# Merge with summary
|
||||
summary_df = summary_df.merge(device_agg, on='benchmark_id', how='left')
|
||||
|
||||
# Export the comprehensive summary
|
||||
summary_df.to_csv(summary_file, index=False)
|
||||
self.logger.info(f"Created comprehensive benchmark summary with {len(summary_df)} records at {summary_file}")
|
||||
def collect_model_measurements(self, benchmark_id: int, measurements: dict[str, float]):
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO model_measurements (
|
||||
benchmark_id,
|
||||
measurements
|
||||
) VALUES (%s, %s)
|
||||
""",
|
||||
(
|
||||
benchmark_id,
|
||||
measurements,
|
||||
),
|
||||
)
|
||||
self.logger.debug(f"inserted model measurements for benchmark #{benchmark_id}: {measurements}")
|
||||
|
||||
def close(self):
|
||||
if self.use_database and self.conn:
|
||||
self.conn.close()
|
||||
self.conn.close()
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -283,7 +85,7 @@ handler.setFormatter(formatter)
|
||||
logger.addHandler(handler)
|
||||
|
||||
|
||||
def parse_arguments() -> tuple[str, str, str, str, bool, str]:
|
||||
def parse_arguments() -> tuple[str, str, str, str]:
|
||||
"""
|
||||
Parse command line arguments for the benchmarking CLI.
|
||||
"""
|
||||
@ -312,27 +114,10 @@ def parse_arguments() -> tuple[str, str, str, str, bool, str]:
|
||||
type=str,
|
||||
help="The commit message associated with the commit, truncated to 70 characters.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--csv",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Enable CSV output files generation."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--csv-output-dir",
|
||||
type=str,
|
||||
default="benchmark_results",
|
||||
help="Directory for CSV output files (default: benchmark_results)."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# CSV is disabled by default, only enabled when --csv is used
|
||||
generate_csv = args.csv
|
||||
|
||||
return args.repository, args.branch, args.commit_id, args.commit_msg, generate_csv, args.csv_output_dir
|
||||
return args.repository, args.branch, args.commit_id, args.commit_msg
|
||||
|
||||
|
||||
def import_from_path(module_name, file_path):
|
||||
@ -346,124 +131,22 @@ def import_from_path(module_name, file_path):
|
||||
raise ImportModuleException(f"failed to load python module: {e}")
|
||||
|
||||
|
||||
def create_database_connection():
|
||||
"""
|
||||
Try to create a database connection. Returns None if connection fails.
|
||||
"""
|
||||
if not PSYCOPG2_AVAILABLE:
|
||||
logger.warning("psycopg2 not available - running in CSV-only mode")
|
||||
return None
|
||||
|
||||
try:
|
||||
import psycopg2
|
||||
conn = psycopg2.connect("dbname=metrics")
|
||||
logger.info("Successfully connected to database")
|
||||
return conn
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to connect to database: {e}. Running in CSV-only mode")
|
||||
return None
|
||||
|
||||
|
||||
def create_global_metrics_recorder(repository: str, branch: str, commit_id: str, commit_msg: str,
|
||||
generate_csv: bool = False) -> MetricsRecorder:
|
||||
"""
|
||||
Create a global metrics recorder that will be used across all benchmarks.
|
||||
"""
|
||||
connection = create_database_connection()
|
||||
recorder = MetricsRecorder(connection, logger, repository, branch, commit_id, commit_msg, generate_csv)
|
||||
|
||||
# Log the storage mode
|
||||
storage_modes = []
|
||||
if connection is not None:
|
||||
storage_modes.append("database")
|
||||
if generate_csv:
|
||||
storage_modes.append("CSV")
|
||||
|
||||
if not storage_modes:
|
||||
logger.warning("Running benchmarks with NO data storage (no database connection, CSV disabled)")
|
||||
logger.warning("Use --csv flag to enable CSV output when database is unavailable")
|
||||
else:
|
||||
logger.info(f"Running benchmarks with: {' + '.join(storage_modes)} storage")
|
||||
|
||||
return recorder
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__))
|
||||
benches_folder_path = os.path.join(benchmarks_folder_path, "benches")
|
||||
|
||||
repository, branch, commit_id, commit_msg, generate_csv, csv_output_dir = parse_arguments()
|
||||
|
||||
# Create a global metrics recorder
|
||||
global_metrics_recorder = create_global_metrics_recorder(repository, branch, commit_id, commit_msg, generate_csv)
|
||||
|
||||
successful_benchmarks = 0
|
||||
failed_benchmarks = 0
|
||||
|
||||
# Automatically discover all benchmark modules in benches/ folder
|
||||
benchmark_modules = []
|
||||
|
||||
if os.path.exists(benches_folder_path):
|
||||
logger.debug(f"Scanning for benchmarks in: {benches_folder_path}")
|
||||
for entry in os.scandir(benches_folder_path):
|
||||
repository, branch, commit_id, commit_msg = parse_arguments()
|
||||
|
||||
for entry in os.scandir(benchmarks_folder_path):
|
||||
try:
|
||||
if not entry.name.endswith(".py"):
|
||||
continue
|
||||
if entry.name.startswith("__"): # Skip __init__.py, __pycache__, etc.
|
||||
if entry.path == __file__:
|
||||
continue
|
||||
|
||||
# Check if the file has a run_benchmark function
|
||||
try:
|
||||
logger.debug(f"checking if benches/{entry.name} has run_benchmark function")
|
||||
module = import_from_path(entry.name.split(".")[0], entry.path)
|
||||
if hasattr(module, 'run_benchmark'):
|
||||
benchmark_modules.append(entry.name)
|
||||
logger.debug(f"discovered benchmark: {entry.name}")
|
||||
else:
|
||||
logger.debug(f"skipping {entry.name} - no run_benchmark function found")
|
||||
except Exception as e:
|
||||
logger.debug(f"failed to check benches/{entry.name}: {e}")
|
||||
else:
|
||||
logger.warning(f"Benches directory not found: {benches_folder_path}")
|
||||
|
||||
if benchmark_modules:
|
||||
logger.info(f"Discovered {len(benchmark_modules)} benchmark(s): {benchmark_modules}")
|
||||
else:
|
||||
logger.warning("No benchmark modules found in benches/ directory")
|
||||
|
||||
for module_name in benchmark_modules:
|
||||
module_path = os.path.join(benches_folder_path, module_name)
|
||||
try:
|
||||
logger.debug(f"loading: {module_name}")
|
||||
module = import_from_path(module_name.split(".")[0], module_path)
|
||||
logger.info(f"running benchmarks in: {module_name}")
|
||||
|
||||
# Check if the module has an updated run_benchmark function that accepts metrics_recorder
|
||||
try:
|
||||
# Try the new signature first
|
||||
module.run_benchmark(logger, repository, branch, commit_id, commit_msg, global_metrics_recorder)
|
||||
except TypeError:
|
||||
# Fall back to the old signature for backward compatibility
|
||||
logger.warning(f"Module {module_name} using old run_benchmark signature - database connection will be created per module")
|
||||
module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
|
||||
|
||||
successful_benchmarks += 1
|
||||
logger.debug(f"loading: {entry.name}")
|
||||
module = import_from_path(entry.name.split(".")[0], entry.path)
|
||||
logger.info(f"running benchmarks in: {entry.name}")
|
||||
module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
|
||||
except ImportModuleException as e:
|
||||
logger.error(e)
|
||||
failed_benchmarks += 1
|
||||
except Exception as e:
|
||||
logger.error(f"error running benchmarks for {module_name}: {e}")
|
||||
failed_benchmarks += 1
|
||||
|
||||
# Export CSV results at the end (if enabled)
|
||||
try:
|
||||
if generate_csv:
|
||||
global_metrics_recorder.export_to_csv(csv_output_dir)
|
||||
logger.info(f"CSV reports have been generated and saved to the {csv_output_dir} directory")
|
||||
else:
|
||||
logger.info("CSV generation disabled - no CSV files created (use --csv to enable)")
|
||||
|
||||
logger.info(f"Benchmark run completed. Successful: {successful_benchmarks}, Failed: {failed_benchmarks}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to export CSV results: {e}")
|
||||
finally:
|
||||
global_metrics_recorder.close()
|
||||
logger.error(f"error running benchmarks for {entry.name}: {e}")
|
||||
|
34
benchmark/init_db.sql
Normal file
34
benchmark/init_db.sql
Normal file
@ -0,0 +1,34 @@
|
||||
CREATE TABLE IF NOT EXISTS benchmarks (
|
||||
benchmark_id SERIAL PRIMARY KEY,
|
||||
repository VARCHAR(255),
|
||||
branch VARCHAR(255),
|
||||
commit_id VARCHAR(72),
|
||||
commit_message VARCHAR(70),
|
||||
metadata jsonb,
|
||||
created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS benchmarks_benchmark_id_idx ON benchmarks (benchmark_id);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS benchmarks_branch_idx ON benchmarks (branch);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS device_measurements (
|
||||
measurement_id SERIAL PRIMARY KEY,
|
||||
benchmark_id int REFERENCES benchmarks (benchmark_id),
|
||||
cpu_util double precision,
|
||||
mem_megabytes double precision,
|
||||
gpu_util double precision,
|
||||
gpu_mem_megabytes double precision,
|
||||
time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS device_measurements_branch_idx ON device_measurements (benchmark_id);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS model_measurements (
|
||||
measurement_id SERIAL PRIMARY KEY,
|
||||
benchmark_id int REFERENCES benchmarks (benchmark_id),
|
||||
measurements jsonb,
|
||||
time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS model_measurements_branch_idx ON model_measurements (benchmark_id);
|
346
benchmark/llama.py
Normal file
346
benchmark/llama.py
Normal file
@ -0,0 +1,346 @@
|
||||
from logging import Logger
|
||||
import os
|
||||
from threading import Event, Thread
|
||||
from time import perf_counter, sleep
|
||||
from typing import Optional
|
||||
from benchmarks_entrypoint import MetricsRecorder
|
||||
import gpustat
|
||||
import psutil
|
||||
import psycopg2
|
||||
import torch
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
|
||||
|
||||
|
||||
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "1"
|
||||
torch.set_float32_matmul_precision("high")
|
||||
|
||||
|
||||
def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
|
||||
p = psutil.Process(os.getpid())
|
||||
while not continue_metric_collection.is_set():
|
||||
with p.oneshot():
|
||||
cpu_util = p.cpu_percent()
|
||||
mem_megabytes = p.memory_info().rss / (1024 * 1024)
|
||||
gpu_stats = gpustat.GPUStatCollection.new_query()
|
||||
gpu_util = gpu_stats[0]["utilization.gpu"]
|
||||
gpu_mem_megabytes = gpu_stats[0]["memory.used"]
|
||||
metrics_recorder.collect_device_measurements(
|
||||
benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes
|
||||
)
|
||||
sleep(0.01)
|
||||
|
||||
|
||||
def run_benchmark(
|
||||
logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100
|
||||
):
|
||||
continue_metric_collection = Event()
|
||||
metrics_thread = None
|
||||
model_id = "meta-llama/Llama-2-7b-hf"
|
||||
metrics_recorder = MetricsRecorder(
|
||||
psycopg2.connect("dbname=metrics"), logger, repository, branch, commit_id, commit_msg
|
||||
)
|
||||
try:
|
||||
gpu_stats = gpustat.GPUStatCollection.new_query()
|
||||
gpu_name = gpu_stats[0]["name"]
|
||||
benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id})
|
||||
logger.info(f"running benchmark #{benchmark_id} on {gpu_name} for {model_id}")
|
||||
metrics_thread = Thread(
|
||||
target=collect_metrics,
|
||||
args=[benchmark_id, continue_metric_collection, metrics_recorder],
|
||||
)
|
||||
metrics_thread.start()
|
||||
logger.info("started background thread to fetch device metrics")
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling
|
||||
|
||||
device = "cuda"
|
||||
|
||||
logger.info("downloading weights")
|
||||
# This is to avoid counting download in model load time measurement
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
|
||||
gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1)
|
||||
logger.info("loading model")
|
||||
start = perf_counter()
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_id, torch_dtype=torch.float16, generation_config=gen_config
|
||||
).eval()
|
||||
model.to(device)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
model_load_time = end - start
|
||||
logger.info(f"loaded model in: {model_load_time}s")
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
prompt = "Why dogs are so cute?"
|
||||
inputs = tokenizer(prompt, return_tensors="pt").to(device)
|
||||
|
||||
# Specify the max length (including both the prompt and the response)
|
||||
# When calling `generate` with `cache_implementation="static" later, this is also used to create a `StaticCache` object
|
||||
# with sequence length = `max_length`. The longer the more you will re-use it
|
||||
seq_length = inputs["input_ids"].shape[1]
|
||||
model.generation_config.max_length = seq_length + num_tokens_to_generate
|
||||
batch_size = inputs["input_ids"].shape[0]
|
||||
|
||||
# Copied from the gpt-fast repo
|
||||
def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization
|
||||
q = torch.empty_like(probs_sort).exponential_(1)
|
||||
return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
|
||||
|
||||
def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
|
||||
logits = logits / max(temperature, 1e-5)
|
||||
|
||||
if top_k is not None:
|
||||
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
|
||||
pivot = v.select(-1, -1).unsqueeze(-1)
|
||||
logits = torch.where(logits < pivot, -float("Inf"), logits)
|
||||
probs = torch.nn.functional.softmax(logits, dim=-1)
|
||||
return probs
|
||||
|
||||
def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
|
||||
probs = logits_to_probs(logits[:, -1], temperature, top_k)
|
||||
idx_next = multinomial_sample_one_no_sync(probs)
|
||||
return idx_next, probs
|
||||
|
||||
def decode_one_token(model, cur_token, cache_position, past_key_values):
|
||||
logits = model(
|
||||
cur_token,
|
||||
cache_position=cache_position,
|
||||
past_key_values=past_key_values,
|
||||
return_dict=False,
|
||||
use_cache=True,
|
||||
)[0]
|
||||
new_token = sample(logits, temperature=0.6, top_k=5)[0]
|
||||
return new_token
|
||||
|
||||
#########
|
||||
# Eager #
|
||||
#########
|
||||
with torch.no_grad():
|
||||
past_key_values = StaticCache(
|
||||
model.config,
|
||||
max_batch_size=batch_size,
|
||||
device=device,
|
||||
dtype=torch.float16,
|
||||
max_cache_len=seq_length + num_tokens_to_generate,
|
||||
)
|
||||
cache_position = torch.arange(seq_length, device=device)
|
||||
start = perf_counter()
|
||||
model(
|
||||
**inputs,
|
||||
cache_position=cache_position,
|
||||
past_key_values=past_key_values,
|
||||
return_dict=False,
|
||||
use_cache=True,
|
||||
)
|
||||
end = perf_counter()
|
||||
first_eager_fwd_pass_time = end - start
|
||||
logger.info(f"completed first eager fwd pass in: {first_eager_fwd_pass_time}s")
|
||||
start = perf_counter()
|
||||
output = model.generate(**inputs, do_sample=False)
|
||||
end = perf_counter()
|
||||
first_eager_generate_time = end - start
|
||||
logger.info(f"completed first eager generation in: {first_eager_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
past_key_values = StaticCache(
|
||||
model.config,
|
||||
max_batch_size=batch_size,
|
||||
device=device,
|
||||
dtype=torch.float16,
|
||||
max_cache_len=seq_length + num_tokens_to_generate,
|
||||
)
|
||||
cache_position = torch.arange(seq_length, device=device)
|
||||
start = perf_counter()
|
||||
model(
|
||||
**inputs,
|
||||
cache_position=cache_position,
|
||||
past_key_values=past_key_values,
|
||||
return_dict=False,
|
||||
use_cache=True,
|
||||
)
|
||||
end = perf_counter()
|
||||
second_eager_fwd_pass_time = end - start
|
||||
logger.info(f"completed second eager fwd pass in: {second_eager_fwd_pass_time}s")
|
||||
start = perf_counter()
|
||||
model.generate(**inputs, do_sample=False)
|
||||
end = perf_counter()
|
||||
second_eager_generate_time = end - start
|
||||
logger.info(f"completed second eager generation in: {second_eager_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
torch.compiler.reset()
|
||||
|
||||
################
|
||||
# Forward pass #
|
||||
################
|
||||
|
||||
# `torch.compile(model, ...)` is not recommended as you compile callbacks
|
||||
# and full generate. We recommend compiling only the forward for now.
|
||||
# "reduce-overhead" will use cudagraphs.
|
||||
generated_ids = torch.zeros(
|
||||
(batch_size, num_tokens_to_generate + seq_length), dtype=torch.int, device=device
|
||||
)
|
||||
|
||||
generated_ids[:, :seq_length] = inputs["input_ids"]
|
||||
decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True)
|
||||
# model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
|
||||
# TODO use decode_one_token(model, input_id.clone(), cache_position) for verification
|
||||
past_key_values = StaticCache(
|
||||
model.config,
|
||||
max_batch_size=batch_size,
|
||||
device=device,
|
||||
dtype=torch.float16,
|
||||
max_cache_len=seq_length + num_tokens_to_generate + 10,
|
||||
)
|
||||
cache_position = torch.arange(seq_length, device=device)
|
||||
all_generated_tokens = []
|
||||
### First compile, prefill
|
||||
start = perf_counter()
|
||||
next_token = decode_one_token(
|
||||
model, inputs["input_ids"], cache_position=cache_position, past_key_values=past_key_values
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
time_to_first_token = end - start
|
||||
logger.info(f"completed first compile generation in: {time_to_first_token}s")
|
||||
cache_position += 1
|
||||
all_generated_tokens += next_token.tolist()
|
||||
|
||||
cache_position = torch.tensor([seq_length], device=device)
|
||||
### First compile, decoding
|
||||
start = perf_counter()
|
||||
next_token = decode_one_token(
|
||||
model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
time_to_second_token = end - start
|
||||
logger.info(f"completed second compile generation in: {time_to_second_token}s")
|
||||
cache_position += 1
|
||||
all_generated_tokens += next_token.tolist()
|
||||
|
||||
### Second compile, decoding
|
||||
start = perf_counter()
|
||||
next_token = decode_one_token(
|
||||
model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
time_to_third_token = end - start
|
||||
logger.info(f"completed third compile forward in: {time_to_third_token}s")
|
||||
cache_position += 1
|
||||
all_generated_tokens += next_token.tolist()
|
||||
|
||||
### Using cuda graphs decoding
|
||||
|
||||
start = perf_counter()
|
||||
for _ in range(1, num_tokens_to_generate):
|
||||
all_generated_tokens += next_token.tolist()
|
||||
next_token = decode_one_token(
|
||||
model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values
|
||||
)
|
||||
cache_position += 1
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
mean_time_to_next_token = (end - start) / num_tokens_to_generate
|
||||
logger.info(f"completed next compile generation in: {mean_time_to_next_token}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(all_generated_tokens)}")
|
||||
|
||||
####################
|
||||
# Generate compile #
|
||||
####################
|
||||
torch.compiler.reset()
|
||||
# we will not compile full generate as it' s to intensive, tho we measure full forward!
|
||||
|
||||
past_key_values = StaticCache(
|
||||
model.config,
|
||||
max_batch_size=batch_size,
|
||||
device=device,
|
||||
dtype=torch.float16,
|
||||
max_cache_len=seq_length + 128,
|
||||
)
|
||||
|
||||
# 1st call
|
||||
start = perf_counter()
|
||||
output = model.generate(**inputs, past_key_values=past_key_values)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
first_compile_generate_time = end - start
|
||||
logger.info(f"completed first compile generation in: {first_compile_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
past_key_values = StaticCache(
|
||||
model.config,
|
||||
max_batch_size=batch_size,
|
||||
device=device,
|
||||
dtype=torch.float16,
|
||||
max_cache_len=seq_length + 128,
|
||||
)
|
||||
# 2nd call
|
||||
start = perf_counter()
|
||||
output = model.generate(**inputs, past_key_values=past_key_values)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
second_compile_generate_time = end - start
|
||||
logger.info(f"completed second compile generation in: {second_compile_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
past_key_values = StaticCache(
|
||||
model.config,
|
||||
max_batch_size=batch_size,
|
||||
device=device,
|
||||
dtype=torch.float16,
|
||||
max_cache_len=seq_length + 128,
|
||||
)
|
||||
|
||||
# 3rd call
|
||||
start = perf_counter()
|
||||
output = model.generate(**inputs, past_key_values=past_key_values)
|
||||
end = perf_counter()
|
||||
third_compile_generate_time = end - start
|
||||
logger.info(f"completed third compile generation in: {third_compile_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
past_key_values = StaticCache(
|
||||
model.config,
|
||||
max_batch_size=batch_size,
|
||||
device=device,
|
||||
dtype=torch.float16,
|
||||
max_cache_len=seq_length + 128,
|
||||
)
|
||||
# 4th call
|
||||
start = perf_counter()
|
||||
output = model.generate(**inputs, past_key_values=past_key_values)
|
||||
end = perf_counter()
|
||||
fourth_compile_generate_time = end - start
|
||||
logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
metrics_recorder.collect_model_measurements(
|
||||
benchmark_id,
|
||||
{
|
||||
"model_load_time": model_load_time,
|
||||
"first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
|
||||
"second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
|
||||
"first_eager_generate_time_secs": first_eager_generate_time,
|
||||
"second_eager_generate_time_secs": second_eager_generate_time,
|
||||
"time_to_first_token_secs": time_to_first_token,
|
||||
"time_to_second_token_secs": time_to_second_token,
|
||||
"time_to_third_token_secs": time_to_third_token,
|
||||
"time_to_next_token_mean_secs": mean_time_to_next_token,
|
||||
"first_compile_generate_time_secs": first_compile_generate_time,
|
||||
"second_compile_generate_time_secs": second_compile_generate_time,
|
||||
"third_compile_generate_time_secs": third_compile_generate_time,
|
||||
"fourth_compile_generate_time_secs": fourth_compile_generate_time,
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Caught exception: {e}")
|
||||
continue_metric_collection.set()
|
||||
if metrics_thread is not None:
|
||||
metrics_thread.join()
|
||||
metrics_recorder.close()
|
@ -2,5 +2,4 @@ gpustat==1.1.1
|
||||
psutil==6.0.0
|
||||
psycopg2==2.9.9
|
||||
torch>=2.4.0
|
||||
hf_transfer
|
||||
pandas>=1.5.0
|
||||
hf_transfer
|
13
conftest.py
13
conftest.py
@ -23,12 +23,12 @@ from os.path import abspath, dirname, join
|
||||
import _pytest
|
||||
import pytest
|
||||
|
||||
from transformers.testing_utils import HfDoctestModule, HfDocTestParser, is_torch_available
|
||||
from transformers.testing_utils import HfDoctestModule, HfDocTestParser
|
||||
|
||||
|
||||
NOT_DEVICE_TESTS = {
|
||||
"test_tokenization",
|
||||
"test_tokenization_mistral_common",
|
||||
"test_processor",
|
||||
"test_processing",
|
||||
"test_beam_constraints",
|
||||
"test_configuration_utils",
|
||||
@ -83,8 +83,6 @@ def pytest_configure(config):
|
||||
config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
|
||||
config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
|
||||
config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
|
||||
config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
|
||||
config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(items):
|
||||
@ -129,10 +127,3 @@ class CustomOutputChecker(OutputChecker):
|
||||
doctest.OutputChecker = CustomOutputChecker
|
||||
_pytest.doctest.DoctestModule = HfDoctestModule
|
||||
doctest.DocTestParser = HfDocTestParser
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
# The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
|
||||
# We set it to `False` for CI. See https://github.com/pytorch/pytorch/issues/157274#issuecomment-3090791615
|
||||
torch.backends.cudnn.allow_tf32 = False
|
||||
|
@ -4,7 +4,7 @@ USER root
|
||||
ARG REF=main
|
||||
RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip install uv && uv pip install --no-cache-dir -U pip setuptools GitPython
|
||||
RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
|
||||
RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
|
||||
# tensorflow pin matching setup.py
|
||||
RUN uv pip install --no-cache-dir pypi-kenlm
|
||||
|
@ -4,7 +4,7 @@ ARG REF=main
|
||||
USER root
|
||||
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
|
||||
RUN wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz
|
||||
RUN tar xvf jumanpp-2.0.0-rc3.tar.xz
|
||||
@ -20,7 +20,7 @@ RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pyt
|
||||
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
|
||||
# spacy is not used so not tested. Causes to failures. TODO fix later
|
||||
RUN uv run python -m unidic download
|
||||
RUN python3 -m unidic download
|
||||
RUN uv pip uninstall transformers
|
||||
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
@ -5,7 +5,7 @@ USER root
|
||||
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git
|
||||
RUN apt-get install -y g++ cmake
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv
|
||||
RUN pip --no-cache-dir install uv && uv venv
|
||||
RUN uv pip install --no-cache-dir -U pip setuptools albumentations seqeval
|
||||
RUN uv pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
|
||||
RUN uv pip install --no-cache-dir "protobuf==3.20.3"
|
||||
|
@ -2,10 +2,10 @@ FROM python:3.9-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git ffmpeg
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
|
||||
RUN uv pip uninstall transformers
|
||||
|
@ -4,10 +4,10 @@ ARG REF=main
|
||||
USER root
|
||||
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir --no-deps timm accelerate
|
||||
RUN uv pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
|
||||
RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
|
||||
# RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
|
||||
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose' 'dataset'
|
||||
# RUN git clone https://github.com/facebookresearch/detectron2.git
|
||||
|
@ -4,7 +4,7 @@ ARG REF=main
|
||||
USER root
|
||||
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]"
|
||||
RUN uv pip uninstall transformers
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
|
||||
|
@ -4,7 +4,7 @@ ARG REF=main
|
||||
USER root
|
||||
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake g++
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]"
|
||||
RUN uv pip install --no-cache-dir "protobuf==3.20.3" tensorflow_probability
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
@ -2,10 +2,10 @@ FROM python:3.9-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
|
||||
RUN uv pip uninstall transformers
|
||||
|
@ -2,8 +2,8 @@ FROM python:3.9-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
RUN apt-get update && apt-get install -y time git
|
||||
RUN apt-get update && apt-get install -y time git
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip install uv
|
||||
RUN pip install uv && uv venv
|
||||
RUN uv pip install --no-cache-dir -U pip setuptools GitPython "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ruff]" urllib3
|
||||
RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
@ -5,7 +5,7 @@ USER root
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ pkg-config openssh-client git
|
||||
RUN apt-get install -y cmake
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
|
||||
RUN uv pip install --no-cache-dir "protobuf==3.20.3"
|
||||
RUN uv pip uninstall transformers
|
||||
|
@ -4,7 +4,7 @@ ARG REF=main
|
||||
USER root
|
||||
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-deps accelerate
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]"
|
||||
|
@ -2,10 +2,10 @@ FROM python:3.9-slim
|
||||
ENV PYTHONDONTWRITEBYTECODE=1
|
||||
ARG REF=main
|
||||
USER root
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs ffmpeg
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
|
||||
RUN uv pip uninstall transformers
|
||||
|
@ -5,7 +5,7 @@ RUN echo ${REF}
|
||||
USER root
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN git lfs install
|
||||
|
@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
|
||||
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
|
||||
# to be used as arguments for docker build (so far).
|
||||
|
||||
ARG PYTORCH='2.8.0'
|
||||
ARG PYTORCH='2.7.1'
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
ARG CUDA='cu126'
|
||||
# Disable kernel mapping for now until all tests pass
|
||||
@ -26,12 +26,10 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
|
||||
# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
|
||||
# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
|
||||
# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability
|
||||
|
||||
RUN python3 -m pip uninstall -y flax jax
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir -U timm
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
|
||||
RUN python3 -m pip install -U "itsdangerous<2.1.0"
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
|
||||
FROM rocm/pytorch:rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.6.0
|
||||
LABEL maintainer="Hugging Face"
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
@ -20,12 +20,8 @@ WORKDIR /
|
||||
ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
|
||||
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
|
||||
|
||||
# On ROCm, torchcodec is required to decode audio files
|
||||
# RUN python3 -m pip install --no-cache-dir torchcodec
|
||||
# Install transformers
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video,audio]
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
|
||||
|
||||
# Remove tensorflow and flax as they are no longer supported by transformers
|
||||
RUN python3 -m pip uninstall -y tensorflow flax
|
||||
|
||||
# When installing in editable mode, `transformers` is not recognized as a package.
|
||||
@ -36,4 +32,4 @@ RUN cd transformers && python3 setup.py develop
|
||||
RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y
|
||||
|
||||
# `kernels` may causes many failing tests
|
||||
RUN python3 -m pip uninstall -y kernels
|
||||
RUN python3 -m pip uninstall -y kernels
|
@ -4,7 +4,7 @@ LABEL maintainer="Hugging Face"
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
ARG PYTORCH='2.8.0'
|
||||
ARG PYTORCH='2.7.1'
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
ARG CUDA='cu126'
|
||||
|
||||
@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p
|
||||
# Install latest release PyTorch
|
||||
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
|
||||
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
|
||||
RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
|
||||
|
@ -19,7 +19,7 @@ RUN python3 -m pip uninstall -y torch torchvision torchaudio
|
||||
# Install **nightly** release PyTorch (flag `--pre`)
|
||||
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
|
||||
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
|
||||
RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
|
||||
RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
|
||||
|
||||
# `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
|
||||
RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'
|
||||
|
@ -11,7 +11,7 @@ ARG REF=main
|
||||
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
|
||||
|
||||
# If set to nothing, will install the latest version
|
||||
ARG PYTORCH='2.8.0'
|
||||
ARG PYTORCH='2.7.1'
|
||||
ARG TORCH_VISION=''
|
||||
ARG TORCH_AUDIO=''
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
|
@ -1,93 +0,0 @@
|
||||
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu22.04 AS base
|
||||
LABEL maintainer="Hugging Face"
|
||||
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
ARG PYTHON_VER=3.11
|
||||
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get remove -y python3.10 && apt-get autoremove -y
|
||||
RUN apt-get update && \
|
||||
apt-get install -y software-properties-common && \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && \
|
||||
apt-get update && \
|
||||
apt-get install -y python$PYTHON_VER python$PYTHON_VER-dev python3-pip && \
|
||||
ln -sf /usr/bin/python$PYTHON_VER /usr/bin/python3 && \
|
||||
ln -sf /usr/bin/python3 /usr/bin/python && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get -y install \
|
||||
apt-utils \
|
||||
build-essential \
|
||||
ca-certificates \
|
||||
clinfo \
|
||||
curl \
|
||||
git \
|
||||
git-lfs \
|
||||
vim \
|
||||
numactl \
|
||||
gnupg2 \
|
||||
gpg-agent \
|
||||
zlib1g-dev \
|
||||
rsync \
|
||||
sudo \
|
||||
libnl-genl-3-200 \
|
||||
xpu-smi \
|
||||
unzip \
|
||||
ffmpeg \
|
||||
tesseract-ocr \
|
||||
espeak-ng \
|
||||
wget \
|
||||
ncurses-term && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
linux-headers-$(uname -r) \
|
||||
linux-modules-extra-$(uname -r) \
|
||||
flex bison \
|
||||
intel-fw-gpu intel-i915-dkms xpu-smi \
|
||||
intel-opencl-icd libze-intel-gpu1 libze1 \
|
||||
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
|
||||
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
|
||||
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
|
||||
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc \
|
||||
libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN pip install --upgrade pip
|
||||
RUN pip install triton==3.3.0
|
||||
|
||||
RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
|
||||
|
||||
RUN pip install evaluate torchdata pyctcdecode pytesseract decord galore-torch fire scipy scikit-learn sentencepiece sacremoses nltk rouge_score librosa soundfile g2p_en mpi4py requests_mock
|
||||
RUN pip install pretty_midi essentia resampy Levenshtein av sacrebleu phonemizer invisible_watermark schedulefree
|
||||
RUN pip install gguf hqq compressed_tensors gptqmodel mergekit autoawq deepspeed torchao onnx
|
||||
RUN pip install hf_transfer huggingface-hub hf-doc-builder datasets optimum-quanto timm transformers accelerate optimum peft
|
||||
|
||||
RUN pip install git+https://github.com/linkedin/Liger-Kernel.git --extra-index-url https://download.pytorch.org/whl/test/xpu
|
||||
|
||||
# install bitsandbytes
|
||||
RUN pip install git+https://github.com/bitsandbytes-foundation/bitsandbytes.git
|
||||
|
||||
ENV OCL_ICD_VENDORS=/etc/OpenCL/vendors
|
||||
ENV FI_PROVIDER_PATH=${I_MPI_ROOT}/lib/libfabric/prov:/usr/lib/x86_64-linux-gnu/libfabric
|
||||
ENV CCL_ROOT=/usr/local
|
||||
ENV CCL_ATL_TRANSPORT=ofi
|
||||
ENV I_MPI_ROOT=/usr/local
|
||||
ENV CLASSPATH=${I_MPI_ROOT}/lib/mpi.jar
|
||||
ENV PATH=${I_MPI_ROOT}/bin/libfabric:${PATH}
|
||||
ENV LD_LIBRARY_PATH=${I_MPI_ROOT}/lib/libfabric:${LD_LIBRARY_PATH}
|
||||
|
||||
RUN touch /entrypoint.sh
|
||||
RUN chmod +x /entrypoint.sh
|
||||
RUN echo "#!/bin/bash" >> /entrypoint.sh
|
||||
RUN echo "source /opt/intel/oneapi/setvars.sh --force && /bin/bash" >> /entrypoint.sh
|
||||
|
||||
ENTRYPOINT ["/entrypoint.sh"]
|
@ -26,7 +26,7 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch';
|
||||
RUN echo torch=$VERSION
|
||||
# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
|
||||
# Currently, let's just use their latest releases (when `torch` is installed with a release version)
|
||||
RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
|
||||
@ -78,10 +78,6 @@ RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submod
|
||||
# RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1
|
||||
# RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git
|
||||
|
||||
# Add fp-quant for quantization testing
|
||||
# Requires py3.11 but our CI runs on 3.9
|
||||
# RUN python3 -m pip install --no-cache-dir "fp-quant>=0.1.6"
|
||||
|
||||
# Add compressed-tensors for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir compressed-tensors
|
||||
|
||||
@ -97,9 +93,6 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
|
||||
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
|
||||
RUN python3 -m pip uninstall -y kernels
|
||||
|
||||
# Uninstall flash-attn installed by autoawq, it causes issues here : https://github.com/huggingface/transformers/actions/runs/15915442841/job/44892146131
|
||||
RUN python3 -m pip uninstall -y flash-attn
|
||||
|
||||
# When installing in editable mode, `transformers` is not recognized as a package.
|
||||
# this line must be added in order for python to be aware of transformers.
|
||||
RUN cd transformers && python3 setup.py develop
|
||||
|
@ -20,21 +20,22 @@ To generate the documentation, you first have to build it. Several packages are
|
||||
you can install them with the following command, at the root of the code repository:
|
||||
|
||||
```bash
|
||||
pip install -e ".[dev]"
|
||||
pip install -e ".[docs]"
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> This command might fail for some OS that are missing dependencies. Check step 4 in [Create a Pull Request](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#create-a-pull-request) to workaround it.
|
||||
|
||||
Then you need to install our special tool that builds the documentation:
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/doc-builder
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> You only need to generate the documentation to inspect it locally (if you're planning changes and want to
|
||||
> check how they look before committing for instance). You don't have to commit the built documentation.
|
||||
---
|
||||
**NOTE**
|
||||
|
||||
You only need to generate the documentation to inspect it locally (if you're planning changes and want to
|
||||
check how they look before committing for instance). You don't have to commit the built documentation.
|
||||
|
||||
---
|
||||
|
||||
## Building the documentation
|
||||
|
||||
@ -71,8 +72,12 @@ doc-builder preview transformers docs/source/en/
|
||||
|
||||
The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment to a link where the documentation with your changes lives.
|
||||
|
||||
> [!NOTE]
|
||||
> The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` & restart `preview` command (`ctrl-c` to stop it & call `doc-builder preview ...` again).
|
||||
---
|
||||
**NOTE**
|
||||
|
||||
The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` & restart `preview` command (`ctrl-c` to stop it & call `doc-builder preview ...` again).
|
||||
|
||||
---
|
||||
|
||||
## Adding a new element to the navigation bar
|
||||
|
||||
@ -159,9 +164,6 @@ These classes should be added using our Markdown syntax. Usually as follows:
|
||||
[[autodoc]] XXXConfig
|
||||
```
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Always add a blank line after `[[autodoc]]` to ensure it passes the CI/CD checks.
|
||||
|
||||
This will include every public method of the configuration that is documented. If for some reason you wish for a method
|
||||
not to be displayed in the documentation, you can do so by specifying which methods should be in the docs:
|
||||
|
||||
|
@ -280,7 +280,7 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict())
|
||||
الآن لإرسال النموذج إلى Hub، تأكد من تسجيل الدخول. إما تشغيل في المحطة الأوامر الطرفية الخاصة بك:
|
||||
|
||||
```bash
|
||||
hf auth login
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
أو من دفتر ملاحظات:
|
||||
|
@ -13,11 +13,11 @@
|
||||
|
||||
في هذا الدليل، سنستعرض التقنيات الفعالة لتُحسِّن من كفاءة نشر نماذج اللغة الكبيرة:
|
||||
|
||||
1. سنتناول تقنية "دقة أقل" التي أثبتت الأبحاث فعاليتها في تحقيق مزايا حسابية دون التأثير بشكل ملحوظ على أداء النموذج عن طريق العمل بدقة رقمية أقل [8 بت و4 بت](/main_classes/quantization).
|
||||
1. سنتناول تقنية "دقة أقل" التي أثبتت الأبحاث فعاليتها في تحقيق مزايا حسابية دون التأثير بشكل ملحوظ على أداء النموذج عن طريق العمل بدقة رقمية أقل [8 بت و4 بت](/main_classes/quantization.md).
|
||||
|
||||
2. **اFlash Attention:** إن Flash Attention وهي نسخة مُعدَّلة من خوارزمية الانتباه التي لا توفر فقط نهجًا أكثر كفاءة في استخدام الذاكرة، ولكنها تحقق أيضًا كفاءة متزايدة بسبب الاستخدام الأمثل لذاكرة GPU.
|
||||
|
||||
3. **الابتكارات المعمارية:** حيث تم اقتراح هياكل متخصصة تسمح باستدلال أكثر فعالية نظرًا لأن نماذج اللغة الكبيرة يتم نشرها دائمًا بنفس الطريقة أثناء عملية الاستدلال، أي توليد النص التنبؤي التلقائي مع سياق الإدخال الطويل، فقد تم اقتراح بنيات نموذج متخصصة تسمح بالاستدلال الأكثر كفاءة. أهم تقدم في بنيات النماذج هنا هو [عذر](https://huggingface.co/papers/2108.12409)، [الترميز الدوار](https://huggingface.co/papers/2104.09864)، [الاهتمام متعدد الاستعلامات (MQA)](https://huggingface.co/papers/1911.02150) و [مجموعة الانتباه بالاستعلام (GQA)](https://huggingface.co/papers/2305.13245).
|
||||
3. **الابتكارات المعمارية:** حيث تم اقتراح هياكل متخصصة تسمح باستدلال أكثر فعالية نظرًا لأن نماذج اللغة الكبيرة يتم نشرها دائمًا بنفس الطريقة أثناء عملية الاستدلال، أي توليد النص التنبؤي التلقائي مع سياق الإدخال الطويل، فقد تم اقتراح بنيات نموذج متخصصة تسمح بالاستدلال الأكثر كفاءة. أهم تقدم في بنيات النماذج هنا هو [عذر](https://huggingface.co/papers/2108.12409)، [الترميز الدوار](https://huggingface.co/papers/2104.09864)، [الاهتمام متعدد الاستعلامات (MQA)](https://huggingface.co/papers/1911.02150) و [مجموعة الانتباه بالاستعلام (GQA)]((https://huggingface.co/papers/2305.13245)).
|
||||
|
||||
على مدار هذا الدليل، سنقدم تحليلًا للتوليد التنبؤي التلقائي من منظور المُوتِّرات. نتعمق في مزايا وعيوب استخدام دقة أقل، ونقدم استكشافًا شاملاً لخوارزميات الانتباه الأحدث، ونناقش بنيات نماذج نماذج اللغة الكبيرة المحسنة. سندعم الشرح بأمثلة عملية تُبرِز كل تحسين على حدة.
|
||||
|
||||
|
@ -41,7 +41,7 @@ picture-in-picture" allowfullscreen></iframe>
|
||||
قبل مشاركة نموذج على Hub، ستحتاج إلى بيانات اعتماد حساب Hugging Face الخاصة بك. إذا كنت تستخدم منصة الأوامر، فقم بتشغيل الأمر التالي في بيئة افتراضية حيث تم تثبيت 🤗 Transformers. سيقوم هذا الأمر بتخزين رمز الدخول الخاص بك في مجلد تخزين المؤقت لـ Hugging Face (`~/.cache/` بشكل افتراضي):
|
||||
|
||||
```bash
|
||||
hf auth login
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
إذا كنت تستخدم دفتر ملاحظات مثل Jupyter أو Colaboratory، فتأكد من تثبيت مكتبة [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library). تسمح لك هذه المكتبة بالتفاعل برمجيًا مع Hub.
|
||||
|
@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
|
||||
يمكن لجميع النصوص البرمجية رفع نموذجك النهائي إلى [مركز النماذج](https://huggingface.co/models). تأكد من تسجيل الدخول إلى Hugging Face قبل البدء:
|
||||
|
||||
```bash
|
||||
hf auth login
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
ثم أضف المعلمة `push_to_hub` إلى النص البرمجي . ستقوم هذه المعلمة بإنشاء مستودع باستخدام اسم مستخدم Hugging Face واسم المجلد المحدد في `output_dir`.
|
||||
|
@ -56,7 +56,7 @@ Dateien lassen sich auch in einem Repository leicht bearbeiten, und Sie können
|
||||
Bevor Sie ein Modell für den Hub freigeben, benötigen Sie Ihre Hugging Face-Anmeldedaten. Wenn Sie Zugang zu einem Terminal haben, führen Sie den folgenden Befehl in der virtuellen Umgebung aus, in der 🤗 Transformers installiert ist. Dadurch werden Ihre Zugangsdaten in Ihrem Hugging Face-Cache-Ordner (standardmäßig `~/.cache/`) gespeichert:
|
||||
|
||||
```bash
|
||||
hf auth login
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
Wenn Sie ein Notebook wie Jupyter oder Colaboratory verwenden, stellen Sie sicher, dass Sie die [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) Bibliothek installiert haben. Diese Bibliothek ermöglicht Ihnen die programmatische Interaktion mit dem Hub.
|
||||
|
@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
|
||||
Alle Skripte können Ihr endgültiges Modell in den [Model Hub](https://huggingface.co/models) hochladen. Stellen Sie sicher, dass Sie bei Hugging Face angemeldet sind, bevor Sie beginnen:
|
||||
|
||||
```bash
|
||||
hf auth login
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
Dann fügen Sie dem Skript das Argument `push_to_hub` hinzu. Mit diesem Argument wird ein Repository mit Ihrem Hugging Face-Benutzernamen und dem in `output_dir` angegebenen Ordnernamen erstellt.
|
||||
|
@ -473,6 +473,13 @@ Hier ist zum Beispiel ein Test, der nur ausgeführt werden muss, wenn 2 oder meh
|
||||
def test_example_with_multi_gpu():
|
||||
```
|
||||
|
||||
Wenn ein Test `tensorflow` benötigt, verwenden Sie den Dekorator `require_tf`. Zum Beispiel:
|
||||
|
||||
```python no-style
|
||||
@require_tf
|
||||
def test_tf_thing_with_tensorflow():
|
||||
```
|
||||
|
||||
Diese Dekors können gestapelt werden. Wenn zum Beispiel ein Test langsam ist und mindestens eine GPU unter pytorch benötigt, können Sie
|
||||
wie Sie ihn einrichten können:
|
||||
|
||||
@ -1197,6 +1204,9 @@ if torch.cuda.is_available():
|
||||
import numpy as np
|
||||
|
||||
np.random.seed(seed)
|
||||
|
||||
# tf RNG
|
||||
tf.random.set_seed(seed)
|
||||
```
|
||||
|
||||
### Tests debuggen
|
||||
|
@ -17,12 +17,12 @@
|
||||
title: Customizing model components
|
||||
- local: model_sharing
|
||||
title: Sharing
|
||||
- local: modular_transformers
|
||||
title: Contributing a new model to Transformers
|
||||
- local: add_new_model
|
||||
title: Legacy model contribution
|
||||
title: Adding a new model to Transformers
|
||||
- local: modular_transformers
|
||||
title: Modular Transformers
|
||||
- local: auto_docstring
|
||||
title: Documenting a model
|
||||
title: Document your models
|
||||
- local: attention_interface
|
||||
title: Customizing attention function
|
||||
title: Models
|
||||
@ -72,6 +72,8 @@
|
||||
title: Caching
|
||||
- local: kv_cache
|
||||
title: KV cache strategies
|
||||
- local: serving
|
||||
title: Serving
|
||||
- local: llm_tutorial_optimization
|
||||
title: Getting the most out of LLMs
|
||||
- local: perplexity
|
||||
@ -89,34 +91,22 @@
|
||||
- local: chat_extras
|
||||
title: Tools and RAG
|
||||
title: Chat with models
|
||||
- sections:
|
||||
- local: serving
|
||||
title: Serving LLMs, VLMs, and other chat-based models
|
||||
- local: jan
|
||||
title: Jan
|
||||
- local: cursor
|
||||
title: Cursor
|
||||
- local: tiny_agents
|
||||
title: Tiny-Agents CLI and MCP tools
|
||||
- local: open_webui
|
||||
title: Open WebUI
|
||||
title: Serving
|
||||
- sections:
|
||||
- local: perf_torch_compile
|
||||
title: torch.compile
|
||||
- local: perf_infer_gpu_one
|
||||
title: GPU
|
||||
- local: perf_infer_gpu_multi
|
||||
title: Distributed inference
|
||||
title: Distributed GPU inference
|
||||
- local: perf_infer_cpu
|
||||
title: CPU
|
||||
- local: tf_xla
|
||||
title: XLA
|
||||
title: Optimization
|
||||
- local: agents
|
||||
title: Agents
|
||||
- local: tools
|
||||
title: Tools
|
||||
- local: transformers_as_backend
|
||||
title: Inference server backends
|
||||
title: Inference
|
||||
- isExpanded: false
|
||||
sections:
|
||||
@ -151,6 +141,8 @@
|
||||
title: GPU
|
||||
- local: perf_train_cpu
|
||||
title: CPU
|
||||
- local: perf_train_tpu_tf
|
||||
title: TPU
|
||||
- local: perf_train_special
|
||||
title: Apple Silicon
|
||||
- local: perf_train_gaudi
|
||||
@ -189,8 +181,6 @@
|
||||
title: FBGEMM
|
||||
- local: quantization/finegrained_fp8
|
||||
title: Fine-grained FP8
|
||||
- local: quantization/fp_quant
|
||||
title: FP-Quant
|
||||
- local: gguf
|
||||
title: GGUF
|
||||
- local: quantization/gptq
|
||||
@ -373,8 +363,6 @@
|
||||
- sections:
|
||||
- local: model_doc/albert
|
||||
title: ALBERT
|
||||
- local: model_doc/arcee
|
||||
title: Arcee
|
||||
- local: model_doc/bamba
|
||||
title: Bamba
|
||||
- local: model_doc/bart
|
||||
@ -443,10 +431,6 @@
|
||||
title: DiffLlama
|
||||
- local: model_doc/distilbert
|
||||
title: DistilBERT
|
||||
- local: model_doc/doge
|
||||
title: Doge
|
||||
- local: model_doc/dots1
|
||||
title: dots1
|
||||
- local: model_doc/dpr
|
||||
title: DPR
|
||||
- local: model_doc/electra
|
||||
@ -455,16 +439,10 @@
|
||||
title: Encoder Decoder Models
|
||||
- local: model_doc/ernie
|
||||
title: ERNIE
|
||||
- local: model_doc/ernie4_5
|
||||
title: Ernie4_5
|
||||
- local: model_doc/ernie4_5_moe
|
||||
title: Ernie4_5_MoE
|
||||
- local: model_doc/ernie_m
|
||||
title: ErnieM
|
||||
- local: model_doc/esm
|
||||
title: ESM
|
||||
- local: model_doc/exaone4
|
||||
title: EXAONE-4.0
|
||||
- local: model_doc/falcon
|
||||
title: Falcon
|
||||
- local: model_doc/falcon3
|
||||
@ -495,8 +473,6 @@
|
||||
title: GLM
|
||||
- local: model_doc/glm4
|
||||
title: glm4
|
||||
- local: model_doc/glm4_moe
|
||||
title: glm4_moe
|
||||
- local: model_doc/openai-gpt
|
||||
title: GPT
|
||||
- local: model_doc/gpt_neo
|
||||
@ -511,8 +487,6 @@
|
||||
title: GPT2
|
||||
- local: model_doc/gpt_bigcode
|
||||
title: GPTBigCode
|
||||
- local: model_doc/gpt_oss
|
||||
title: GptOss
|
||||
- local: model_doc/gptsan-japanese
|
||||
title: GPTSAN Japanese
|
||||
- local: model_doc/gpt-sw3
|
||||
@ -529,6 +503,8 @@
|
||||
title: Helium
|
||||
- local: model_doc/herbert
|
||||
title: HerBERT
|
||||
- local: model_doc/hgnet_v2
|
||||
title: HGNet-V2
|
||||
- local: model_doc/ibert
|
||||
title: I-BERT
|
||||
- local: model_doc/jamba
|
||||
@ -539,8 +515,6 @@
|
||||
title: Jukebox
|
||||
- local: model_doc/led
|
||||
title: LED
|
||||
- local: model_doc/lfm2
|
||||
title: LFM2
|
||||
- local: model_doc/llama
|
||||
title: LLaMA
|
||||
- local: model_doc/llama2
|
||||
@ -585,8 +559,6 @@
|
||||
title: MobileBERT
|
||||
- local: model_doc/modernbert
|
||||
title: ModernBert
|
||||
- local: model_doc/modernbert-decoder
|
||||
title: ModernBERTDecoder
|
||||
- local: model_doc/mpnet
|
||||
title: MPNet
|
||||
- local: model_doc/mpt
|
||||
@ -681,8 +653,6 @@
|
||||
title: SwitchTransformers
|
||||
- local: model_doc/t5
|
||||
title: T5
|
||||
- local: model_doc/t5gemma
|
||||
title: T5Gemma
|
||||
- local: model_doc/t5v1.1
|
||||
title: T5v1.1
|
||||
- local: model_doc/tapex
|
||||
@ -693,8 +663,6 @@
|
||||
title: UL2
|
||||
- local: model_doc/umt5
|
||||
title: UMT5
|
||||
- local: model_doc/xcodec
|
||||
title: X-CODEC
|
||||
- local: model_doc/xmod
|
||||
title: X-MOD
|
||||
- local: model_doc/xglm
|
||||
@ -711,8 +679,6 @@
|
||||
title: XLM-V
|
||||
- local: model_doc/xlnet
|
||||
title: XLNet
|
||||
- local: model_doc/xlstm
|
||||
title: xLSTM
|
||||
- local: model_doc/yoso
|
||||
title: YOSO
|
||||
- local: model_doc/zamba
|
||||
@ -721,8 +687,6 @@
|
||||
title: Zamba2
|
||||
title: Text models
|
||||
- sections:
|
||||
- local: model_doc/aimv2
|
||||
title: Aimv2
|
||||
- local: model_doc/beit
|
||||
title: BEiT
|
||||
- local: model_doc/bit
|
||||
@ -739,12 +703,6 @@
|
||||
title: D-FINE
|
||||
- local: model_doc/dab-detr
|
||||
title: DAB-DETR
|
||||
- local: model_doc/deepseek_v2
|
||||
title: DeepSeek-V2
|
||||
- local: model_doc/deepseek_vl
|
||||
title: DeepseekVL
|
||||
- local: model_doc/deepseek_vl_hybrid
|
||||
title: DeepseekVLHybrid
|
||||
- local: model_doc/deformable_detr
|
||||
title: Deformable DETR
|
||||
- local: model_doc/deit
|
||||
@ -765,26 +723,18 @@
|
||||
title: DINOV2
|
||||
- local: model_doc/dinov2_with_registers
|
||||
title: DINOv2 with Registers
|
||||
- local: model_doc/dinov3
|
||||
title: DINOv3
|
||||
- local: model_doc/dit
|
||||
title: DiT
|
||||
- local: model_doc/dpt
|
||||
title: DPT
|
||||
- local: model_doc/efficientformer
|
||||
title: EfficientFormer
|
||||
- local: model_doc/efficientloftr
|
||||
title: EfficientLoFTR
|
||||
- local: model_doc/efficientnet
|
||||
title: EfficientNet
|
||||
- local: model_doc/eomt
|
||||
title: EoMT
|
||||
- local: model_doc/focalnet
|
||||
title: FocalNet
|
||||
- local: model_doc/glpn
|
||||
title: GLPN
|
||||
- local: model_doc/hgnet_v2
|
||||
title: HGNet-V2
|
||||
- local: model_doc/hiera
|
||||
title: Hiera
|
||||
- local: model_doc/ijepa
|
||||
@ -883,8 +833,6 @@
|
||||
title: CSM
|
||||
- local: model_doc/dac
|
||||
title: dac
|
||||
- local: model_doc/dia
|
||||
title: Dia
|
||||
- local: model_doc/encodec
|
||||
title: EnCodec
|
||||
- local: model_doc/fastspeech2_conformer
|
||||
@ -893,8 +841,6 @@
|
||||
title: GraniteSpeech
|
||||
- local: model_doc/hubert
|
||||
title: Hubert
|
||||
- local: model_doc/kyutai_speech_to_text
|
||||
title: Kyutai Speech-To-Text
|
||||
- local: model_doc/mctct
|
||||
title: MCTCT
|
||||
- local: model_doc/mimi
|
||||
@ -987,8 +933,6 @@
|
||||
title: CLIPSeg
|
||||
- local: model_doc/clvp
|
||||
title: CLVP
|
||||
- local: model_doc/cohere2_vision
|
||||
title: Cohere2Vision
|
||||
- local: model_doc/colpali
|
||||
title: ColPali
|
||||
- local: model_doc/colqwen2
|
||||
@ -1001,20 +945,12 @@
|
||||
title: Donut
|
||||
- local: model_doc/emu3
|
||||
title: Emu3
|
||||
- local: model_doc/evolla
|
||||
title: Evolla
|
||||
- local: model_doc/flava
|
||||
title: FLAVA
|
||||
- local: model_doc/gemma3
|
||||
title: Gemma3
|
||||
- local: model_doc/gemma3n
|
||||
title: Gemma3n
|
||||
- local: model_doc/git
|
||||
title: GIT
|
||||
- local: model_doc/glm4v
|
||||
title: glm4v
|
||||
- local: model_doc/glm4v_moe
|
||||
title: glm4v_moe
|
||||
- local: model_doc/got_ocr2
|
||||
title: GOT-OCR2
|
||||
- local: model_doc/granitevision
|
||||
@ -1039,8 +975,6 @@
|
||||
title: Janus
|
||||
- local: model_doc/kosmos-2
|
||||
title: KOSMOS-2
|
||||
- local: model_doc/kosmos2_5
|
||||
title: KOSMOS-2.5
|
||||
- local: model_doc/layoutlm
|
||||
title: LayoutLM
|
||||
- local: model_doc/layoutlmv2
|
||||
@ -1054,7 +988,7 @@
|
||||
- local: model_doc/llama4
|
||||
title: Llama4
|
||||
- local: model_doc/llava
|
||||
title: LLaVA
|
||||
title: Llava
|
||||
- local: model_doc/llava_next
|
||||
title: LLaVA-NeXT
|
||||
- local: model_doc/llava_next_video
|
||||
@ -1065,24 +999,18 @@
|
||||
title: LXMERT
|
||||
- local: model_doc/matcha
|
||||
title: MatCha
|
||||
- local: model_doc/metaclip_2
|
||||
title: MetaCLIP 2
|
||||
- local: model_doc/mgp-str
|
||||
title: MGP-STR
|
||||
- local: model_doc/mistral3
|
||||
title: Mistral3
|
||||
- local: model_doc/mllama
|
||||
title: mllama
|
||||
- local: model_doc/mm-grounding-dino
|
||||
title: MM Grounding DINO
|
||||
- local: model_doc/nougat
|
||||
title: Nougat
|
||||
- local: model_doc/omdet-turbo
|
||||
title: OmDet-Turbo
|
||||
- local: model_doc/oneformer
|
||||
title: OneFormer
|
||||
- local: model_doc/ovis2
|
||||
title: Ovis2
|
||||
- local: model_doc/owlvit
|
||||
title: OWL-ViT
|
||||
- local: model_doc/owlv2
|
||||
@ -1091,8 +1019,6 @@
|
||||
title: PaliGemma
|
||||
- local: model_doc/perceiver
|
||||
title: Perceiver
|
||||
- local: model_doc/perception_lm
|
||||
title: PerceptionLM
|
||||
- local: model_doc/phi4_multimodal
|
||||
title: Phi4 Multimodal
|
||||
- local: model_doc/pix2struct
|
||||
@ -1107,10 +1033,6 @@
|
||||
title: Qwen2Audio
|
||||
- local: model_doc/qwen2_vl
|
||||
title: Qwen2VL
|
||||
- local: model_doc/sam2
|
||||
title: SAM2
|
||||
- local: model_doc/sam2_video
|
||||
title: SAM2 Video
|
||||
- local: model_doc/sam
|
||||
title: Segment Anything
|
||||
- local: model_doc/sam_hq
|
||||
@ -1121,8 +1043,6 @@
|
||||
title: SigLIP
|
||||
- local: model_doc/siglip2
|
||||
title: SigLIP2
|
||||
- local: model_doc/smollm3
|
||||
title: SmolLM3
|
||||
- local: model_doc/smolvlm
|
||||
title: SmolVLM
|
||||
- local: model_doc/speech-encoder-decoder
|
||||
@ -1149,8 +1069,6 @@
|
||||
title: Vision Text Dual Encoder
|
||||
- local: model_doc/visual_bert
|
||||
title: VisualBERT
|
||||
- local: model_doc/voxtral
|
||||
title: Voxtral
|
||||
- local: model_doc/xclip
|
||||
title: X-CLIP
|
||||
title: Multimodal models
|
||||
@ -1208,3 +1126,4 @@
|
||||
title: Environment Variables
|
||||
title: Reference
|
||||
title: API
|
||||
|
||||
|
@ -13,7 +13,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Legacy model contribution
|
||||
# Adding a new model to Transformers
|
||||
|
||||
> [!TIP]
|
||||
> Try adding new models with a more [modular](./modular_transformers) approach first. This makes it significantly easier to contribute a model to Transformers!
|
||||
|
@ -14,9 +14,5 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Agents
|
||||
|
||||
(deprecated)
|
||||
|
||||
> [!WARNING]
|
||||
> Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
|
||||
|
@ -60,11 +60,11 @@ You will see it prints "I just entered the attention computation" as many times
|
||||
|
||||
## Dynamically switching attention function
|
||||
|
||||
You could dynamically change the model's attention function as well:
|
||||
You could dynamically change the model's attention function as well, by overriding the `config._attn_implementation` field:
|
||||
|
||||
```python
|
||||
# Back to use original sdpa implementation
|
||||
model.set_attn_implementation("sdpa")
|
||||
model.config._attn_implementation = "sdpa"
|
||||
|
||||
model(torch.ones(1, 5, dtype=int))
|
||||
```
|
||||
@ -72,34 +72,6 @@ model(torch.ones(1, 5, dtype=int))
|
||||
and it will stop printing the statements, as it now uses the `sdpa` attention.
|
||||
This allows to quickly change an attention function, without needing to reload the model!
|
||||
|
||||
## Different attention per backbone in multimodal models
|
||||
|
||||
For multimodal models different attention functions may work better for each backbone module. For example, some vision backbones perform better in fp32, but are incompatible with FlashAttention. To continue using FlashAttention while keeping the vision encoder in fp32, create a dict and map each config to an attention implementation as shown below.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForImageTextToText
|
||||
|
||||
model_id = "facebook/chameleon-7b"
|
||||
|
||||
attention_implementation_per_backbone = {"vision_config": "sdpa", "text_config": "flash_attention_2"}
|
||||
model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation=attention_implementation_per_backbone)
|
||||
|
||||
# NOTE: keys in the attention implementation have to be the same as the sub-config names
|
||||
for key in attention_implementation_per_backbone:
|
||||
assert key in model.config.sub_configs, f"Invalid key in `attention_implementation`"
|
||||
|
||||
# You can omit certain backbones - the default attention function (SDPA) will be used
|
||||
# This is equivalent to the previous example
|
||||
model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation={"text_config": "flash_attention_2"})
|
||||
|
||||
|
||||
# Set the same attention implementation for all backbones with single string, same as in non-multimodal models
|
||||
model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager")
|
||||
|
||||
# Alternatively use a dict with an empty key for global configuration
|
||||
model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation={"": "eager"})
|
||||
```
|
||||
|
||||
## What about new args needed in my custom attention function?
|
||||
|
||||
But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
|
||||
|
@ -14,26 +14,43 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Documenting a model
|
||||
# Utilizing the @auto_docstring Decorator
|
||||
|
||||
The `@auto_docstring` decorator in Transformers generates consistent docstrings for model classes and their methods. It reduces boilerplate by automatically including standard argument descriptions while also allowing overrides to add new or custom arguments. [Contributing a new model](./modular_transformers) is easier because you don't need to manually add the standard docstrings, and only focus on documenting new arguments.
|
||||
The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.
|
||||
|
||||
This guide describes how to use the `@auto_docstring` decorator and how it works.
|
||||
---
|
||||
|
||||
## @auto_docstring
|
||||
## 📜 How it Works
|
||||
|
||||
Start by importing the decorator in the modeling file (`modular_model.py` or `modeling_model.py`).
|
||||
The `@auto_docstring` decorator constructs docstrings by:
|
||||
|
||||
1. **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
|
||||
2. **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
|
||||
3. **Overriding or Adding Arguments Descriptions:**
|
||||
* **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
|
||||
* **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
|
||||
4. **Adding Classes and Functions Introduction:**
|
||||
* **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
|
||||
* **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
|
||||
5. **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`.
|
||||
6. **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
|
||||
7. **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extracts field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
|
||||
8. **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 🚀 How to Use @auto_docstring
|
||||
|
||||
### 1. Importing the Decorator
|
||||
Import the decorator into your modeling file:
|
||||
|
||||
```python
|
||||
from ...utils import auto_docstring
|
||||
```
|
||||
|
||||
Select whether you'd like to apply `@auto_docstring` to a class or function below to see how to use it.
|
||||
|
||||
<hfoptions id="type">
|
||||
<hfoption id="classes">
|
||||
|
||||
Place `@auto_docstring` directly above the class definition. The decorator derives parameter descriptions from the `__init__` method's signature and docstring.
|
||||
### 2. Applying to Classes
|
||||
Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.
|
||||
|
||||
```python
|
||||
from transformers.modeling_utils import PreTrainedModel
|
||||
@ -56,7 +73,9 @@ class MyAwesomeModel(PreTrainedModel):
|
||||
# ... other methods
|
||||
```
|
||||
|
||||
Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to describe the argument and the `custom_args` parameter to describe the arguments.
|
||||
#### Advanced Class Decoration:
|
||||
|
||||
Arguments can be passed directly to `@auto_docstring` for more control:
|
||||
|
||||
```python
|
||||
@auto_docstring(
|
||||
@ -64,9 +83,9 @@ Arguments can also be passed directly to `@auto_docstring` for more control. Use
|
||||
It builds upon the standard Transformer architecture with unique modifications.""",
|
||||
custom_args="""
|
||||
custom_parameter (`type`, *optional*, defaults to `default_value`):
|
||||
A concise description for custom_parameter if not defined or overriding the description in `auto_docstring.py`.
|
||||
A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
|
||||
internal_helper_arg (`type`, *optional*, defaults to `default_value`):
|
||||
A concise description for internal_helper_arg if not defined or overriding the description in `auto_docstring.py`.
|
||||
A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
|
||||
"""
|
||||
)
|
||||
class MySpecialModel(PreTrainedModel):
|
||||
@ -74,7 +93,7 @@ class MySpecialModel(PreTrainedModel):
|
||||
# ...
|
||||
```
|
||||
|
||||
You can also choose to only use `custom_intro` and define the custom arguments directly in the class.
|
||||
Or:
|
||||
|
||||
```python
|
||||
@auto_docstring(
|
||||
@ -85,44 +104,15 @@ class MySpecialModel(PreTrainedModel):
|
||||
def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
|
||||
r"""
|
||||
custom_parameter (`type`, *optional*, defaults to `default_value`):
|
||||
A concise description for custom_parameter if not defined or overriding the description in `auto_docstring.py`.
|
||||
A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
|
||||
internal_helper_arg (`type`, *optional*, defaults to `default_value`):
|
||||
A concise description for internal_helper_arg if not defined or overriding the description in `auto_docstring.py`.
|
||||
A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
|
||||
"""
|
||||
# ...
|
||||
```
|
||||
|
||||
You should also use the `@auto_docstring` decorator for classes that inherit from [`~utils.ModelOutput`].
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
Custom model outputs with additional fields.
|
||||
"""
|
||||
)
|
||||
class MyModelOutput(ImageClassifierOutput):
|
||||
r"""
|
||||
loss (`torch.FloatTensor`, *optional*):
|
||||
The loss of the model.
|
||||
custom_field (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
|
||||
A custom output field specific to this model.
|
||||
"""
|
||||
|
||||
# Standard fields like hidden_states, logits, attentions etc. can be automatically documented if the description is the same as the standard arguments.
|
||||
# However, given that the loss docstring is often different per model, you should document it in the docstring above.
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: Optional[torch.FloatTensor] = None
|
||||
hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
|
||||
attentions: Optional[tuple[torch.FloatTensor, ...]] = None
|
||||
# Custom fields need to be documented in the docstring above
|
||||
custom_field: Optional[torch.FloatTensor] = None
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="functions">
|
||||
|
||||
Place `@auto_docstring` directly above the method definition. The decorator derives parameter descriptions from the function signature.
|
||||
### 3. Applying to Functions (e.g., `forward` method)
|
||||
Apply the decorator above method definitions, such as the `forward` method.
|
||||
|
||||
```python
|
||||
@auto_docstring
|
||||
@ -141,10 +131,9 @@ Place `@auto_docstring` directly above the method definition. The decorator deri
|
||||
# ...
|
||||
```
|
||||
|
||||
Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to describe the argument and the `custom_args` parameter to describe the arguments.
|
||||
|
||||
The `Returns` and `Examples` parts of the docstring can also be manually specified.
|
||||
#### Advanced Function Decoration:
|
||||
|
||||
Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:
|
||||
|
||||
```python
|
||||
MODEL_COMMON_CUSTOM_ARGS = r"""
|
||||
@ -191,117 +180,100 @@ class MyModel(PreTrainedModel):
|
||||
# ...
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
---
|
||||
|
||||
## Documenting arguments
|
||||
### ✍️ Documenting Arguments: Approach & Priority
|
||||
|
||||
There are some rules for documenting different types of arguments and they're listed below.
|
||||
|
||||
- Standard arguments (`input_ids`, `attention_mask`, `pixel_values`, etc.) are defined and retrieved from `auto_docstring.py`. It is the single source of truth for standard arguments and should not be redefined locally if an argument's description and shape is the same as an argument in `auto_docstring.py`.
|
||||
|
||||
If a standard argument behaves differently in your model, then you can override it locally in a `r""" """` block. This local definition has a higher priority. For example, the `labels` argument is often customized per model and typically requires overriding.
|
||||
|
||||
|
||||
- New or custom arguments should be documented within an `r""" """` block after the signature if it is a function or in the `__init__` method's docstring if it is a class.
|
||||
|
||||
```py
|
||||
argument_name (`type`, *optional*, defaults to `X`):
|
||||
Description of the argument.
|
||||
Explain its purpose, expected shape/type if complex, and default behavior.
|
||||
This can span multiple lines.
|
||||
```
|
||||
1. **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
|
||||
* `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.
|
||||
|
||||
2. **New or Custom Arguments:**
|
||||
* **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
|
||||
* **Format:**
|
||||
```
|
||||
argument_name (`type`, *optional*, defaults to `X`):
|
||||
Description of the argument.
|
||||
Explain its purpose, expected shape/type if complex, and default behavior.
|
||||
This can span multiple lines.
|
||||
```
|
||||
* Include `type` in backticks.
|
||||
* Add *optional* if the argument is not required or has a default value.
|
||||
* Add "defaults to X" if it has a default value. You don't need to add "defaults to `None`" if the default value is `None`.
|
||||
* Add "*optional*" if the argument is not required (has a default value).
|
||||
* Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).
|
||||
|
||||
These arguments can also be passed to `@auto_docstring` as a `custom_args` argument. It is used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
|
||||
3. **Overriding Standard Arguments:**
|
||||
* If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
|
||||
* The `labels` argument is often customized per model and typically requires a specific docstring.
|
||||
|
||||
```py
|
||||
class MyModel(PreTrainedModel):
|
||||
# ...
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
This is a custom introduction for the function.
|
||||
"""
|
||||
custom_args=r"""
|
||||
common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
|
||||
Description of common_arg_1
|
||||
"""
|
||||
)
|
||||
```
|
||||
4. **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
|
||||
* New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
|
||||
|
||||
## Checking the docstrings
|
||||
---
|
||||
|
||||
Transformers includes a utility script to validate the docstrings when you open a Pull Request which triggers CI (continuous integration) checks. The script checks for the following criteria.
|
||||
### Usage with [modular files](./modular_transformers)
|
||||
|
||||
* Ensures `@auto_docstring` is applied to relevant mode classes and public methods.
|
||||
* Ensures arguments are complete and consistent. It checks that documented arguments exist in the signature and verifies whether the types and default values in the docstring match the signature. Arguments that aren't known standard arguments or if they lack a local description are flagged.
|
||||
* Reminds you to complete placeholders like `<fill_type>` and `<fill_docstring>`.
|
||||
* Ensures docstrings are formatted according to the expected docstring style.
|
||||
When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:
|
||||
|
||||
You can run this check locally - before committing - by running the following command.
|
||||
- **For standalone models in modular files:**
|
||||
Apply the `@auto_docstring` decorator just as you would in regular modeling files.
|
||||
|
||||
- **For models inheriting from other library models:**
|
||||
- When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
|
||||
- If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
|
||||
|
||||
> **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
|
||||
|
||||
|
||||
**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
|
||||
|
||||
---
|
||||
|
||||
## ✅ Checking Your Docstrings with `check_auto_docstrings`
|
||||
|
||||
The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
|
||||
|
||||
#### What it Checks:
|
||||
|
||||
* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
|
||||
* **Argument Completeness & Consistency:**
|
||||
* Flags arguments in the signature that are not known standard arguments and lack a local description.
|
||||
* Ensures documented arguments exist in the signature. (TODO)
|
||||
* Verifies that types and default values in the docstring match the signature. (TODO)
|
||||
* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
|
||||
* **Formatting:** Adherence to the expected docstring style.
|
||||
|
||||
#### Running the Check Locally:
|
||||
|
||||
Run this check locally before committing. The common command is:
|
||||
|
||||
```bash
|
||||
make fix-copies
|
||||
```
|
||||
|
||||
`make fix-copies` runs several other checks as well. If you don't need those checks, run the command below to only perform docstring and auto-docstring checks.
|
||||
Alternatively, to only perform docstrings and auto-docstring checks, you can use:
|
||||
|
||||
```bash
|
||||
python utils/check_docstrings.py # to only check files included in the diff without fixing them
|
||||
# python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
|
||||
# python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
|
||||
# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
|
||||
# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
|
||||
```
|
||||
|
||||
## modular_model.py files
|
||||
#### Workflow with the Checker:
|
||||
|
||||
When working with modular files (`modular_model.py`), follow the guidelines below for applying `@auto_docstring`.
|
||||
1. Add `@auto_docstring(...)` to the class or method.
|
||||
2. For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
|
||||
3. Run `make fix-copies` (or the `check_docstrings.py` utility).
|
||||
* For unrecognized arguments lacking documentation, the utility will create placeholder entries.
|
||||
4. Manually edit these placeholders with accurate types and descriptions.
|
||||
5. Re-run the check to ensure all issues are resolved.
|
||||
|
||||
- For standalone models in modular files, apply `@auto_docstring` like you would in a `modeling_model.py` file.
|
||||
- For models that inherit from other library models, `@auto_docstring` is automatically carried over to the generated modeling file. You don't need to add `@auto_docstring` in your modular file.
|
||||
---
|
||||
|
||||
If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file. Make sure to **include all other decorators** that are present in the original function or class.
|
||||
## 🔑 Key Takeaways & Best Practices
|
||||
|
||||
> [!WARNING]
|
||||
> When overriding any decorator in a modular file, you must include **all** decorators that were applied to that function or class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
|
||||
|
||||
## How it works
|
||||
|
||||
The `@auto_docstring` decorator automatically generates docstrings by:
|
||||
|
||||
1. Inspecting the signature (arguments, types, defaults) of the decorated class' `__init__` method or the decorated function.
|
||||
2. Retrieving the predefined docstrings for common arguments (`input_ids`, `attention_mask`, etc.) from internal library sources like [`ModelArgs`], [`ImageProcessorArgs`], and the `auto_docstring.py` file.
|
||||
3. Adding argument descriptions in one of two ways as shown below.
|
||||
|
||||
| method | description | usage |
|
||||
|---|---|---|
|
||||
| `r""" """` | add custom docstring content directly to a method signature or within the `__init__` docstring | document new arguments or override standard descriptions |
|
||||
| `custom_args` | add custom docstrings for specific arguments directly in `@auto_docstring` | define docstring for new arguments once if they're repeated in multiple places in the modeling file |
|
||||
|
||||
4. Adding class and function descriptions. For model classes with standard naming patterns, like `ModelForCausalLM`, or if it belongs to a pipeline, `@auto_docstring` automatically generates the appropriate descriptions with `ClassDocstring` from `auto_docstring.py`.
|
||||
|
||||
`@auto_docstring` also accepts the `custom_intro` argument to describe a class or function.
|
||||
|
||||
5. Using a templating system to allow predefined docstrings to include dynamic information from Transformers' [auto_modules](https://github.com/huggingface/transformers/tree/main/src/transformers/models/auto) such as `{{processor_class}}` and `{{config_class}}`.
|
||||
|
||||
6. Finding appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information form the model's configuration class to provide concrete examples with real model identifiers.
|
||||
|
||||
7. Adding return values to the docstring. For methods like `forward`, the decorator automatically generates the `Returns` field in the docstring based on the method's return type annotation.
|
||||
|
||||
For example, if a method returns a [`~transformers.utils.ModelOutput`] subclass, `@auto_docstring` extracts the field descriptions from the class' docstring to create a comprehensive return value description. You can also manually specify a custom `Returns` field in a functions docstring.
|
||||
|
||||
8. Unrolling kwargs typed with the unpack operator. For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentations from the `TypedDict` and adds each parameter to the function's docstring.
|
||||
|
||||
Currently only supported for [`FastImageProcessorKwargs`].
|
||||
|
||||
## Best practices
|
||||
|
||||
Follow the best practices below to help maintain consistent and informative documentation for Transformers!
|
||||
|
||||
* Use `@auto_docstring` for new PyTorch model classes ([`PreTrainedModel`] subclasses) and their primary methods like `forward` or `get_text_features`.
|
||||
* For classes, `@auto_docstring` retrieves parameter descriptions from the `__init__` method's docstring.
|
||||
* Rely on standard docstrings and do not redefine common arguments unless their behavior is different in your model.
|
||||
* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary for methods (e.g., `forward`, `get_text_features` etc.).
|
||||
* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
|
||||
* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
|
||||
* Document new or custom arguments clearly.
|
||||
* Run `check_docstrings` locally and iteratively.
|
||||
|
||||
By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.
|
||||
|
@ -82,37 +82,41 @@ When you use Transformers' [`Cache`] class, the self-attention module performs s
|
||||
|
||||
## Cache storage implementation
|
||||
|
||||
Caches are structured as a list of layers, where each layer contains a key and value cache. The key and value caches are tensors with the shape `[batch_size, num_heads, seq_len, head_dim]`.
|
||||
The actual storage of key-value pairs varies between cache implementations. As an example, consider the [`DynamicCache`].
|
||||
|
||||
Layers can be of different types (e.g. `DynamicLayer`, `StaticLayer`, `SlidingWindowLayer`), which mostly changes how sequence length is handled and how the cache is updated.
|
||||
|
||||
The simplest is a `DynamicLayer` that grows as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token:
|
||||
In [`DynamicCache`], the key-value pairs are stored as two lists of tensors. Each tensor in the lists have the shape `[batch_size, num_heads, seq_len, head_dim]`.
|
||||
- `key_cache`: A list of tensors, one for each layer.
|
||||
- `value_cache`: A list of tensors, one for each layer.
|
||||
|
||||
When new tokens are processed:
|
||||
|
||||
1. For each layer, the new key and value states are concatenated with the existing cache.
|
||||
```py
|
||||
cache.layers[idx].keys = torch.cat([cache.layers[idx].keys, key_states], dim=-2)
|
||||
cache.layers[idx].values = torch.cat([cache.layers[idx].values, value_states], dim=-2)
|
||||
self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
|
||||
self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
|
||||
```
|
||||
|
||||
Other layer types like `StaticLayer` and `SlidingWindowLayer` have a fixed sequence length that is set when the cache is created. This makes them compatible with `torch.compile`. In the case of `SlidingWindowLayer`, existing tokens are shifted out of the cache when a new token is added.
|
||||
2. The cache grows dynamically as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token.
|
||||
|
||||
3. The cache maintains a count of seen tokens through `self._seen_tokens`. This is updated when the first layer processes a new token.
|
||||
|
||||
The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache, infer_device
|
||||
|
||||
device = f"{infer_device()}:0"
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
|
||||
|
||||
model_id = "meta-llama/Llama-2-7b-chat-hf"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
past_key_values = DynamicCache()
|
||||
messages = [{"role": "user", "content": "Hello, what's your name."}]
|
||||
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
|
||||
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0")
|
||||
|
||||
generated_ids = inputs.input_ids
|
||||
cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device=model.device)
|
||||
cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device="cuda:0")
|
||||
max_new_tokens = 10
|
||||
|
||||
for _ in range(max_new_tokens):
|
||||
@ -130,36 +134,6 @@ for _ in range(max_new_tokens):
|
||||
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
|
||||
"[INST] Hello, what's your name. [/INST] Hello! My name is LLaMA,"
|
||||
```
|
||||
|
||||
## Cache position
|
||||
|
||||
The cache position tracks where to insert new tokens in the attention cache. It represents the *absolute* position of each token in the context, independent of padding or batch structure. Suppose you already cached `N` tokens and are now processing `K` new tokens. The cache position for the new tokens will range from `N` to `N + K - 1`. In other words, you're processing tokens at positions - `[N, N + 1, N + 2, ..., N + K - 1]`.
|
||||
|
||||
Cache position is used internally for two purposes:
|
||||
|
||||
1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven’t been cached yet are passed to the model's `forward`.
|
||||
2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches, like [`StaticCache`], that pre-allocates a specific cache length.
|
||||
|
||||
The generation loop usually takes care of the cache position, but if you're writing a custom generation method, it is important that cache positions are accurate since they are used to write and read key/value states into fixed slots.
|
||||
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache, infer_device
|
||||
|
||||
device = f"{infer_device()}:0"
|
||||
|
||||
model_id = "meta-llama/Llama-2-7b-chat-hf"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
messages = [{"role": "user", "content": "You are a helpful assistant."}]
|
||||
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
|
||||
generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=10)
|
||||
|
||||
```
|
||||
|
||||
|
||||
## Legacy cache format
|
||||
|
||||
Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`].
|
||||
@ -169,7 +143,7 @@ The legacy format is essentially the same data structure but organized different
|
||||
- The tensors have the same shape `[batch_size, num_heads, seq_len, head_dim]`.
|
||||
- The format is less flexible and doesn't support features like quantization or offloading.
|
||||
|
||||
If your project depends on this legacy format, we recommend to convert to [`DynamicCache`] with [`~DynamicCache.from_legacy_cache`]. Note that legacy cache format is deprecated and not used anymore in `Transformers`. You can convert back to tuple format with [`DynamicCache.to_legacy_cache`] functions, which is helpful if you have custom logic for manipulating a cache in a specific format.
|
||||
If your project depends on this legacy format, you can convert between [`DynamicCache`] and a tuple of tuples as shown below with the [`~DynamicCache.from_legacy_cache`] and [`DynamicCache.to_legacy_cache`] functions. This is helpful if you have custom logic for manipulating a cache in a specific format.
|
||||
|
||||
```py
|
||||
import torch
|
||||
@ -185,4 +159,4 @@ generation_outputs = model.generate(**inputs, return_dict_in_generate=True, retu
|
||||
|
||||
cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
|
||||
legacy_format_cache = cache.to_legacy_cache()
|
||||
```
|
||||
```
|
@ -161,7 +161,7 @@ It can be confusing to manage multiple templates though, so we recommend using a
|
||||
|
||||
It is important to set a chat template format that matches the template format a model was pretrained on, otherwise performance may suffer. Even if you’re training the model further, performance is best if the chat tokens are kept constant.
|
||||
|
||||
But if you’re training a model from scratch or finetuning a model for chat, you have more options to select a template. For example, [ChatML](https://github.com/openai/openai-python/blob/release-v0.28.0/chatml.md) is a popular format that is flexible enough to handle many use cases. It even includes support for [generation prompts](#add_generation_prompt), but it doesn’t add beginning-of-string (`BOS`) or end-of-string (`EOS`) tokens. If your model expects `BOS` and `EOS` tokens, set `add_special_tokens=True` and make sure to add them to your template.
|
||||
But if you’re training a model from scratch or finetuning a model for chat, you have more options to select a template. For example, [ChatML](https://github.com/openai/openai-python/blob/release-v0.28.0/chatml.md) is a popular format that is flexbile enough to handle many use cases. It even includes support for [generation prompts](#add_generation_prompt), but it doesn’t add beginning-of-string (`BOS`) or end-of-string (`EOS`) tokens. If your model expects `BOS` and `EOS` tokens, set `add_special_tokens=True` and make sure to add them to your template.
|
||||
|
||||
```py
|
||||
{%- for message in messages %}
|
||||
|
@ -56,7 +56,7 @@ Create a [`ImageTextToTextPipeline`] and pass the chat to it. For large models,
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device_map="auto", torch_dtype=torch.float16)
|
||||
pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda", torch_dtype=torch.float16)
|
||||
pipeline(text=messages, max_new_tokens=50, return_full_text=False)
|
||||
[{'input_text': [{'role': 'system',
|
||||
'content': [{'type': 'text',
|
||||
@ -111,7 +111,6 @@ Some vision models also support video inputs. The message format is very similar
|
||||
|
||||
- The content `"type"` should be `"video"` to indicate the content is a video.
|
||||
- For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).
|
||||
- In addition to loading videos from a URL or file path, you can also pass decoded video data directly. This is useful if you’ve already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, decord, or torchvision). You don't need to save to files or store it in an URL.
|
||||
|
||||
> [!WARNING]
|
||||
> Loading a video from `"url"` is only supported by the PyAV or Decord backends.
|
||||
@ -138,52 +137,6 @@ messages = [
|
||||
]
|
||||
```
|
||||
|
||||
### Example: Passing decoded video objects
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
video_object1 = np.random.randint(0, 255, size=(16, 224, 224, 3), dtype=np.uint8),
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "video": video_object1},
|
||||
{"type": "text", "text": "What do you see in this video?"}
|
||||
],
|
||||
},
|
||||
]
|
||||
```
|
||||
You can also use existing (`"load_video()"`) function to load a video, edit the video in memory and pass it in the messages.
|
||||
```python
|
||||
|
||||
# Make sure a video backend library (pyav, decord, or torchvision) is available.
|
||||
from transformers.video_utils import load_video
|
||||
|
||||
# load a video file in memory for testing
|
||||
video_object2, _ = load_video(
|
||||
"https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "video": video_object2},
|
||||
{"type": "text", "text": "What do you see in this video?"}
|
||||
],
|
||||
},
|
||||
]
|
||||
```
|
||||
|
||||
Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`] that controls the sampling process.
|
||||
|
||||
The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html).
|
||||
@ -222,7 +175,7 @@ processed_chat = processor.apply_chat_template(
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
video_fps=16,
|
||||
video_fps=32,
|
||||
video_load_backend="decord",
|
||||
)
|
||||
print(processed_chat.keys())
|
||||
|
@ -25,9 +25,9 @@ Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_
|
||||
|
||||
This guide shows you how to quickly start chatting with Transformers from the command line, how build and format a conversation, and how to chat using the [`TextGenerationPipeline`].
|
||||
|
||||
## chat CLI
|
||||
## transformers CLI
|
||||
|
||||
After you've [installed Transformers](./installation), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.
|
||||
After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.
|
||||
|
||||
```bash
|
||||
transformers chat Qwen/Qwen2.5-0.5B-Instruct
|
||||
@ -49,8 +49,7 @@ For a full list of options, run the command below.
|
||||
transformers chat -h
|
||||
```
|
||||
|
||||
The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating). It uses the `transformers serve` CLI under the hood ([docs](./serving.md#serve-cli)).
|
||||
|
||||
The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating).
|
||||
|
||||
## TextGenerationPipeline
|
||||
|
||||
@ -158,4 +157,4 @@ The easiest solution for improving generation speed is to either quantize a mode
|
||||
You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token per `forward` pass. This significantly alleviates the bandwidth bottleneck and improves generation speed.
|
||||
|
||||
> [!TIP]
|
||||
> Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token.
|
||||
> Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe.md), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token.
|
||||
|
@ -1,42 +0,0 @@
|
||||
# Using Cursor as a client of transformers serve
|
||||
|
||||
This example shows how to use `transformers serve` as a local LLM provider for [Cursor](https://cursor.com/), the popular IDE. In this particular case, requests to `transformers serve` will come from an external IP (Cursor's server IPs), which requires some additional setup. Furthermore, some of Cursor's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons.
|
||||
|
||||
To launch a server with CORS enabled, run
|
||||
|
||||
```shell
|
||||
transformers serve --enable-cors
|
||||
```
|
||||
|
||||
You'll also need to expose your server to external IPs. A potential solution is to use [`ngrok`](https://ngrok.com/), which has a permissive free tier. After setting up your `ngrok` account and authenticating on your server machine, you run
|
||||
|
||||
```shell
|
||||
ngrok http [port]
|
||||
```
|
||||
|
||||
where `port` is the port used by `transformers serve` (`8000` by default). On the terminal where you launched `ngrok`, you'll see a https address in the "Forwarding" row, as in the image below. This is the address to send requests to.
|
||||
|
||||
<h3 align="center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_ngrok.png"/>
|
||||
</h3>
|
||||
|
||||
You're now ready to set things up on the app side! In Cursor, while you can't set a new provider, you can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set your `transformers serve` endpoint, follow this order:
|
||||
1. Unselect ALL models in the list above (e.g. `gpt4`, ...);
|
||||
2. Add and select the model you want to use (e.g. `Qwen/Qwen3-4B`)
|
||||
3. Add some random text to OpenAI API Key. This field won't be used, but it can’t be empty;
|
||||
4. Add the https address from `ngrok` to the "Override OpenAI Base URL" field, appending `/v1` to the address (i.e. `https://(...).ngrok-free.app/v1`);
|
||||
5. Hit "Verify".
|
||||
|
||||
After you follow these steps, your "Models" tab should look like the image below. Your server should also have received a few requests from the verification step.
|
||||
|
||||
<h3 align="center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor.png"/>
|
||||
</h3>
|
||||
|
||||
You are now ready to use your local model in Cursor! For instance, if you toggle the AI Pane, you can select the model you added and ask it questions about your local files.
|
||||
|
||||
<h3 align="center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor_chat.png"/>
|
||||
</h3>
|
||||
|
||||
|
@ -271,7 +271,7 @@ The model is ready to be pushed to the Hub now. Log in to your Hugging Face acco
|
||||
<hfoption id="huggingface-CLI">
|
||||
|
||||
```bash
|
||||
hf auth login
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
@ -26,7 +26,6 @@ Pass the audio signal, typically stored in `array`, to the feature extractor and
|
||||
from transformers import AutoFeatureExtractor
|
||||
|
||||
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
|
||||
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
|
||||
processed_sample = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000)
|
||||
processed_sample
|
||||
{'input_values': [array([ 9.4472744e-05, 3.0777880e-03, -2.8888427e-03, ...,
|
||||
|
@ -315,37 +315,37 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
```
|
||||
|
||||
|
||||
## Custom generation methods
|
||||
## Custom decoding methods
|
||||
|
||||
Custom generation methods enable specialized behavior such as:
|
||||
Custom decoding methods enable specialized generation behavior such as the following:
|
||||
- have the model continue thinking if it is uncertain;
|
||||
- roll back generation if the model gets stuck;
|
||||
- handle special tokens with custom logic;
|
||||
- use specialized KV caches;
|
||||
- enhanced input preparation for advanced models;
|
||||
|
||||
We enable custom generation methods through model repositories, assuming a specific model tag and file structure (see subsection below). This feature is an extension of [custom modeling code](./models.md#custom-models) and, like such, requires setting `trust_remote_code=True`.
|
||||
We enable custom decoding methods through model repositories, assuming a specific model tag and file structure (see subsection below). This feature is an extension of [custom modeling code](./models.md#custom-models) and, like such, requires setting `trust_remote_code=True`.
|
||||
|
||||
If a model repository holds a custom generation method, the easiest way to try it out is to load the model and generate with it:
|
||||
If a model repository holds a custom decoding method, the easiest way to try it out is to load the model and generate with it:
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# `transformers-community/custom_generate_example` holds a copy of `Qwen/Qwen2.5-0.5B-Instruct`, but
|
||||
# with custom generation code -> calling `generate` uses the custom generation method!
|
||||
# with custom generation code -> calling `generate` uses the custom decoding method!
|
||||
tokenizer = AutoTokenizer.from_pretrained("transformers-community/custom_generate_example")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"transformers-community/custom_generate_example", device_map="auto", trust_remote_code=True
|
||||
)
|
||||
|
||||
inputs = tokenizer(["The quick brown"], return_tensors="pt").to(model.device)
|
||||
# The custom generation method is a minimal greedy decoding implementation. It also prints a custom message at run time.
|
||||
# The custom decoding method is a minimal greedy decoding implementation. It also prints a custom message at run time.
|
||||
gen_out = model.generate(**inputs)
|
||||
# you should now see its custom message, "✨ using a custom generation method ✨"
|
||||
print(tokenizer.batch_decode(gen_out, skip_special_tokens=True))
|
||||
'The quick brown fox jumps over a lazy dog, and the dog is a type of animal. Is'
|
||||
```
|
||||
|
||||
Model repositories with custom generation methods have a special property: their generation method can be loaded from **any** model through [`~GenerationMixin.generate`]'s `custom_generate` argument. This means anyone can create and share their custom generation method to potentially work with any Transformers model, without requiring users to install additional Python packages.
|
||||
Model repositories with custom decoding methods have a special property: their decoding method can be loaded from **any** model through [`~GenerationMixin.generate`]'s `custom_generate` argument. This means anyone can create and share their custom generation method to potentially work with any Transformers model, without requiring users to install additional Python packages.
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
@ -354,7 +354,7 @@ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
|
||||
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
|
||||
|
||||
inputs = tokenizer(["The quick brown"], return_tensors="pt").to(model.device)
|
||||
# `custom_generate` replaces the original `generate` by the custom generation method defined in
|
||||
# `custom_generate` replaces the original `generate` by the custom decoding method defined in
|
||||
# `transformers-community/custom_generate_example`
|
||||
gen_out = model.generate(**inputs, custom_generate="transformers-community/custom_generate_example", trust_remote_code=True)
|
||||
print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
|
||||
@ -364,7 +364,7 @@ print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
|
||||
You should read the `README.md` file of the repository containing the custom generation strategy to see what the new arguments and output type differences are, if they exist. Otherwise, you can assume it works like the base [`~GenerationMixin.generate`] method.
|
||||
|
||||
> [!TIP]
|
||||
> You can find all custom generation methods by [searching for their custom tag.](https://huggingface.co/models?other=custom_generate), `custom_generate`.
|
||||
> You can find all custom decoding methods by [searching for their custom tag.](https://huggingface.co/models?other=custom_generate), `custom_generate`
|
||||
|
||||
Consider the Hub repository [transformers-community/custom_generate_example](https://huggingface.co/transformers-community/custom_generate_example) as an example. The `README.md` states that it has an additional input argument, `left_padding`, which adds a number of padding tokens before the prompt.
|
||||
|
||||
@ -387,11 +387,11 @@ torch>=99.0 (installed: 2.6.0)
|
||||
|
||||
Updating your Python requirements accordingly will remove this error message.
|
||||
|
||||
### Creating a custom generation method
|
||||
### Creating a custom decoding method
|
||||
|
||||
To create a new generation method, you need to create a new [**Model**](https://huggingface.co/new) repository and push a few files into it.
|
||||
1. The model you've designed your generation method with.
|
||||
2. `custom_generate/generate.py`, which contains all the logic for your custom generation method.
|
||||
To create a new decoding method, you need to create a new [**Model**](https://huggingface.co/new) repository and push a few files into it.
|
||||
1. The model you've designed your decoding method with.
|
||||
2. `custom_generate/generate.py`, which contains all the logic for your custom decoding method.
|
||||
3. `custom_generate/requirements.txt`, used to optionally add new Python requirements and/or lock specific versions to correctly use your method.
|
||||
4. `README.md`, where you should add the `custom_generate` tag and document any new arguments or output type differences of your custom method here.
|
||||
|
||||
@ -409,7 +409,7 @@ your_repo/
|
||||
|
||||
#### Adding the base model
|
||||
|
||||
The starting point for your custom generation method is a model repository just like any other. The model to add to this repository should be the model you've designed your method with, and it is meant to be part of a working self-contained model-generate pair. When the model in this repository is loaded, your custom generation method will override `generate`. Don't worry -- your generation method can still be loaded with any other Transformers model, as explained in the section above.
|
||||
The starting point for your custom decoding method is a model repository just like any other. The model to add to this repository should be the model you've designed your method with, and it is meant to be part of a working self-contained model-generate pair. When the model in this repository is loaded, your custom decoding method will override `generate`. Don't worry -- your decoding method can still be loaded with any other Transformers model, as explained in the section above.
|
||||
|
||||
If you simply want to copy an existing model, you can do
|
||||
|
||||
@ -418,13 +418,13 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("source/model_repo")
|
||||
model = AutoModelForCausalLM.from_pretrained("source/model_repo")
|
||||
tokenizer.save_pretrained("your/generation_method", push_to_hub=True)
|
||||
model.save_pretrained("your/generation_method", push_to_hub=True)
|
||||
tokenizer.save_pretrained("your/decoding_method", push_to_hub=True)
|
||||
model.save_pretrained("your/decoding_method", push_to_hub=True)
|
||||
```
|
||||
|
||||
#### generate.py
|
||||
|
||||
This is the core of your generation method. It *must* contain a method named `generate`, and this method *must* contain a `model` argument as its first argument. `model` is the model instance, which means you have access to all attributes and methods in the model, including the ones defined in [`GenerationMixin`] (like the base `generate` method).
|
||||
This is the core of your decoding method. It *must* contain a method named `generate`, and this method *must* contain a `model` argument as its first argument. `model` is the model instance, which means you have access to all attributes and methods in the model, including the ones defined in [`GenerationMixin`] (like the base `generate` method).
|
||||
|
||||
> [!WARNING]
|
||||
> `generate.py` must be placed in a folder named `custom_generate`, and not at the root level of the repository. The file paths for this feature are hardcoded.
|
||||
@ -465,27 +465,19 @@ def generate(model, input_ids, generation_config=None, left_padding=None, **kwar
|
||||
return input_ids
|
||||
```
|
||||
|
||||
Follow the recommended practices below to ensure your custom generation method works as expected.
|
||||
Follow the recommended practices below to ensure your custom decoding method works as expected.
|
||||
- Feel free to reuse the logic for validation and input preparation in the original [`~GenerationMixin.generate`].
|
||||
- Pin the `transformers` version in the requirements if you use any private method/attribute in `model`.
|
||||
- You can add other files in the `custom_generate` folder, and use relative imports.
|
||||
- Consider adding model validation, input validation, or even a separate test file to help users sanity-check your code in their environment.
|
||||
|
||||
Your custom `generate` method can relative import code from the `custom_generate` folder. For example, if you have a `utils.py` file, you can import it like this:
|
||||
|
||||
```py
|
||||
from .utils import some_function
|
||||
```
|
||||
|
||||
Only relative imports from the same-level `custom_generate` folder are supported. Parent/sibling folder imports are not valid. The `custom_generate` argument also works locally with any directory that contains a `custom_generate` structure. This is the recommended workflow for developing your custom generation method.
|
||||
|
||||
|
||||
#### requirements.txt
|
||||
|
||||
You can optionally specify additional Python requirements in a `requirements.txt` file inside the `custom_generate` folder. These are checked at runtime and an exception will be thrown if they're missing, nudging users to update their environment accordingly.
|
||||
|
||||
#### README.md
|
||||
|
||||
The root level `README.md` in the model repository usually describes the model therein. However, since the focus of the repository is the custom generation method, we highly recommend to shift its focus towards describing the custom generation method. In addition to a description of the method, we recommend documenting any input and/or output differences to the original [`~GenerationMixin.generate`]. This way, users can focus on what's new, and rely on Transformers docs for generic implementation details.
|
||||
The root level `README.md` in the model repository usually describes the model therein. However, since the focus of the repository is the custom decoding method, we highly recommend to shift its focus towards describing the custom decoding method. In addition to a description of the method, we recommend documenting any input and/or output differences to the original [`~GenerationMixin.generate`]. This way, users can focus on what's new, and rely on Transformers docs for generic implementation details.
|
||||
|
||||
For discoverability, we highly recommend you to add the `custom_generate` tag to your repository. To do so, the top of your `README.md` file should look like the example below. After you push the file, you should see the tag in your repository!
|
||||
|
||||
@ -504,36 +496,6 @@ Recommended practices:
|
||||
- Add self-contained examples to enable quick experimentation.
|
||||
- Describe soft-requirements such as if the method only works well with a certain family of models.
|
||||
|
||||
### Reusing `generate`’s input preparation
|
||||
|
||||
If you're adding a new decoding loop, you might want to preserve the input preparation present in `generate` (batch expansion, attention masks, logits processors, stopping criteria, etc.). You can also pass a **callable** to `custom_generate` to reuse [`~GenerationMixin.generate`]’s full preparation pipeline while overriding only the decoding loop.
|
||||
|
||||
```py
|
||||
def custom_loop(model, input_ids, attention_mask, logits_processor, stopping_criteria, generation_config, **model_kwargs):
|
||||
next_tokens = input_ids
|
||||
while input_ids.shape[1] < stopping_criteria[0].max_length:
|
||||
logits = model(next_tokens, attention_mask=attention_mask, **model_kwargs).logits
|
||||
next_token_logits = logits_processor(input_ids, logits[:, -1, :])
|
||||
next_tokens = torch.argmax(next_token_logits, dim=-1)[:, None]
|
||||
input_ids = torch.cat((input_ids, next_tokens), dim=-1)
|
||||
attention_mask = torch.cat((attention_mask, torch.ones_like(next_tokens)), dim=-1)
|
||||
return input_ids
|
||||
|
||||
output = model.generate(
|
||||
**inputs,
|
||||
custom_generate=custom_loop,
|
||||
max_new_tokens=10,
|
||||
)
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> If you publish a `custom_generate` repository, your `generate` implementation can itself define a callable and pass it to `model.generate()`. This lets you customize the decoding loop while still benefiting from Transformers’ built-in input preparation logic.
|
||||
|
||||
### Finding custom generation methods
|
||||
|
||||
You can find all custom generation methods by [searching for their custom tag.](https://huggingface.co/models?other=custom_generate), `custom_generate`. In addition to the tag, we curate two collections of `custom_generate` methods:
|
||||
- [Custom generation methods - Community](https://huggingface.co/collections/transformers-community/custom-generation-methods-community-6888fb1da0efbc592d3a8ab6) -- a collection of powerful methods contributed by the community;
|
||||
- [Custom generation methods - Tutorials](https://huggingface.co/collections/transformers-community/custom-generation-methods-tutorials-6823589657a94940ea02cfec) -- a collection of reference implementations for methods that previously were part of `transformers`, as well as tutorials for `custom_generate`.
|
||||
|
||||
## Resources
|
||||
|
||||
|
@ -48,4 +48,3 @@ Most of those are only useful if you are studying the general code in the librar
|
||||
## Other Utilities
|
||||
|
||||
[[autodoc]] utils._LazyModule
|
||||
[[autodoc]] pytorch_utils.infer_device
|
||||
|
@ -356,76 +356,65 @@ A [`Constraint`] can be used to force the generation to include specific tokens
|
||||
|
||||
## Caches
|
||||
|
||||
[[autodoc]] CacheLayerMixin
|
||||
- update
|
||||
- get_seq_length
|
||||
- get_mask_sizes
|
||||
- get_max_cache_shape
|
||||
- reset
|
||||
- reorder_cache
|
||||
- lazy_initialization
|
||||
|
||||
[[autodoc]] DynamicLayer
|
||||
- update
|
||||
- lazy_initialization
|
||||
- crop
|
||||
- batch_repeat_interleave
|
||||
- batch_select_indices
|
||||
|
||||
[[autodoc]] StaticLayer
|
||||
- update
|
||||
- lazy_initialization
|
||||
|
||||
[[autodoc]] SlidingWindowLayer
|
||||
- update
|
||||
- lazy_initialization
|
||||
|
||||
[[autodoc]] QuantoQuantizedLayer
|
||||
- update
|
||||
- lazy_initialization
|
||||
|
||||
[[autodoc]] HQQQuantizedLayer
|
||||
- update
|
||||
- lazy_initialization
|
||||
|
||||
[[autodoc]] Cache
|
||||
- update
|
||||
- early_initialization
|
||||
- get_seq_length
|
||||
- get_mask_sizes
|
||||
- get_max_cache_shape
|
||||
- reset
|
||||
- reorder_cache
|
||||
- crop
|
||||
- batch_repeat_interleave
|
||||
- batch_select_indices
|
||||
|
||||
[[autodoc]] CacheConfig
|
||||
- update
|
||||
|
||||
[[autodoc]] QuantizedCacheConfig
|
||||
- validate
|
||||
|
||||
[[autodoc]] DynamicCache
|
||||
- update
|
||||
- get_seq_length
|
||||
- reorder_cache
|
||||
- to_legacy_cache
|
||||
- from_legacy_cache
|
||||
|
||||
[[autodoc]] QuantizedCache
|
||||
- update
|
||||
- get_seq_length
|
||||
|
||||
[[autodoc]] QuantoQuantizedCache
|
||||
|
||||
[[autodoc]] HQQQuantizedCache
|
||||
|
||||
[[autodoc]] OffloadedCache
|
||||
- update
|
||||
- prefetch_layer
|
||||
- evict_previous_layer
|
||||
|
||||
[[autodoc]] StaticCache
|
||||
- update
|
||||
- get_seq_length
|
||||
- reset
|
||||
|
||||
[[autodoc]] OffloadedStaticCache
|
||||
- update
|
||||
- get_seq_length
|
||||
- reset
|
||||
|
||||
[[autodoc]] HybridCache
|
||||
|
||||
[[autodoc]] HybridChunkedCache
|
||||
- update
|
||||
- get_seq_length
|
||||
- reset
|
||||
|
||||
[[autodoc]] SlidingWindowCache
|
||||
- update
|
||||
- reset
|
||||
|
||||
[[autodoc]] EncoderDecoderCache
|
||||
- get_seq_length
|
||||
- to_legacy_cache
|
||||
- from_legacy_cache
|
||||
- reset
|
||||
- reorder_cache
|
||||
|
||||
[[autodoc]] MambaCache
|
||||
- update_conv_state
|
||||
- update_ssm_state
|
||||
- reset
|
||||
|
||||
## Watermark Utils
|
||||
|
||||
|
@ -247,114 +247,3 @@ first and last layer will be shown. This is useful when some layers (typically c
|
||||
layers.
|
||||
|
||||
[[autodoc]] model_addition_debugger_context
|
||||
|
||||
## Analyzer of skipped tests
|
||||
|
||||
### Scan skipped tests - for model adders and maintainers
|
||||
|
||||
This small util is a power user tool intended for model adders and maintainers. It lists all test methods
|
||||
existing in `test_modeling_common.py`, inherited by all model tester classes, and scans the repository to measure
|
||||
how many tests are being skipped and for which models.
|
||||
|
||||
### Rationale
|
||||
|
||||
When porting models to transformers, tests fail as they should, and sometimes `test_modeling_common` feels irreconcilable with the peculiarities of our brand new model. But how can we be sure we're not breaking everything by adding a seemingly innocent skip?
|
||||
|
||||
This utility:
|
||||
- scans all test_modeling_common methods
|
||||
- looks for times where a method is skipped
|
||||
- returns a summary json you can load as a DataFrame/inspect
|
||||
|
||||
**For instance test_inputs_embeds is skipped in a whooping 39% proportion at the time of writing this util.**
|
||||
|
||||

|
||||
|
||||
|
||||
### Usage
|
||||
|
||||
You can run the skipped test analyzer in two ways:
|
||||
|
||||
#### Full scan (default)
|
||||
|
||||
From the root of `transformers` repo, scans all common test methods and outputs the results to a JSON file (default: `all_tests_scan_result.json`).
|
||||
|
||||
```bash
|
||||
python utils/scan_skipped_tests.py --output_dir path/to/output
|
||||
```
|
||||
|
||||
- `--output_dir` (optional): Directory where the JSON results will be saved. Defaults to the current directory.
|
||||
|
||||
**Example output:**
|
||||
|
||||
```
|
||||
🔬 Parsing 331 model test files once each...
|
||||
📝 Aggregating 224 tests...
|
||||
(224/224) test_update_candidate_strategy_with_matches_1es_3d_is_nonecodet_schedule_fa_kwargs
|
||||
✅ Scan complete.
|
||||
|
||||
📄 JSON saved to /home/pablo/git/transformers/all_tests_scan_result.json
|
||||
|
||||
```
|
||||
|
||||
And it will generate `all_tests_scan_result.json` file that you can inspect. The JSON is indexed by method name, and each entry follows this schema, indicating the origin as well (from `common`or `GenerationMixin`.)
|
||||
|
||||
```json
|
||||
{
|
||||
"<method_name>": {
|
||||
"origin": "<test suite>"
|
||||
"models_ran": ["<model_name>", ...],
|
||||
"models_skipped": ["<model_name>", ...],
|
||||
"skipped_proportion": <float>,
|
||||
"reasons_skipped": ["<model_name>: <reason>",
|
||||
...
|
||||
]
|
||||
},
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
Which you can visualise as above with e.g. `pandas`
|
||||
|
||||
```python
|
||||
df = pd.read_json('all_tests_scan_result.json').T
|
||||
df.sort_values(by=['skipped_proportion'], ascending=False)
|
||||
|
||||
```
|
||||
|
||||
### Scan a single test method
|
||||
|
||||
You can focus on a specific test method using `--test_method_name`:
|
||||
|
||||
```bash
|
||||
$ python utils/scan_skipped_tests.py --test_method_name test_inputs_embeds --output_dir path/to/output
|
||||
```
|
||||
|
||||
- `--test_method_name`: Name of the test method to scan (e.g., `test_inputs_embeds`).
|
||||
- `--output_dir` (optional): Directory where the JSON result will be saved.
|
||||
|
||||
**Example output:**
|
||||
|
||||
```bash
|
||||
$ python utils/scan_skipped_tests.py --test_method_name test_inputs_embeds
|
||||
|
||||
🔬 Parsing 331 model test files once each...
|
||||
|
||||
== test_inputs_embeds ==
|
||||
|
||||
Ran : 199/323
|
||||
Skipped : 124/323 (38.4%)
|
||||
- aimv2: Aimv2 does not use inputs_embeds
|
||||
- align: Inputs_embeds is tested in individual model tests
|
||||
- altclip: Inputs_embeds is tested in individual model tests
|
||||
- audio_spectrogram_transformer: AST does not use inputs_embeds
|
||||
- beit: BEiT does not use inputs_embeds
|
||||
- bit: Bit does not use inputs_embeds
|
||||
- blip: Blip does not use inputs_embeds
|
||||
- blip_2: Inputs_embeds is tested in individual model tests
|
||||
- bridgetower:
|
||||
- canine: CANINE does not have a get_input_embeddings() method.
|
||||
- ...
|
||||
|
||||
📄 JSON saved to /home/pablo/git/transformers/scan_test_inputs_embeds.json
|
||||
|
||||
```
|
@ -1,32 +0,0 @@
|
||||
# Jan: using the serving API as a local LLM provider
|
||||
|
||||
This example shows how to use `transformers serve` as a local LLM provider for the [Jan](https://jan.ai/) app. Jan is a ChatGPT-alternative graphical interface, fully running on your machine. The requests to `transformers serve` come directly from the local app -- while this section focuses on Jan, you can extrapolate some instructions to other apps that make local requests.
|
||||
|
||||
## Running models locally
|
||||
|
||||
To connect `transformers serve` with Jan, you'll need to set up a new model provider ("Settings" > "Model Providers"). Click on "Add Provider", and set a new name. In your new model provider page, all you need to set is the "Base URL" to the following pattern:
|
||||
|
||||
```shell
|
||||
http://[host]:[port]/v1
|
||||
```
|
||||
|
||||
where `host` and `port` are the `transformers serve` CLI parameters (`localhost:8000` by default). After setting this up, you should be able to see some models in the "Models" section, hitting "Refresh". Make sure you add some text in the "API key" text field too -- this data is not actually used, but the field can't be empty. Your custom model provider page should look like this:
|
||||
|
||||
<h3 align="center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_jan_model_providers.png"/>
|
||||
</h3>
|
||||
|
||||
You are now ready to chat!
|
||||
|
||||
> [!TIP]
|
||||
> You can add any `transformers`-compatible model to Jan through `transformers serve`. In the custom model provider you created, click on the "+" button in the "Models" section and add its Hub repository name, e.g. `Qwen/Qwen3-4B`.
|
||||
|
||||
## Running models on a separate machine
|
||||
|
||||
To conclude this example, let's look into a more advanced use-case. If you have a beefy machine to serve models with, but prefer using Jan on a different device, you need to add port forwarding. If you have `ssh` access from your Jan machine into your server, this can be accomplished by typing the following to your Jan machine's terminal
|
||||
|
||||
```
|
||||
ssh -N -f -L 8000:localhost:8000 your_server_account@your_server_IP -p port_to_ssh_into_your_server
|
||||
```
|
||||
|
||||
Port forwarding is not Jan-specific: you can use it to connect `transformers serve` running in a different machine with an app of your choice.
|
@ -44,7 +44,7 @@ import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
|
||||
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
|
||||
|
||||
model.generate(**inputs, do_sample=False, max_new_tokens=20, use_cache=False)
|
||||
@ -59,7 +59,7 @@ import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
|
||||
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
|
||||
|
||||
past_key_values = DynamicCache()
|
||||
@ -87,7 +87,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
ckpt = "microsoft/Phi-3-mini-4k-instruct"
|
||||
tokenizer = AutoTokenizer.from_pretrained(ckpt)
|
||||
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
|
||||
inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)
|
||||
|
||||
out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded")
|
||||
@ -99,26 +99,24 @@ The example below shows how you can fallback on [`OffloadedCache`] if you run ou
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, infer_device
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
def resilient_generate(model, *args, **kwargs):
|
||||
oom = False
|
||||
device = infer_device()
|
||||
torch_device_module = getattr(torch, device, torch.cuda)
|
||||
try:
|
||||
return model.generate(*args, **kwargs)
|
||||
except torch.OutOfMemoryError as e:
|
||||
except torch.cuda.OutOfMemoryError as e:
|
||||
print(e)
|
||||
print("retrying with cache_implementation='offloaded'")
|
||||
oom = True
|
||||
if oom:
|
||||
torch_device_module.empty_cache()
|
||||
torch.cuda.empty_cache()
|
||||
kwargs["cache_implementation"] = "offloaded"
|
||||
return model.generate(*args, **kwargs)
|
||||
|
||||
ckpt = "microsoft/Phi-3-mini-4k-instruct"
|
||||
tokenizer = AutoTokenizer.from_pretrained(ckpt)
|
||||
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
|
||||
prompt = ["okay "*1000 + "Fun fact: The most"]
|
||||
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
||||
beams = { "num_beams": 40, "num_beam_groups": 40, "num_return_sequences": 40, "diversity_penalty": 1.0, "max_new_tokens": 23, "early_stopping": True, }
|
||||
@ -136,7 +134,7 @@ The [`QuantizedCache`] reduces memory requirements by quantizing the KV values t
|
||||
> [!WARNING]
|
||||
> Quantizing the cache can harm latency if the context length is short and there is enough GPU memory available for generation without enabling cache quantization. Try to find a balance between memory efficiency and latency.
|
||||
|
||||
Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`], and the quantization backend, as well as any additional quantization related parameters should also be passed either as a dict. You should use the default values for these additional parameters unless you're running out-of-memory. In that case, consider decreasing the residual length.
|
||||
Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`], and indicate the quantization backend in [`QuantizedCacheConfig`]. Any additional quantization related parameters should also be passed either as a dict or an instance of [`QuantizedCacheConfig`]. You should use the default values for these additional parameters unless you're running out-of-memory. In that case, consider decreasing the residual length.
|
||||
|
||||
<hfoptions id="quantized-cache">
|
||||
<hfoption id="HQQQuantizedCache">
|
||||
@ -144,14 +142,13 @@ Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [
|
||||
For [`HQQQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `1`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
|
||||
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
|
||||
|
||||
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"backend": "HQQ"})
|
||||
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"axis-key": 1, "axis-value": 1, "backend": "hqq"})
|
||||
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
|
||||
I like rock music because it's loud and energetic. It's a great way to express myself and rel
|
||||
```
|
||||
@ -162,14 +159,13 @@ I like rock music because it's loud and energetic. It's a great way to express m
|
||||
For [`QuantoQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `0`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
|
||||
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
|
||||
|
||||
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
|
||||
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "axis-key": 0, "axis-value": 0, "backend": "quanto"})
|
||||
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
|
||||
I like rock music because it's loud and energetic. It's a great way to express myself and rel
|
||||
```
|
||||
@ -211,14 +207,14 @@ import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map={"": 0})
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
|
||||
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
|
||||
|
||||
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="offloaded_static")
|
||||
tokenizer.batch_decode(out, skip_special_tokens=True)[0]
|
||||
"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
|
||||
```
|
||||
Cache offloading requires a CUDA GPU or Intel XPU.
|
||||
Cache offloading requires a CUDA GPU.
|
||||
|
||||
### Sliding window cache
|
||||
|
||||
@ -231,7 +227,7 @@ import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
||||
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16).to("cuda:0")
|
||||
inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device)
|
||||
|
||||
out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window")
|
||||
@ -277,6 +273,7 @@ from transformers.cache_utils import (
|
||||
StaticCache,
|
||||
SlidingWindowCache,
|
||||
QuantoQuantizedCache,
|
||||
QuantizedCacheConfig,
|
||||
)
|
||||
|
||||
model_id = "meta-llama/Llama-2-7b-chat-hf"
|
||||
@ -309,15 +306,15 @@ import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache
|
||||
|
||||
model_id = "meta-llama/Llama-2-7b-chat-hf"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map={"": 0})
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
# Init StaticCache with big enough max-length (1024 tokens for the below example)
|
||||
# You can also init a DynamicCache, if that suits you better
|
||||
prompt_cache = StaticCache(config=model.config, max_cache_len=1024)
|
||||
prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16)
|
||||
|
||||
INITIAL_PROMPT = "You are a helpful assistant. "
|
||||
inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to(model.device.type)
|
||||
inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda")
|
||||
# This is the common prompt cached, we need to run forward without grad to be able to copy
|
||||
with torch.no_grad():
|
||||
prompt_cache = model(**inputs_initial_prompt, past_key_values = prompt_cache).past_key_values
|
||||
@ -325,7 +322,7 @@ with torch.no_grad():
|
||||
prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"]
|
||||
responses = []
|
||||
for prompt in prompts:
|
||||
new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to(model.device.type)
|
||||
new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda")
|
||||
past_key_values = copy.deepcopy(prompt_cache)
|
||||
outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20)
|
||||
response = tokenizer.batch_decode(outputs)[0]
|
||||
|
@ -93,8 +93,11 @@ model.generation_config.max_new_tokens = 16
|
||||
|
||||
past_key_values = StaticCache(
|
||||
config=model.config,
|
||||
max_batch_size=1,
|
||||
# If you plan to reuse the cache, make sure the cache length is large enough for all cases
|
||||
max_cache_len=prompt_length+(model.generation_config.max_new_tokens*2),
|
||||
device=model.device,
|
||||
dtype=model.dtype
|
||||
)
|
||||
outputs = model.generate(**input_ids, past_key_values=past_key_values)
|
||||
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
|
||||
@ -114,9 +117,10 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
|
||||
Another option for using [`StaticCache`] is to pass it to a models forward pass using the same `past_key_values` argument. This allows you to write your own custom decoding function to decode the next token given the current token, position, and cache position of previously generated tokens.
|
||||
|
||||
```py
|
||||
from transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging, infer_device
|
||||
from transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging
|
||||
from transformers.testing_utils import CaptureLogger
|
||||
import torch
|
||||
from accelerate.test_utils.testing import get_backend
|
||||
|
||||
prompts = [
|
||||
"Simply put, the theory of relativity states that ",
|
||||
@ -124,7 +128,7 @@ prompts = [
|
||||
]
|
||||
|
||||
NUM_TOKENS_TO_GENERATE = 40
|
||||
torch_device = infer_device()
|
||||
torch_device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
|
||||
|
||||
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", pad_token="</s>", padding_side="right")
|
||||
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="sequential")
|
||||
@ -155,7 +159,7 @@ from torch.nn.attention import SDPBackend, sdpa_kernel
|
||||
batch_size, seq_length = inputs["input_ids"].shape
|
||||
with torch.no_grad():
|
||||
past_key_values = StaticCache(
|
||||
config=model.config, max_cache_len=4096
|
||||
config=model.config, max_batch_size=2, max_cache_len=4096, device=torch_device, dtype=model.dtype
|
||||
)
|
||||
cache_position = torch.arange(seq_length, device=torch_device)
|
||||
generated_ids = torch.zeros(
|
||||
@ -238,10 +242,11 @@ Enable speculative decoding by loading an assistant model and passing it to [`~G
|
||||
<hfoption id="greedy search">
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
import torch
|
||||
from accelerate.test_utils.testing import get_backend
|
||||
|
||||
device = infer_device()
|
||||
device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
|
||||
inputs = tokenizer("Einstein's theory of relativity states", return_tensors="pt").to(device)
|
||||
@ -259,10 +264,11 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
For speculative sampling decoding, add the [do_sample](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.do_sample) and [temperature](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.temperature) parameters to [`~GenerationMixin.generate`].
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
import torch
|
||||
from accelerate.test_utils.testing import get_backend
|
||||
|
||||
device = infer_device()
|
||||
device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
|
||||
inputs = tokenizer("Einstein's theory of relativity states", return_tensors="pt").to(device)
|
||||
@ -287,10 +293,11 @@ To enable prompt lookup decoding, specify the number of tokens that should be ov
|
||||
<hfoption id="greedy decoding">
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
import torch
|
||||
from accelerate.test_utils.testing import get_backend
|
||||
|
||||
device = infer_device()
|
||||
device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
|
||||
inputs = tokenizer("The second law of thermodynamics states", return_tensors="pt").to(device)
|
||||
@ -308,10 +315,11 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
|
||||
For prompt lookup decoding with sampling, add the [do_sample](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.do_sample) and [temperature](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.temperature) parameters to [`~GenerationMixin.generate`].
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
import torch
|
||||
from accelerate.test_utils.testing import get_backend
|
||||
|
||||
device = infer_device()
|
||||
device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
|
||||
inputs = tokenizer("The second law of thermodynamics states", return_tensors="pt").to(device)
|
||||
@ -333,7 +341,7 @@ A known issue with transformer models is that the self-attention mechanism grows
|
||||
|
||||
FlashAttention and [FlashAttention-2](./perf_infer_gpu_one#flashattention-2) break up the attention computation into smaller chunks and reduces the number of intermediate read/write operations to the GPU memory to speed up inference. FlashAttention-2 improves on the original FlashAttention algorithm by also parallelizing over sequence length dimension and better partitioning work on the hardware to reduce synchronization and communication overhead.
|
||||
|
||||
To use FlashAttention-2, set [attn_implementation](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.attn_implementation) to `"flash_attention_2"` in [`~PreTrainedModel.from_pretrained`] or set with `model.set_attention_implementation("flash_attention_2")` to dynamically update the [attention interface](./attention_interface) after the model is loaded.
|
||||
To use FlashAttention-2, set [attn_implementation](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.attn_implementation) to `"flash_attention_2"` in [`~PreTrainedModel.from_pretrained`].
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
|
||||
@ -345,14 +353,6 @@ model = AutoModelForCausalLM.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
attn_implementation="flash_attention_2",
|
||||
)
|
||||
|
||||
# Change the model's attention dynamically after loading
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"google/gemma-2b",
|
||||
quantization_config=quant_config,
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
model.set_attention_implementation("flash_attention_2")
|
||||
```
|
||||
|
||||
### PyTorch scaled dot product attention
|
||||
@ -360,7 +360,7 @@ model.set_attention_implementation("flash_attention_2")
|
||||
Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and it supports FlashAttention, xFormers, and PyTorch's C++ implementation. SDPA chooses the most performant attention algorithm if you're using a CUDA backend. For other backends, SDPA defaults to the PyTorch C++ implementation.
|
||||
|
||||
> [!TIP]
|
||||
> SDPA automatically supports FlashAttention-2 as long as you have the latest PyTorch version installed.
|
||||
> SDPA automaticallysupports FlashAttention-2 as long as you have the latest PyTorch version installed.
|
||||
|
||||
Use the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to explicitly enable or disable any of the four attention algorithms. For example, use `SDPBackend.FLASH_ATTENTION` to enable FlashAttention.
|
||||
|
||||
|
@ -148,9 +148,9 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
|
||||
| Option name | Type | Simplified description |
|
||||
|---|---|---|
|
||||
| `max_new_tokens` | `int` | Controls the maximum generation length. Be sure to define it, as it usually defaults to a small value. |
|
||||
| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies) for more information. |
|
||||
| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies.md) for more information. |
|
||||
| `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
|
||||
| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies) for more information. |
|
||||
| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
|
||||
| `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
|
||||
| `eos_token_id` | `list[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
|
||||
|
||||
|
@ -23,11 +23,11 @@ The crux of these challenges lies in augmenting the computational and memory cap
|
||||
|
||||
In this guide, we will go over the effective techniques for efficient LLM deployment:
|
||||
|
||||
1. **Lower Precision:** Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](./main_classes/quantization) can achieve computational advantages without a considerable decline in model performance.
|
||||
1. **Lower Precision:** Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](./main_classes/quantization.md) can achieve computational advantages without a considerable decline in model performance.
|
||||
|
||||
2. **Flash Attention:** Flash Attention is a variation of the attention algorithm that not only provides a more memory-efficient approach but also realizes increased efficiency due to optimized GPU memory utilization.
|
||||
|
||||
3. **Architectural Innovations:** Considering that LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. The most important advancement in model architectures hereby are [Alibi](https://huggingface.co/papers/2108.12409), [Rotary embeddings](https://huggingface.co/papers/2104.09864), [Multi-Query Attention (MQA)](https://huggingface.co/papers/1911.02150) and [Grouped-Query-Attention (GQA)](https://huggingface.co/papers/2305.13245).
|
||||
3. **Architectural Innovations:** Considering that LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. The most important advancement in model architectures hereby are [Alibi](https://huggingface.co/papers/2108.12409), [Rotary embeddings](https://huggingface.co/papers/2104.09864), [Multi-Query Attention (MQA)](https://huggingface.co/papers/1911.02150) and [Grouped-Query-Attention (GQA)]((https://huggingface.co/papers/2305.13245)).
|
||||
|
||||
Throughout this guide, we will offer an analysis of auto-regressive generation from a tensor's perspective. We delve into the pros and cons of adopting lower precision, provide a comprehensive exploration of the latest attention algorithms, and discuss improved LLM architectures. While doing so, we run practical examples showcasing each of the feature improvements.
|
||||
|
||||
|
@ -33,7 +33,6 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
|
||||
it's the second one).
|
||||
- [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
|
||||
or tensorboardX).
|
||||
- [`~integrations.TrackioCallback`] if [trackio](https://github.com/gradio-app/trackio) is installed.
|
||||
- [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
|
||||
- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
|
||||
- [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
|
||||
@ -73,9 +72,6 @@ Here is the list of the available [`TrainerCallback`] in the library:
|
||||
|
||||
[[autodoc]] integrations.TensorBoardCallback
|
||||
|
||||
[[autodoc]] integrations.TrackioCallback
|
||||
- setup
|
||||
|
||||
[[autodoc]] integrations.WandbCallback
|
||||
- setup
|
||||
|
||||
|
@ -65,10 +65,6 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
|
||||
|
||||
[[autodoc]] HqqConfig
|
||||
|
||||
## Mxfp4Config
|
||||
|
||||
[[autodoc]] Mxfp4Config
|
||||
|
||||
## FbgemmFp8Config
|
||||
|
||||
[[autodoc]] FbgemmFp8Config
|
||||
@ -97,10 +93,6 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
|
||||
|
||||
[[autodoc]] QuarkConfig
|
||||
|
||||
## FPQuantConfig
|
||||
|
||||
[[autodoc]] FPQuantConfig
|
||||
|
||||
## AutoRoundConfig
|
||||
|
||||
[[autodoc]] AutoRoundConfig
|
||||
|
@ -1,105 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2024-11-21 and added to Hugging Face Transformers on 2025-07-08.*
|
||||
|
||||
# AIMv2
|
||||
|
||||
## Overview
|
||||
|
||||
The AIMv2 model was proposed in [Multimodal Autoregressive Pre-training of Large Vision Encoders](https://huggingface.co/papers/2411.14402) by Enrico Fini, Mustafa Shukor, Xiujun Li, Philipp Dufter, Michal Klein, David Haldimann, Sai Aitharaju, Victor Guilherme Turrisi da Costa, Louis Béthune, Zhe Gan, Alexander T Toshev, Marcin Eichner, Moin Nabi, Yinfei Yang, Joshua M. Susskind, Alaaeldin El-Nouby.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*We introduce a novel method for pre-training of large-scale vision encoders. Building on recent advancements in autoregressive pre-training of vision models, we extend this framework to a multimodal setting, i.e., images and text. In this paper, we present AIMV2, a family of generalist vision encoders characterized by a straightforward pre-training process, scalability, and remarkable performance across a range of downstream tasks. This is achieved by pairing the vision encoder with a multimodal decoder that autoregressively generates raw image patches and text tokens. Our encoders excel not only in multimodal evaluations but also in vision benchmarks such as localization, grounding, and classification. Notably, our AIMV2-3B encoder achieves 89.5% accuracy on ImageNet-1k with a frozen trunk. Furthermore, AIMV2 consistently outperforms state-of-the-art contrastive models (e.g., CLIP, SigLIP) in multimodal image understanding across diverse settings.*
|
||||
|
||||
|
||||
This model was contributed by [Yaswanth Gali](https://huggingface.co/yaswanthgali).
|
||||
The original code can be found [here](https://github.com/apple/ml-aim).
|
||||
|
||||
## Usage Example
|
||||
|
||||
Here is an example of Image Feature Extraction using specific checkpoints on resized images and native resolution images:
|
||||
|
||||
```python
|
||||
import requests
|
||||
from PIL import Image
|
||||
from transformers import AutoImageProcessor, AutoModel
|
||||
|
||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
processor = AutoImageProcessor.from_pretrained("apple/aimv2-large-patch14-native")
|
||||
model = AutoModel.from_pretrained("apple/aimv2-large-patch14-native")
|
||||
|
||||
inputs = processor(images=image, return_tensors="pt")
|
||||
outputs = model(**inputs)
|
||||
```
|
||||
|
||||
Here is an example of a checkpoint performing zero-shot classification:
|
||||
|
||||
```python
|
||||
import requests
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModel
|
||||
|
||||
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
image = Image.open(requests.get(url, stream=True).raw)
|
||||
text = ["Picture of a dog.", "Picture of a cat.", "Picture of a horse."]
|
||||
|
||||
processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")
|
||||
model = AutoModel.from_pretrained("apple/aimv2-large-patch14-224-lit")
|
||||
|
||||
inputs = processor(
|
||||
images=image,
|
||||
text=text,
|
||||
add_special_tokens=True,
|
||||
truncation=True,
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
outputs = model(**inputs)
|
||||
probs = outputs.logits_per_image.softmax(dim=-1)
|
||||
```
|
||||
|
||||
## Aimv2Config
|
||||
|
||||
[[autodoc]] Aimv2Config
|
||||
|
||||
## Aimv2TextConfig
|
||||
|
||||
[[autodoc]] Aimv2TextConfig
|
||||
|
||||
## Aimv2VisionConfig
|
||||
|
||||
[[autodoc]] Aimv2VisionConfig
|
||||
|
||||
## Aimv2Model
|
||||
|
||||
[[autodoc]] Aimv2Model
|
||||
- forward
|
||||
|
||||
## Aimv2VisionModel
|
||||
|
||||
[[autodoc]] Aimv2VisionModel
|
||||
- forward
|
||||
|
||||
## Aimv2TextModel
|
||||
|
||||
[[autodoc]] Aimv2TextModel
|
||||
- forward
|
||||
|
||||
</pt>
|
||||
<tf>
|
@ -13,7 +13,6 @@ specific language governing permissions and limitations under the License.
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2019-09-26 and added to Hugging Face Transformers on 2020-11-16.*
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
|
@ -13,7 +13,6 @@ specific language governing permissions and limitations under the License.
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2021-02-11 and added to Hugging Face Transformers on 2023-03-01.*
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
@ -66,18 +65,18 @@ from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification
|
||||
|
||||
processor = AutoProcessor.from_pretrained("kakaobrain/align-base")
|
||||
model = AutoModelForZeroShotImageClassification.from_pretrained("kakaobrain/align-base", device_map="auto")
|
||||
model = AutoModelForZeroShotImageClassification.from_pretrained("kakaobrain/align-base").to("cuda")
|
||||
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
||||
image = requests.get(url, stream=True)
|
||||
inputs = Image.open(image.raw).convert("RGB")
|
||||
|
||||
image_inputs = processor(images=inputs, return_tensors="pt").to(model.device)
|
||||
image_inputs = processor(images=inputs, return_tensors="pt").to("cuda")
|
||||
with torch.no_grad():
|
||||
image_embeds = model.get_image_features(**image_inputs)
|
||||
|
||||
candidate_labels = ["a photo of a dog", "a photo of a cat", "a photo of a person"]
|
||||
text_inputs = processor(text=candidate_labels, padding=True, return_tensors="pt").to(model.device)
|
||||
text_inputs = processor(text=candidate_labels, padding=True, return_tensors="pt").to("cuda")
|
||||
with torch.no_grad():
|
||||
text_embeds = model.get_text_features(**text_inputs)
|
||||
|
||||
|
@ -13,7 +13,6 @@ specific language governing permissions and limitations under the License.
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2022-11-12 and added to Hugging Face Transformers on 2023-01-04.*
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
|
@ -1,105 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2025-06-18 and added to Hugging Face Transformers on 2025-06-24.*
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
|
||||
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# Arcee
|
||||
|
||||
[Arcee](https://www.arcee.ai/blog/deep-dive-afm-4-5b-the-first-arcee-foundational-model) is a decoder-only transformer model based on the Llama architecture with a key modification: it uses ReLU² (ReLU-squared) activation in the MLP blocks instead of SiLU, following recent research showing improved training efficiency with squared activations. This architecture is designed for efficient training and inference while maintaining the proven stability of the Llama design.
|
||||
|
||||
The Arcee model is architecturally similar to Llama but uses `x * relu(x)` in MLP layers for improved gradient flow and is optimized for efficiency in both training and inference scenarios.
|
||||
|
||||
> [!TIP]
|
||||
> The Arcee model supports extended context with RoPE scaling and all standard transformers features including Flash Attention 2, SDPA, gradient checkpointing, and quantization support.
|
||||
|
||||
The example below demonstrates how to generate text with Arcee using [`Pipeline`] or the [`AutoModel`].
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(
|
||||
task="text-generation",
|
||||
model="arcee-ai/AFM-4.5B",
|
||||
torch_dtype=torch.float16,
|
||||
device=0
|
||||
)
|
||||
|
||||
output = pipeline("The key innovation in Arcee is")
|
||||
print(output[0]["generated_text"])
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoTokenizer, ArceeForCausalLM
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("arcee-ai/AFM-4.5B")
|
||||
model = ArceeForCausalLM.from_pretrained(
|
||||
"arcee-ai/AFM-4.5B",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
inputs = tokenizer("The key innovation in Arcee is", return_tensors="pt")
|
||||
with torch.no_grad():
|
||||
outputs = model.generate(**inputs, max_new_tokens=50)
|
||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## ArceeConfig
|
||||
|
||||
[[autodoc]] ArceeConfig
|
||||
|
||||
## ArceeModel
|
||||
|
||||
[[autodoc]] ArceeModel
|
||||
- forward
|
||||
|
||||
## ArceeForCausalLM
|
||||
|
||||
[[autodoc]] ArceeForCausalLM
|
||||
- forward
|
||||
|
||||
## ArceeForSequenceClassification
|
||||
|
||||
[[autodoc]] ArceeForSequenceClassification
|
||||
- forward
|
||||
|
||||
## ArceeForQuestionAnswering
|
||||
|
||||
[[autodoc]] ArceeForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## ArceeForTokenClassification
|
||||
|
||||
[[autodoc]] ArceeForTokenClassification
|
||||
- forward
|
@ -13,7 +13,6 @@ specific language governing permissions and limitations under the License.
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2024-10-08 and added to Hugging Face Transformers on 2024-12-06.*
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
|
@ -13,7 +13,6 @@ specific language governing permissions and limitations under the License.
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2021-04-05 and added to Hugging Face Transformers on 2022-11-21.*
|
||||
|
||||
# Audio Spectrogram Transformer
|
||||
|
||||
|
@ -258,10 +258,6 @@ The following auto classes are available for the following computer vision tasks
|
||||
|
||||
[[autodoc]] AutoModelForKeypointDetection
|
||||
|
||||
### AutoModelForKeypointMatching
|
||||
|
||||
[[autodoc]] AutoModelForKeypointMatching
|
||||
|
||||
### AutoModelForMaskedImageModeling
|
||||
|
||||
[[autodoc]] AutoModelForMaskedImageModeling
|
||||
@ -354,10 +350,6 @@ The following auto classes are available for the following audio tasks.
|
||||
|
||||
[[autodoc]] AutoModelForTextToWaveform
|
||||
|
||||
### AutoModelForAudioTokenization
|
||||
|
||||
[[autodoc]] AutoModelForAudioTokenization
|
||||
|
||||
## Multimodal
|
||||
|
||||
The following auto classes are available for the following multimodal tasks.
|
||||
|
@ -13,7 +13,6 @@ specific language governing permissions and limitations under the License.
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2021-06-24 and added to Hugging Face Transformers on 2023-05-30.*
|
||||
|
||||
# Autoformer
|
||||
|
||||
|
@ -13,7 +13,6 @@ specific language governing permissions and limitations under the License.
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2025-05-13 and added to Hugging Face Transformers on 2025-03-04.*
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
@ -60,8 +59,8 @@ print(outputs)
|
||||
|
||||
```python
|
||||
# pip install 'git+https://github.com/huggingface/transformers.git@v4.49.0-Aya Vision'
|
||||
import torch
|
||||
from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
import torch
|
||||
|
||||
model_id = "CohereLabs/aya-vision-8b"
|
||||
|
||||
@ -133,7 +132,7 @@ inputs = processor.apply_chat_template(
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_tensors="pt"
|
||||
).to(model.device)
|
||||
).to("cuda")
|
||||
|
||||
generated = model.generate(**inputs, max_new_tokens=50)
|
||||
print(processor.tokenizer.decode(generated[0], skip_special_tokens=True))
|
||||
@ -148,12 +147,12 @@ print(processor.tokenizer.decode(generated[0], skip_special_tokens=True))
|
||||
- The example below demonstrates inference with multiple images.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
import torch
|
||||
|
||||
processor = AutoProcessor.from_pretrained("CohereForAI/aya-vision-8b")
|
||||
model = AutoModelForImageTextToText.from_pretrained(
|
||||
"CohereForAI/aya-vision-8b", device_map="auto", torch_dtype=torch.float16
|
||||
"CohereForAI/aya-vision-8b", device_map="cuda", torch_dtype=torch.float16
|
||||
)
|
||||
|
||||
messages = [
|
||||
@ -178,7 +177,7 @@ print(processor.tokenizer.decode(generated[0], skip_special_tokens=True))
|
||||
|
||||
inputs = processor.apply_chat_template(
|
||||
messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
|
||||
).to(model.device)
|
||||
).to("cuda")
|
||||
|
||||
gen_tokens = model.generate(
|
||||
**inputs,
|
||||
@ -194,12 +193,12 @@ print(processor.tokenizer.decode(generated[0], skip_special_tokens=True))
|
||||
- The example below demonstrates inference with batched inputs.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
import torch
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_id)
|
||||
model = AutoModelForImageTextToText.from_pretrained(
|
||||
"CohereForAI/aya-vision-8b", device_map="auto", torch_dtype=torch.float16
|
||||
"CohereForAI/aya-vision-8b", device_map="cuda", torch_dtype=torch.float16
|
||||
)
|
||||
|
||||
batch_messages = [
|
||||
|
@ -13,129 +13,85 @@ specific language governing permissions and limitations under the License.
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2024-12-18 and added to Hugging Face Transformers on 2024-12-19.*
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
|
||||
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# Bamba
|
||||
|
||||
[Bamba](https://huggingface.co/blog/bamba) is a 9B parameter decoder-only language model built on the [Mamba-2](./mamba2) architecture. It is pretrained in two stages - it starts by training on 2T tokens from the [Dolma v1.7](https://huggingface.co/datasets/allenai/dolma) dataset and then trained on an additional 200B tokens from [FineWeb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) and [Cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia).
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
|
||||
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
|
||||
You can find all the original Bamba checkpoints under the [Bamba](https://huggingface.co/collections/ibm-ai-platform/bamba-674f1388b9bbc98b413c7bab) collection.
|
||||
## Overview
|
||||
|
||||
> [!TIP]
|
||||
> This model was contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim).
|
||||
>
|
||||
> Click on the Bamba models in the right sidebar for more examples of how to apply Bamba to different text generation tasks.
|
||||
Bamba-9B is a decoder-only language model based on the [Mamba-2](https://github.com/state-spaces/mamba) architecture and is designed to handle a wide range of text generation tasks. It is trained from scratch using a two-stage training approach. In the first stage, the model is trained on 2 trillion tokens from the Dolma v1.7 dataset. In the second stage, it undergoes additional training on 200 billion tokens, leveraging a carefully curated blend of high-quality data to further refine its performance and enhance output quality.
|
||||
|
||||
The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(
|
||||
task="text-generation",
|
||||
model="ibm-ai-platform/Bamba-9B-v2",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device=0
|
||||
)
|
||||
pipeline("Plants create energy through a process known as")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("ibm-ai-platform/Bamba-9B-v2")
|
||||
model = AutoModelForCausalLM.from_pretrained("ibm-ai-platform/Bamba-9B-v2", torch_dtype=torch.bfloat16, device_map="auto", attn_implementation="sdpa")
|
||||
input_ids = tokenizer("Plants create energy through a process known as", return_tensors="pt").to(model.device)
|
||||
|
||||
output = model.generate(**input_ids)
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
<hfoption id="transformers CLI">
|
||||
```bash
|
||||
echo "Plants create energy through a process known as" | transformers-cli run --task text-generation --model ibm-ai-platform/Bamba-9B-v2 --device 0
|
||||
```
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
|
||||
|
||||
The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
|
||||
|
||||
quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
|
||||
tokenizer = AutoTokenizer.from_pretrained("ibm-ai-platform/Bamba-9B-v2")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"ibm-ai-platform/Bamba-9B-v2",
|
||||
quantization_config=quantization_config,
|
||||
device_map="auto",
|
||||
attn_implementation="sdpa"
|
||||
)
|
||||
|
||||
inputs = tokenizer("Plants create energy through a process known as", return_tensors="pt").to(model.device)
|
||||
output = model.generate(**inputs)
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- Bamba supports padding-free training which concatenates distinct training examples while still processing inputs as separate batches. It can significantly accelerate inference by [~2x](https://github.com/huggingface/transformers/pull/35861#issue-2807873129) (depending on model and data distribution) and reduce memory-usage if there are examples of varying lengths by avoiding unnecessary compute and memory overhead from padding tokens.
|
||||
|
||||
Padding-free training requires the `flash-attn`, `mamba-ssm`, and `causal-conv1d` packages and the following arguments must be passed to the model in addition to `input_ids` and `labels`.
|
||||
|
||||
- `position_ids: torch.LongTensor`: the position index of each token in each sequence.
|
||||
- `seq_idx: torch.IntTensor`: the index of each sequence in the batch.
|
||||
- Each of the [`FlashAttentionKwargs`]
|
||||
- `cu_seq_lens_q: torch.LongTensor`: the cumulative sequence lengths of all queries.
|
||||
- `cu_seq_lens_k: torch.LongTensor`: the cumulative sequence lengths of all keys.
|
||||
- `max_length_q: int`: the longest query length in the batch.
|
||||
- `max_length_k: int`: the longest key length in the batch.
|
||||
|
||||
The `attention_mask` inputs should not be provided. The [`DataCollatorWithFlattening`] programmatically generates the set of additional arguments above using `return_seq_idx=True` and `return_flash_attn_kwargs=True`. See the [Improving Hugging Face Training Efficiency Through Packing with Flash Attention](https://huggingface.co/blog/packing-with-FA2) blog post for additional information.
|
||||
|
||||
```python
|
||||
from transformers import DataCollatorWithFlattening
|
||||
|
||||
# Example of using padding-free training
|
||||
data_collator = DataCollatorWithFlattening(
|
||||
tokenizer=tokenizer,
|
||||
return_seq_idx=True,
|
||||
return_flash_attn_kwargs=True
|
||||
)
|
||||
```
|
||||
Checkout all Bamba-9B model checkpoints [here](https://github.com/foundation-model-stack/bamba).
|
||||
|
||||
## BambaConfig
|
||||
|
||||
| Model | Params | # Layers | Hidden Dim. | Attention Heads | GQA | KV Heads | Context Length | Tied Embeddings |
|
||||
|-------------------|--------------|----------|-------------|-----------------|-----|----------|----------------|------------------|
|
||||
| Bamba | 9B (9.78B) | 32 | 4096 | 32 | Yes | 8 | 4096 | True |
|
||||
|
||||
[[autodoc]] BambaConfig
|
||||
|
||||
<!---
|
||||
## Usage Tips
|
||||
|
||||
Tips:
|
||||
|
||||
- The architecture is based on Mamba-2 models.
|
||||
|
||||
## BambaModel
|
||||
|
||||
[[autodoc]] BambaModel
|
||||
- forward
|
||||
-->
|
||||
|
||||
## BambaForCausalLM
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("ibm-fms/Bamba-9B")
|
||||
tokenizer = AutoTokenizer.from_pretrained("ibm-fms/Bamba-9B")
|
||||
|
||||
message = ["Mamba is a snake with following properties "]
|
||||
inputs = tokenizer(message, return_tensors='pt', return_token_type_ids=False)
|
||||
response = model.generate(**inputs, max_new_tokens=64)
|
||||
print(tokenizer.batch_decode(response, skip_special_tokens=True)[0])
|
||||
```
|
||||
|
||||
|
||||
## Padding-Free Training
|
||||
|
||||
Bamba supports padding-free training in which distinct training examples can be concatenated
|
||||
together while nevertheless processing the inputs as though they belonged to separate batches. When
|
||||
the examples are of varying lengths, padding-free training can provide significant speed ups and
|
||||
memory savings compared to batching the examples together and using padding, as the unnecessary
|
||||
compute and memory due to padding is avoided entirely. The performance gains depend on factors such
|
||||
as the model and the data distribution, but throughput gains up to [~2x are commonly
|
||||
seen](https://github.com/huggingface/transformers/pull/35861#issue-2807873129).
|
||||
|
||||
Using padding-free training with Bamba requires the `flash-attn`, `mamba-ssm`, and `causal-conv1d`
|
||||
packages, and the following arguments must be passed to the model in addition to `input_ids` and
|
||||
`labels`:
|
||||
* `position_ids: torch.LongTensor`: the position index of each token in each sequence.
|
||||
* `seq_idx: torch.IntTensor`: the index of each sequence in the batch.
|
||||
* Each of the [`FlashAttentionKwargs`]
|
||||
* `cu_seq_lens_q: torch.LongTensor`: The cumulative sequence lengths of all queries.
|
||||
* `cu_seq_lens_k: torch.LongTensor`: The cumulative sequence lengths of all keys.
|
||||
* `max_length_q: int`: the longest query length in the batch.
|
||||
* `max_length_k: int`: the longest key length in the batch.
|
||||
|
||||
The `attention_mask` inputs should not be provided. The [`DataCollatorWithFlattening`] can be used
|
||||
to programmatically generate the above set of additional arguments using `return_seq_idx=True` and
|
||||
`return_flash_attn_kwargs=True`. See [this blog post](https://huggingface.co/blog/packing-with-FA2)
|
||||
for additional information.
|
||||
|
||||
|
||||
[[autodoc]] BambaForCausalLM
|
||||
- forward
|
||||
|
||||
This HF implementation is contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim).
|
||||
|
@ -9,7 +9,6 @@ Unless required by applicable law or agreed to in writing, software distributed
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
*This model was released on 2023-04-09 and added to Hugging Face Transformers on 2023-07-17.*
|
||||
|
||||
# Bark
|
||||
|
||||
@ -20,7 +19,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
## Overview
|
||||
|
||||
[Bark](https://huggingface.co/suno/bark) is a transformer-based text-to-speech model proposed by Suno AI in [suno-ai/bark](https://github.com/suno-ai/bark).
|
||||
Bark is a transformer-based text-to-speech model proposed by Suno AI in [suno-ai/bark](https://github.com/suno-ai/bark).
|
||||
|
||||
Bark is made of 4 main models:
|
||||
|
||||
@ -43,10 +42,10 @@ Bark can be optimized with just a few extra lines of code, which **significantly
|
||||
You can speed up inference and reduce memory footprint by 50% simply by loading the model in half-precision.
|
||||
|
||||
```python
|
||||
from transformers import BarkModel, infer_device
|
||||
from transformers import BarkModel
|
||||
import torch
|
||||
|
||||
device = infer_device()
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
|
||||
```
|
||||
|
||||
@ -54,7 +53,7 @@ model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).
|
||||
|
||||
As mentioned above, Bark is made up of 4 sub-models, which are called up sequentially during audio generation. In other words, while one sub-model is in use, the other sub-models are idle.
|
||||
|
||||
If you're using a CUDA GPU or Intel XPU, a simple solution to benefit from an 80% reduction in memory footprint is to offload the submodels from device to CPU when they're idle. This operation is called *CPU offloading*. You can use it with one line of code as follows:
|
||||
If you're using a CUDA device, a simple solution to benefit from an 80% reduction in memory footprint is to offload the submodels from GPU to CPU when they're idle. This operation is called *CPU offloading*. You can use it with one line of code as follows:
|
||||
|
||||
```python
|
||||
model.enable_cpu_offload()
|
||||
@ -114,10 +113,10 @@ At batch size 8, on an NVIDIA A100, Flash Attention 2 is also 10% faster than Be
|
||||
You can combine optimization techniques, and use CPU offload, half-precision and Flash Attention 2 (or 🤗 Better Transformer) all at once.
|
||||
|
||||
```python
|
||||
from transformers import BarkModel, infer_device
|
||||
from transformers import BarkModel
|
||||
import torch
|
||||
|
||||
device = infer_device()
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
# load in fp16 and use Flash Attention 2
|
||||
model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device)
|
||||
|
@ -13,7 +13,6 @@ specific language governing permissions and limitations under the License.
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2019-10-29 and added to Hugging Face Transformers on 2020-11-16.*
|
||||
|
||||
|
||||
<div style="float: right;">
|
||||
@ -65,7 +64,7 @@ model = AutoModelForMaskedLM.from_pretrained(
|
||||
device_map="auto",
|
||||
attn_implementation="sdpa"
|
||||
)
|
||||
inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to(model.device)
|
||||
inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to("cuda")
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs)
|
||||
|
@ -13,83 +13,50 @@ specific language governing permissions and limitations under the License.
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
*This model was released on 2020-10-23 and added to Hugging Face Transformers on 2020-11-27.*
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
|
||||
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=
|
||||
">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# BARThez
|
||||
|
||||
[BARThez](https://huggingface.co/papers/2010.12321) is a [BART](./bart) model designed for French language tasks. Unlike existing French BERT models, BARThez includes a pretrained encoder-decoder, allowing it to generate text as well. This model is also available as a multilingual variant, mBARThez, by continuing pretraining multilingual BART on a French corpus.
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
|
||||
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=
|
||||
">
|
||||
</div>
|
||||
|
||||
You can find all of the original BARThez checkpoints under the [BARThez](https://huggingface.co/collections/dascim/barthez-670920b569a07aa53e3b6887) collection.
|
||||
## Overview
|
||||
|
||||
> [!TIP]
|
||||
> This model was contributed by [moussakam](https://huggingface.co/moussakam).
|
||||
> Refer to the [BART](./bart) docs for more usage examples.
|
||||
The BARThez model was proposed in [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://huggingface.co/papers/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis on 23 Oct,
|
||||
2020.
|
||||
|
||||
The abstract of the paper:
|
||||
|
||||
|
||||
The example below demonstrates how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.
|
||||
*Inductive transfer learning, enabled by self-supervised learning, have taken the entire Natural Language Processing
|
||||
(NLP) field by storm, with models such as BERT and BART setting new state of the art on countless natural language
|
||||
understanding tasks. While there are some notable exceptions, most of the available models and research have been
|
||||
conducted for the English language. In this work, we introduce BARThez, the first BART model for the French language
|
||||
(to the best of our knowledge). BARThez was pretrained on a very large monolingual French corpus from past research
|
||||
that we adapted to suit BART's perturbation schemes. Unlike already existing BERT-based French language models such as
|
||||
CamemBERT and FlauBERT, BARThez is particularly well-suited for generative tasks, since not only its encoder but also
|
||||
its decoder is pretrained. In addition to discriminative tasks from the FLUE benchmark, we evaluate BARThez on a novel
|
||||
summarization dataset, OrangeSum, that we release with this paper. We also continue the pretraining of an already
|
||||
pretrained multilingual BART on BARThez's corpus, and we show that the resulting model, which we call mBARTHez,
|
||||
provides a significant boost over vanilla BARThez, and is on par with or outperforms CamemBERT and FlauBERT.*
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
This model was contributed by [moussakam](https://huggingface.co/moussakam). The Authors' code can be found [here](https://github.com/moussaKam/BARThez).
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
<Tip>
|
||||
|
||||
pipeline = pipeline(
|
||||
task="fill-mask",
|
||||
model="moussaKam/barthez",
|
||||
torch_dtype=torch.float16,
|
||||
device=0
|
||||
)
|
||||
pipeline("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.")
|
||||
```
|
||||
BARThez implementation is the same as BART, except for tokenization. Refer to [BART documentation](bart) for information on
|
||||
configuration classes and their parameters. BARThez-specific tokenizers are documented below.
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
</Tip>
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||
## Resources
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
"moussaKam/barthez",
|
||||
)
|
||||
model = AutoModelForMaskedLM.from_pretrained(
|
||||
"moussaKam/barthez",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto",
|
||||
)
|
||||
inputs = tokenizer("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.", return_tensors="pt").to(model.device)
|
||||
- BARThez can be fine-tuned on sequence-to-sequence tasks in a similar way as BART, check:
|
||||
[examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md).
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs)
|
||||
predictions = outputs.logits
|
||||
|
||||
masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
|
||||
predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
|
||||
predicted_token = tokenizer.decode(predicted_token_id)
|
||||
|
||||
print(f"The predicted token is: {predicted_token}")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "Les plantes produisent <mask> grâce à un processus appelé photosynthèse." | transformers run --task fill-mask --model moussaKam/barthez --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## BarthezTokenizer
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user