Compare commits

..

16 Commits

876 changed files with 24313 additions and 14070 deletions

View File

@ -22,6 +22,7 @@ tests/generation/ @gante
/src/transformers/models/auto/ @ArthurZucker
/src/transformers/utils/ @ArthurZucker @Rocketknight1
/src/transformers/loss/ @ArthurZucker
/src/transformers/onnx/ @michaelbenayoun
# Specific files come after the sections/globs, so they take priority
/.circleci/config.yml @ArthurZucker @ydshieh

View File

@ -28,19 +28,20 @@ jobs:
(github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark') )||
(github.event_name == 'push' && github.ref == 'refs/heads/main')
container:
image: huggingface/transformers-all-latest-gpu
image: huggingface/transformers-pytorch-gpu
options: --gpus all --privileged --ipc host
steps:
- name: Get repo
uses: actions/checkout@v5
uses: actions/checkout@v4
with:
fetch-depth: 1
ref: ${{ github.event.pull_request.head.sha || github.sha }}
- name: Install benchmark script dependencies
run: python3 -m pip install -r benchmark_v2/requirements.txt kernels
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]"
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]" && python3 -m pip uninstall -y torchvision # temp fix
- name: Run benchmark
run: |
@ -51,7 +52,7 @@ jobs:
commit_id=$GITHUB_SHA
fi
commit_msg=$(git show -s --format=%s | cut -c1-70)
python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --level 2 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
env:
HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
PUSH_TO_HUB_TOKEN: ${{ secrets.PUSH_TO_HUB_TOKEN }}

View File

@ -9,7 +9,7 @@ jobs:
uses: ./.github/workflows/benchmark_v2.yml
with:
runner: aws-g5-4xlarge-cache-use1-public-80
container_image: huggingface/transformers-all-latest-gpu
container_image: huggingface/transformers-pytorch-gpu
container_options: --gpus all --privileged --ipc host --shm-size "16gb"
commit_sha: ${{ github.sha }}
run_id: ${{ github.run_id }}

View File

@ -45,59 +45,33 @@ jobs:
REF=main
push: true
tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-all-latest-gpu docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
flash-attn-ci-image:
name: "PyTorch with Flash Attn [dev]"
runs-on:
group: aws-general-8-plus
steps:
# Push CI images still need to be re-built daily
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
name: Build and push (for Push CI) in a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The later case is useful for manual image building for debugging purpose. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-all-latest-gpu
build-args: |
REF=main
PYTORCH=2.8.0
TORCHCODEC=0.7.0
FLASH_ATTN=yes
push: true
tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}:flash-attn
tags: huggingface/transformers-all-latest-gpu-push-ci
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-all-latest-gpu docker build
title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-torch-deepspeed-docker:
name: "Latest PyTorch + DeepSpeed"
runs-on:
group: aws-general-8-plus
group: aws-g4dn-2xlarge-cache
steps:
-
name: Set up Docker Buildx
@ -130,8 +104,51 @@ jobs:
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
latest-torch-deepspeed-docker-for-push-ci-daily-build:
name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) in a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The later case is useful for manual image building for debugging purpose. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
doc-builder:
name: "Doc builder"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
@ -164,6 +181,44 @@ jobs:
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-pytorch:
name: "Latest PyTorch [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-gpu
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-pytorch-amd:
name: "Latest PyTorch (AMD) [dev]"
runs-on:
@ -190,47 +245,29 @@ jobs:
REF=main
push: true
tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) in a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The later case is useful for manual image building for debugging purpose. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-amd-gpu-push-ci
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu build
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
cache-latest-pytorch-amd:
name: "Cache Latest Pytorch (AMD) Image"
needs: latest-pytorch-amd
runs-on:
group: amd-mi325-1gpu
steps:
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Pull and save docker image to cache
run: |
image="huggingface/transformers-pytorch-amd-gpu"
final_path="/mnt/image-cache/transformers-pytorch-amd-gpu.tar"
tmp_path="${final_path}.tmp"
echo "Pulling image: ${image}"
docker pull "${image}"
echo "Saving to temp file: ${tmp_path}"
docker save "${image}" -o "${tmp_path}"
echo "Moving to final path: ${final_path}"
mv -f "${tmp_path}" "${final_path}"
echo "Cache populated successfully at ${final_path}"
latest-pytorch-deepspeed-amd:
name: "PyTorch + DeepSpeed (AMD) [dev]"
runs-on:
@ -257,6 +294,19 @@ jobs:
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) in a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The later case is useful for manual image building for debugging purpose. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
- name: Post to Slack
if: always()
@ -269,6 +319,8 @@ jobs:
latest-quantization-torch-docker:
name: "Latest Pytorch + Quantization [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:

View File

@ -1,23 +0,0 @@
---
name: Check Permissions Advisor
on:
workflow_dispatch:
inputs:
workflow_name:
description: 'Workflow file name'
type: string
run_count:
description: 'Number of runs to analyze'
type: string
default: "10"
jobs:
advisor:
uses: huggingface/security-workflows/.github/workflows/permissions-advisor-reusable.yml@main
permissions:
actions: read
contents: read
with:
workflow_name: ${{ inputs.workflow_name }}
run_count: ${{ fromJSON(inputs.run_count) }}

View File

@ -6,6 +6,9 @@ on:
docker:
required: true
type: string
start_sha:
required: true
type: string
job:
required: true
type: string
@ -21,13 +24,7 @@ on:
commit_sha:
required: false
type: string
pr_number:
required: false
type: string
outputs:
report:
description: "Content of the report of new failures"
value: ${{ jobs.process_new_failures_with_commit_info.outputs.report }}
env:
HF_HOME: /mnt/cache
@ -64,15 +61,13 @@ jobs:
- name: Check file
id: check_file
working-directory: /transformers
env:
job: ${{ inputs.job }}
run: |
if [ -f "ci_results_${job}/new_failures.json" ]; then
echo "\`ci_results_${job}/new_failures.json\` exists, continue ..."
if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then
echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..."
echo "process=true" >> $GITHUB_ENV
echo "process=true" >> $GITHUB_OUTPUT
else
echo "\`ci_results_${job}/new_failures.json\` doesn't exist, abort."
echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort."
echo "process=false" >> $GITHUB_ENV
echo "process=false" >> $GITHUB_OUTPUT
fi
@ -93,62 +88,27 @@ jobs:
echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
fi
if [ -f setup_values/other_workflow_run_id.txt ]; then
echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV
else
echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
fi
- name: Update clone
working-directory: /transformers
if: ${{ env.process == 'true' }}
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: |
git fetch origin "$commit_sha" && git checkout "$commit_sha"
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Get `START_SHA`
- name: Get target commit
working-directory: /transformers/utils
if: ${{ env.process == 'true' }}
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: |
echo "START_SHA=$commit_sha" >> $GITHUB_ENV
echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV
# This is used if the CI is triggered from a pull request `self-comment-ci.yml` (after security check is verified)
- name: Extract the base commit on `main` (of the merge commit created by Github) if it is a PR
id: pr_info
if: ${{ env.process == 'true' && inputs.pr_number != '' }}
uses: actions/github-script@v6
with:
script: |
const { data: pr } = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: ${{ inputs.pr_number }}
});
const { data: merge_commit } = await github.rest.repos.getCommit({
owner: pr.base.repo.owner.login,
repo: pr.base.repo.name,
ref: '${{ inputs.commit_sha }}',
});
core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);
# Usually, `END_SHA` should be the commit of the last previous workflow run of the **SAME** (scheduled) workflow.
# (This is why we don't need to specify `workflow_id` which would be fetched automatically in the python script.)
- name: Get `END_SHA` from previous CI runs of the same workflow
working-directory: /transformers/utils
if: ${{ env.process == 'true' && inputs.pr_number == '' }}
env:
ACCESS_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
run: |
echo "END_SHA=$(TOKEN="$ACCESS_TOKEN" python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV
# However, for workflow runs triggered by `issue_comment` (for pull requests), we want to check against the
# parent commit (on `main`) of the `merge_commit` (dynamically created by GitHub). In this case, the goal is to
# see if a reported failing test is actually ONLY failing on the `merge_commit`.
- name: Set `END_SHA`
if: ${{ env.process == 'true' && inputs.pr_number != '' }}
env:
merge_commit_base_sha: ${{ steps.pr_info.outputs.merge_commit_base_sha }}
run: |
echo "END_SHA=$merge_commit_base_sha" >> $GITHUB_ENV
- name: Checkout to `start_sha`
working-directory: /transformers
if: ${{ env.process == 'true' }}
run: git fetch && git checkout ${{ inputs.start_sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
@ -178,20 +138,14 @@ jobs:
- name: Check failed tests
working-directory: /transformers
if: ${{ env.process == 'true' }}
env:
job: ${{ inputs.job }}
run_idx: ${{ matrix.run_idx }}
run: python3 utils/check_bad_commit.py --start_commit "$START_SHA" --end_commit "$END_SHA" --file "ci_results_${job}/new_failures.json" --output_file "new_failures_with_bad_commit_${job}_${run_idx}.json"
run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
- name: Show results
working-directory: /transformers
if: ${{ env.process == 'true' }}
env:
job: ${{ inputs.job }}
run_idx: ${{ matrix.run_idx }}
run: |
ls -l "new_failures_with_bad_commit_${job}_${run_idx}.json"
cat "new_failures_with_bad_commit_${job}_${run_idx}.json"
ls -l new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
cat new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
- name: Upload artifacts
uses: actions/upload-artifact@v4
@ -205,8 +159,6 @@ jobs:
if: needs.check_new_failures.outputs.process == 'true'
runs-on:
group: aws-g5-4xlarge-cache
outputs:
report: ${{ steps.set_output.outputs.report }}
container:
image: ${{ inputs.docker }}
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -224,28 +176,32 @@ jobs:
- name: Check files
working-directory: /transformers
env:
job: ${{ inputs.job }}
run: |
ls -la /transformers
ls -la "/transformers/new_failures_with_bad_commit_${job}"
ls -la /transformers/new_failures_with_bad_commit_${{ inputs.job }}
# Currently, we only run with a single runner by using `run_idx: [1]`. We might try to run with multiple runners
# to further reduce the false positive caused by flaky tests, which requires further processing to merge reports.
- name: Merge files
shell: bash
working-directory: /transformers
env:
job: ${{ inputs.job }}
run: |
cp "/transformers/new_failures_with_bad_commit_${job}/new_failures_with_bad_commit_${job}_1.json" new_failures_with_bad_commit.json
cp /transformers/new_failures_with_bad_commit_${{ inputs.job }}/new_failures_with_bad_commit_${{ inputs.job }}_1.json new_failures_with_bad_commit.json
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Process report
shell: bash
working-directory: /transformers
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
JOB_NAME: ${{ inputs.job }}
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
run: |
git fetch origin "$commit_sha" && git checkout "$commit_sha"
python3 utils/process_bad_commit_report.py
- name: Process report
shell: bash
@ -262,37 +218,11 @@ jobs:
echo EOF
} >> "$GITHUB_ENV"
# The output is useful if a caller needs more processing, for example, we have a chain
# self-comment-ci.yml -> self-scheduled.yml -> this one (check_failed_tests.yml),
# and `self-comment-ci.yml` needs further processing before sending a GitHub comment to the pull request page.
- name: Show results & Set outputs
id: set_output
working-directory: /transformers
run: |
ls -l new_failures_with_bad_commit.json
cat new_failures_with_bad_commit.json
{
echo 'report<<EOF'
cat new_failures_with_bad_commit.json
echo '' # Force a newline
echo EOF
} >> "$GITHUB_OUTPUT"
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: new_failures_with_bad_commit_${{ inputs.job }}
path: /transformers/new_failures_with_bad_commit.json
- name: Prepare Slack report title
working-directory: /transformers
env:
ci_event: ${{ inputs.ci_event }}
job: ${{ inputs.job }}
run: |
pip install slack_sdk
echo "title=$(python3 -c 'import sys; import os; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = os.environ["ci_event"]; job = os.environ["job"]; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV
echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV
- name: Send processed report
if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}

View File

@ -1,22 +0,0 @@
---
name: CodeQL Security Analysis
on:
push:
branches: ["main", "fix_security_issue_*"]
# pull_request:
# branches: ["main"]
workflow_dispatch:
jobs:
codeql:
name: CodeQL Analysis
uses: huggingface/security-workflows/.github/workflows/codeql-reusable.yml@main
permissions:
security-events: write
packages: read
actions: read
contents: read
with:
languages: '["actions"]'
queries: 'security-extended,security-and-quality'

View File

@ -39,9 +39,6 @@ on:
PR_MERGE_COMMIT_SHA:
description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
PR_MERGE_COMMIT_BASE_SHA:
description: "The sha of the parent commit of the the merge commit on the target branch in the base repository"
value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_BASE_SHA }}
PR_HEAD_COMMIT_DATE:
description: "The date of the head sha of the pull request branch in the head repository"
value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }}
@ -77,7 +74,6 @@ jobs:
PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }}
PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }}
PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }}
PR_MERGE_COMMIT_BASE_SHA: ${{ steps.pr_info.outputs.merge_commit_base_sha }}
PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }}
PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}
@ -126,7 +122,6 @@ jobs:
core.setOutput('base_ref', pr.base.ref);
core.setOutput('head_sha', pr.head.sha);
core.setOutput('base_sha', pr.base.sha);
core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);
core.setOutput('merge_commit_sha', pr.merge_commit_sha);
core.setOutput('pr', pr);
@ -147,21 +142,16 @@ jobs:
date: merge_commit.commit.committer.date
});
console.log('PR Info:', {
pr_info: pr
});
- name: Convert dates to timestamps
id: get_timestamps
env:
head_commit_date: ${{ steps.pr_info.outputs.head_commit_date }}
merge_commit_date: ${{ steps.pr_info.outputs.merge_commit_date }}
run: |
echo "$head_commit_date"
echo "$merge_commit_date"
head_commit_date=${{ steps.pr_info.outputs.head_commit_date }}
merge_commit_date=${{ steps.pr_info.outputs.merge_commit_date }}
echo $head_commit_date
echo $merge_commit_date
head_commit_timestamp=$(date -d "$head_commit_date" +%s)
merge_commit_timestamp=$(date -d "$merge_commit_date" +%s)
echo "$head_commit_timestamp"
echo "$merge_commit_timestamp"
echo $head_commit_timestamp
echo $merge_commit_timestamp
echo "head_commit_timestamp=$head_commit_timestamp" >> $GITHUB_OUTPUT
echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT

View File

@ -15,19 +15,13 @@ jobs:
steps:
- name: Get PR number
shell: bash
env:
issue_number: ${{ github.event.issue.number }}
is_pull_request_issue: ${{ github.event.issue.pull_request != null }}
pr_number: ${{ github.event.pull_request.number }}
is_pull_request: ${{ github.event.pull_request != null }}
event_number: ${{ github.event.number }}
run: |
if [[ "$issue_number" != "" && "$is_pull_request_issue" == "true" ]]; then
echo "PR_NUMBER=$issue_number" >> $GITHUB_ENV
elif [[ "$pr_number" != "" ]]; then
echo "PR_NUMBER=$pr_number" >> $GITHUB_ENV
elif [[ "$is_pull_request" == "true" ]]; then
echo "PR_NUMBER=$event_number" >> $GITHUB_ENV
if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
elif [[ "${{ github.event.pull_request.number }}" != "" ]]; then
echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV
elif [[ "${{ github.event.pull_request }}" != "" ]]; then
echo "PR_NUMBER=${{ github.event.number }}" >> $GITHUB_ENV
else
echo "PR_NUMBER=" >> $GITHUB_ENV
fi
@ -35,8 +29,8 @@ jobs:
- name: Check PR number
shell: bash
run: |
echo "$PR_NUMBER"
echo "${{ env.PR_NUMBER }}"
- name: Set PR number
id: set_pr_number
run: echo "PR_NUMBER=$PR_NUMBER" >> "$GITHUB_OUTPUT"
run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"

View File

@ -28,9 +28,6 @@ on:
report_repo_id:
required: false
type: string
pytest_marker:
required: false
type: string
env:
HF_HOME: /mnt/cache
@ -62,33 +59,25 @@ jobs:
steps:
- name: Echo input and matrix info
shell: bash
env:
folder_slices: ${{ inputs.folder_slices }}
matrix_folders: ${{ matrix.folders }}
slice_data: ${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}
run: |
echo "$folder_slices"
echo "$matrix_folders"
echo "$slice_data"
echo "${{ inputs.folder_slices }}"
echo "${{ matrix.folders }}"
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
env:
matrix_folders_raw: ${{ matrix.folders }}
run: |
echo "$matrix_folders_raw"
matrix_folders="${matrix_folders_raw/'models/'/'models_'}"
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: Update clone
working-directory: /transformers
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: |
git fetch origin "$commit_sha" && git checkout "$commit_sha"
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
@ -123,17 +112,15 @@ jobs:
id: set_machine_type
working-directory: /transformers
shell: bash
env:
input_machine_type: ${{ inputs.machine_type }}
run: |
echo "$input_machine_type"
echo "${{ inputs.machine_type }}"
if [ "$input_machine_type" = "aws-g5-4xlarge-cache" ]; then
if [ "${{ inputs.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "$input_machine_type" = "aws-g5-12xlarge-cache" ]; then
elif [ "${{ inputs.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type="$input_machine_type"
machine_type=${{ inputs.machine_type }}
fi
echo "$machine_type"
@ -142,21 +129,15 @@ jobs:
- name: Create report directory if it doesn't exist
shell: bash
env:
report_name_prefix: ${{ inputs.report_name_prefix }}
run: |
mkdir -p "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"
echo "dummy" > "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/dummy.txt"
ls -la "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"
mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
echo "dummy" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/dummy.txt
ls -la /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
- name: Run all tests on GPU
working-directory: /transformers
env:
report_name_prefix: ${{ inputs.report_name_prefix }}
pytest_marker: ${{ inputs.pytest_marker }}
model: ${{ matrix.folders }}
run: |
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports python3 -m pytest -rsfE -v -m '${pytest_marker}' --make-reports=${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports tests/${model}" test_outputs.txt
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
ls -la
# Extract the exit code from the output file
EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)
@ -167,25 +148,19 @@ jobs:
# This step is only to show information on Github Actions log.
# Always mark this step as successful, even if the report directory or the file `failures_short.txt` in it doesn't exist
continue-on-error: true
env:
report_name_prefix: ${{ inputs.report_name_prefix }}
run: cat "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/failures_short.txt"
run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt
- name: Captured information
if: ${{ failure() }}
continue-on-error: true
env:
report_name_prefix: ${{ inputs.report_name_prefix }}
run: |
cat "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/captured_info.txt"
cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/captured_info.txt
- name: Copy test_outputs.txt
if: ${{ always() }}
continue-on-error: true
env:
report_name_prefix: ${{ inputs.report_name_prefix }}
run: |
cp /transformers/test_outputs.txt "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"
cp /transformers/test_outputs.txt /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
@ -196,7 +171,7 @@ jobs:
collated_reports:
name: Collated Reports
if: ${{ always() && inputs.runner_type != '' }}
if: ${{ always() }}
needs: run_models_gpu
uses: huggingface/transformers/.github/workflows/collated-reports.yml@main
with:

View File

@ -1,4 +1,4 @@
name: PR slow CI - Suggestion
name: PR slow CI
on:
pull_request_target:
types: [opened, synchronize, reopened]
@ -23,28 +23,11 @@ jobs:
outputs:
jobs: ${{ steps.get_jobs.outputs.jobs_to_run }}
steps:
# This checkout to the main branch
- uses: actions/checkout@v4
with:
fetch-depth: "0"
# We need to use `${{ ... }}` here to avoid `Argument list too long` error when a PR changes a lot of files.
# (We could also try to use artifact approach, but it's more involved).
# `CodeQL` doesn't identify any security issue here. Also `PR_FILES` is from `get-pr-info.yml` by using an api
# `github.rest.pulls.listFiles`, which is fine.
- name: Write pr_files file
run: |
cat > pr_files.txt << 'EOF'
${{ needs.get-pr-info.outputs.PR_FILES }}
EOF
- name: Get repository content
id: repo_content
uses: actions/github-script@v6
with:
script: |
const fs = require('node:fs');
const { data: tests_dir } = await github.rest.repos.getContent({
owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
@ -66,10 +49,38 @@ jobs:
ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
});
// Write to files instead of outputs
fs.writeFileSync('tests_dir.txt', JSON.stringify(tests_dir, null, 2));
fs.writeFileSync('tests_models_dir.txt', JSON.stringify(tests_models_dir, null, 2));
fs.writeFileSync('tests_quantization_dir.txt', JSON.stringify(tests_quantization_dir, null, 2));
core.setOutput('tests_dir', tests_dir);
core.setOutput('tests_models_dir', tests_models_dir);
core.setOutput('tests_quantization_dir', tests_quantization_dir);
# This checkout to the main branch
- uses: actions/checkout@v4
with:
fetch-depth: "0"
- name: Write pr_files file
run: |
cat > pr_files.txt << 'EOF'
${{ needs.get-pr-info.outputs.PR_FILES }}
EOF
- name: Write tests_dir file
run: |
cat > tests_dir.txt << 'EOF'
${{ steps.repo_content.outputs.tests_dir }}
EOF
- name: Write tests_models_dir file
run: |
cat > tests_models_dir.txt << 'EOF'
${{ steps.repo_content.outputs.tests_models_dir }}
EOF
- name: Write tests_quantization_dir file
run: |
cat > tests_quantization_dir.txt << 'EOF'
${{ steps.repo_content.outputs.tests_quantization_dir }}
EOF
- name: Run script to get jobs to run
id: get_jobs

View File

@ -149,9 +149,9 @@ jobs:
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-push"
docker: huggingface/transformers-all-latest-gpu:flash-attn
docker: huggingface/transformers-all-latest-gpu
ci_event: push
report_repo_id: hf-internal-testing/transformers_ci_push
commit_sha: ${{ github.sha }}
subdirs: ${{ needs.get_modified_models.outputs.matrix }}
models: ${{ needs.get_modified_models.outputs.matrix }}
secrets: inherit

View File

@ -23,34 +23,62 @@ env:
TF_FORCE_GPU_ALLOW_GROWTH: true
CUDA_VISIBLE_DEVICES: 0,1
jobs:
get-pr-number:
runs-on: ubuntu-22.04
name: Get PR number
# For security: only allow team members to run
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
uses: ./.github/workflows/get-pr-number.yml
outputs:
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
steps:
- name: Get PR number
shell: bash
run: |
if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
else
echo "PR_NUMBER=" >> $GITHUB_ENV
fi
get-pr-info:
name: Get PR commit SHA
- name: Check PR number
shell: bash
run: |
echo "${{ env.PR_NUMBER }}"
- name: Set PR number
id: set_pr_number
run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
get-sha:
runs-on: ubuntu-22.04
needs: get-pr-number
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
uses: ./.github/workflows/get-pr-info.yml
with:
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
check-timestamps:
name: Check timestamps (security check)
runs-on: ubuntu-22.04
needs: get-pr-info
outputs:
PR_HEAD_SHA: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}
PR_MERGE_SHA: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
PR_HEAD_SHA: ${{ steps.get_sha.outputs.PR_HEAD_SHA }}
PR_MERGE_SHA: ${{ steps.get_sha.outputs.PR_MERGE_SHA }}
steps:
- name: Verify `merge_commit` timestamp is older than the issue comment timestamp
- uses: actions/checkout@v4
with:
fetch-depth: "0"
ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
- name: Get SHA (and verify timestamps against the issue comment date)
id: get_sha
env:
PR_NUMBER: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
COMMENT_DATE: ${{ github.event.comment.created_at }}
PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
run: |
git fetch origin refs/pull/$PR_NUMBER/head:refs/remotes/pull/$PR_NUMBER/head
git checkout refs/remotes/pull/$PR_NUMBER/head
echo "PR_HEAD_SHA: $(git log -1 --format=%H)"
echo "PR_HEAD_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
git fetch origin refs/pull/$PR_NUMBER/merge:refs/remotes/pull/$PR_NUMBER/merge
git checkout refs/remotes/pull/$PR_NUMBER/merge
echo "PR_MERGE_SHA: $(git log -1 --format=%H)"
echo "PR_MERGE_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
PR_MERGE_COMMIT_TIMESTAMP=$(git log -1 --date=unix --format=%cd)
echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
echo "COMMENT_DATE: $COMMENT_DATE"
echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
@ -59,10 +87,13 @@ jobs:
exit -1;
fi
# use a python script to handle this complex logic.
# use a python script to handle this complex logic
# case 1: `run-slow` (auto. infer with limited number of models, but in particular, new model)
# case 2: `run-slow model_1, model_2`
get-tests:
runs-on: ubuntu-22.04
needs: [get-pr-number, check-timestamps]
needs: [get-pr-number, get-sha]
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
outputs:
models: ${{ steps.models_to_run.outputs.models }}
quantizations: ${{ steps.models_to_run.outputs.quantizations }}
@ -70,11 +101,11 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: "0"
ref: "refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge"
ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
- name: Verify merge commit SHA
env:
VERIFIED_PR_MERGE_SHA: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
run: |
PR_MERGE_SHA=$(git log -1 --format=%H)
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
@ -95,33 +126,11 @@ jobs:
- name: Show models to test
id: models_to_run
run: |
echo "$models"
echo "models=$models" >> $GITHUB_OUTPUT
echo "$quantizations"
echo "quantizations=$quantizations" >> $GITHUB_OUTPUT
# Report back if we are not able to get the tests (for example, security check is failing)
report_error_earlier:
name: Report error earlier
if: ${{ always() && needs.get-pr-info.result == 'success' && needs.get-tests.result != 'success' }}
needs: [get-pr-number, get-pr-info, get-tests]
permissions:
pull-requests: write
runs-on: ubuntu-22.04
steps:
- name: Reply to the comment
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
github_repository: ${{ github.repository }}
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
run: |
gh api \
--method POST \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"repos/${github_repository}/issues/${pr_number}/comments" \
-f body="💔 This comment contains \`run-slow\`, but unknown error occurred and [the workflow run]($GITHUB_RUN_URL) aborted!"
echo "${{ env.models }}"
echo "models=${{ env.models }}" >> $GITHUB_ENV
echo "models=${{ env.models }}" >> $GITHUB_OUTPUT
echo "${{ env.quantizations }}"
echo "quantizations=${{ env.quantizations }}" >> $GITHUB_OUTPUT
reply_to_comment:
name: Reply to the comment
@ -134,20 +143,20 @@ jobs:
- name: Reply to the comment
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
BODY: '\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}'
github_repository: ${{ github.repository }}
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
MODELS: ${{ needs.get-tests.outputs.models }}
BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
run: |
gh api \
--method POST \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"repos/${github_repository}/issues/${pr_number}/comments" \
-f body="This comment contains \`run-slow\`, running the specified jobs: $(echo -e "$BODY")"
repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
-f "body=This comment contains run-slow, running the specified jobs: ${{ env.BODY }} ..."
create_run:
name: Create run
needs: [check-timestamps, reply_to_comment]
if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }}
needs: [get-sha, get-tests, reply_to_comment]
permissions:
statuses: write
runs-on: ubuntu-22.04
@ -159,196 +168,248 @@ jobs:
# Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`.
# See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
github_repository: ${{ github.repository }}
pr_head_sha: ${{ needs.check-timestamps.outputs.PR_HEAD_SHA }}
run: |
gh api \
--method POST \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"repos/${github_repository}/statuses/${pr_head_sha}" \
repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
-f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests"
model-ci:
name: Model CI
run_models_gpu:
name: Run all tests for the model
if: ${{ needs.get-tests.outputs.models != '[]' }}
uses: ./.github/workflows/self-scheduled.yml
needs: [get-pr-number, check-timestamps, get-tests, create_run]
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-pr"
docker: huggingface/transformers-all-latest-gpu
ci_event: PR Comment CI
report_repo_id: hf-internal-testing/transformers_pr_ci
commit_sha: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
subdirs: ${{ needs.get-tests.outputs.models }}
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
secrets: inherit
needs: [get-pr-number, get-sha, get-tests, create_run]
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.get-tests.outputs.models) }}
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Echo input and matrix info
shell: bash
run: |
echo "${{ matrix.folders }}"
quantization-ci:
name: Quantization CI
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: Checkout to PR merge commit
working-directory: /transformers
run: |
git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git log -1 --format=%H
- name: Verify merge commit SHA
env:
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
working-directory: /transformers
run: |
PR_MERGE_SHA=$(git log -1 --format=%H)
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
exit -1;
fi
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run all tests on GPU
working-directory: /transformers
run: |
export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})"
echo $CUDA_VISIBLE_DEVICES
python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
- name: Make sure report directory exists
shell: bash
run: |
mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
run_quantization_torch_gpu:
name: Run all tests for a quantization
if: ${{ needs.get-tests.outputs.quantizations != '[]' }}
uses: ./.github/workflows/self-scheduled.yml
needs: [get-pr-number, check-timestamps, get-tests, create_run]
with:
job: run_quantization_torch_gpu
slack_report_channel: "#transformers-ci-pr"
docker: huggingface/transformers-quantization-latest-gpu
ci_event: PR Comment CI
report_repo_id: hf-internal-testing/transformers_pr_ci
commit_sha: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
subdirs: ${{ needs.get-tests.outputs.quantizations }}
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
secrets: inherit
needs: [get-pr-number, get-sha, get-tests, create_run]
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-quantization-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
report:
name: Check & Report
needs: [get-pr-number, check-timestamps, create_run, model-ci, quantization-ci]
- name: Checkout to PR merge commit
working-directory: /transformers
run: |
git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git log -1 --format=%H
- name: Verify merge commit SHA
env:
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
working-directory: /transformers
run: |
PR_MERGE_SHA=$(git log -1 --format=%H)
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
exit -1;
fi
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run quantization tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
- name: Make sure report directory exists
shell: bash
run: |
mkdir -p /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports
echo "hello" > /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports"
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
update_run_status:
name: Update Check Run Status
needs: [get-sha, create_run, run_models_gpu, run_quantization_torch_gpu]
permissions:
pull-requests: write
statuses: write
if: ${{ always() && needs.create_run.result == 'success' }}
runs-on: ubuntu-22.04
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.run_models_gpu.result) && contains(fromJSON('["skipped", "success"]'), needs.run_quantization_torch_gpu.result) }}
steps:
- name: Show reports from jobs
env:
MODEL_REPORT: ${{ needs.model-ci.outputs.report }}
QUANT_REPORT: ${{ needs.quantization-ci.outputs.report }}
- name: Get `run_models_gpu` job status
run: |
echo "$MODEL_REPORT"
echo "$QUANT_REPORT"
- name: Process and filter reports
env:
MODEL_REPORT: ${{ needs.model-ci.outputs.report }}
QUANT_REPORT: ${{ needs.quantization-ci.outputs.report }}
run: |
# Preprocess with Python
python3 << 'PYTHON_SCRIPT'
import json
import os
def filter_and_format_report(data):
"""
Filter out entries where commit is `None` (failing tests who status is not certain) and format as text
"""
lines = []
for model, model_result in data.items():
model_lines = []
for device, failures in model_result.items():
# Filter out None commits and extract just the test names
test_names = [
failure['test']
for failure in failures
if isinstance(failure, dict) and failure.get('commit') is not None
]
# Add tests to model lines
for idx, test_name in enumerate(test_names):
if idx == 0:
job_link = failures[idx]['job_link']
model_lines.append(f"- [{model}]({job_link}):")
model_lines.append(f" {test_name}")
# Only add model section if it has tests
if len(model_lines) > 0:
lines.extend(model_lines)
lines.append("") # Empty line between models
return "\n".join(lines).strip()
# Load and filter reports
model_report_str = os.environ.get('MODEL_REPORT', '{}')
quant_report_str = os.environ.get('QUANT_REPORT', '{}')
model_report = json.loads(model_report_str) if model_report_str else {}
quant_report = json.loads(quant_report_str) if quant_report_str else {}
formatted_model = filter_and_format_report(model_report)
formatted_quant = filter_and_format_report(quant_report)
# Write to files
with open('model_ci.txt', 'w') as f:
f.write(formatted_model)
if formatted_model:
f.write('\n')
with open('quantization_ci.txt', 'w') as f:
f.write(formatted_quant)
if formatted_quant:
f.write('\n')
PYTHON_SCRIPT
- name: Post results as PR comment
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
github_repository: ${{ github.repository }}
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
model_ci_result: ${{ needs.model-ci.result }}
quantization_ci_result: ${{ needs.quantization-ci.result }}
run: |
{
echo '## CI Results'
echo "[Workflow Run ⚙️]($GITHUB_RUN_URL)"
echo ''
# Check if both jobs were skipped or cancelled
if [[ "$model_ci_result" == "skipped" || "$model_ci_result" == "cancelled" ]] && \
[[ "$quantization_ci_result" == "skipped" || "$quantization_ci_result" == "cancelled" ]]; then
echo '⚠️ No test being reported (jobs are skipped or cancelled)!'
echo "STATUS=error" >> $GITHUB_ENV
# Check if either file has content
elif [ -s model_ci.txt ] || [ -s quantization_ci.txt ]; then
echo "STATUS=failure" >> $GITHUB_ENV
# Check if model_ci.txt has content
if [ -s model_ci.txt ]; then
echo '### Model CI Report'
echo ''
echo '#### ❌ Failed tests'
echo ''
cat model_ci.txt
echo ''
fi
# Check if quantization_ci.txt has content
if [ -s quantization_ci.txt ]; then
echo '### Quantization CI Report'
echo ''
echo '#### ❌ Failed tests'
echo ''
cat quantization_ci.txt
echo ''
fi
else
echo "STATUS=success" >> $GITHUB_ENV
echo '✅ No failing test specific to this PR 🎉 !'
fi
} > comment_body.txt
gh api \
--method POST \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"repos/${github_repository}/issues/${pr_number}/comments" \
-F body=@comment_body.txt
echo "${{ needs.run_models_gpu.result }}"
echo "${{ needs.run_quantization_torch_gpu.result }}"
echo $STATUS_OK
if [ "$STATUS_OK" = "true" ]; then
echo "STATUS=success" >> $GITHUB_ENV
else
echo "STATUS=failure" >> $GITHUB_ENV
fi
- name: Update PR commit statuses
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
github_repository: ${{ github.repository }}
pr_head_sha: ${{ needs.check-timestamps.outputs.PR_HEAD_SHA }}
# The env. variable `STATUS` used here is set in the previous step
run: |
echo "${{ needs.run_models_gpu.result }}"
echo "${{ env.STATUS }}"
gh api \
--method POST \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"repos/${github_repository}/statuses/${pr_head_sha}" \
-f "target_url=$GITHUB_RUN_URL" -f "state=$STATUS" -f "description=Slow CI job" -f "context=pytest/custom-tests"
repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
-f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Slow CI job" -f "context=pytest/custom-tests"

View File

@ -51,7 +51,6 @@ jobs:
slack_report_channel: "#transformers-ci-past-future"
docker: huggingface/transformers-all-latest-torch-nightly-gpu
ci_event: Nightly CI
runner_type: "a10"
report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly
commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }}
secrets: inherit

View File

@ -0,0 +1,25 @@
name: Self-hosted runner (AMD mi210 CI caller)
on:
#workflow_run:
# workflows: ["Self-hosted runner (push-caller)"]
# branches: ["main"]
# types: [completed]
push:
branches:
- run_amd_push_ci_caller*
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
- "utils/**"
jobs:
run_amd_ci:
name: AMD mi210
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi210
secrets: inherit

View File

@ -0,0 +1,25 @@
name: Self-hosted runner (AMD mi250 CI caller)
on:
#workflow_run:
# workflows: ["Self-hosted runner (push-caller)"]
# branches: ["main"]
# types: [completed]
push:
branches:
- run_amd_push_ci_caller*
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
- "utils/**"
jobs:
run_amd_ci:
name: AMD mi250
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi250
secrets: inherit

334
.github/workflows/self-push-amd.yml vendored Normal file
View File

@ -0,0 +1,334 @@
name: Self-hosted runner AMD GPU (push)
on:
workflow_call:
inputs:
gpu_flavor:
required: true
type: string
env:
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
PYTEST_TIMEOUT: 60
TF_FORCE_GPU_ALLOW_GROWTH: true
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
jobs:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-22.04
steps:
- name: Checkout transformers
uses: actions/checkout@v4
with:
fetch-depth: 2
- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners amd-mi210-single-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
check_runners:
name: Check Runners
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: ROCM-SMI
run: |
rocm-smi
- name: ROCM-INFO
run: |
rocminfo | grep "Agent" -A 14
- name: Show ROCR environment
run: |
echo "ROCR: $ROCR_VISIBLE_DEVICES"
setup_gpu:
name: Setup
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
test_map: ${{ steps.set-matrix.outputs.test_map }}
env:
# `CI_BRANCH_PUSH`: The branch name from the push event
# `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event
# `CI_SHA_PUSH`: The commit SHA from the push event
# `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event
CI_BRANCH_PUSH: ${{ github.event.ref }}
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
steps:
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
- name: Prepare custom environment variables
shell: bash
# `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty)
# `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty)
run: |
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
echo $CI_BRANCH_PUSH
echo $CI_BRANCH_WORKFLOW_RUN
echo $CI_SHA_PUSH
echo $CI_SHA_WORKFLOW_RUN
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
- name: print environment variables
run: |
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
echo "env.CI_SHA = ${{ env.CI_SHA }}"
- name: Update clone using environment variables
working-directory: /transformers
run: |
echo "original branch = $(git branch --show-current)"
git fetch && git checkout ${{ env.CI_BRANCH }}
echo "updated branch = $(git branch --show-current)"
git checkout ${{ env.CI_SHA }}
echo "log = $(git log -n 1)"
- name: Cleanup
working-directory: /transformers
run: |
rm -rf tests/__pycache__
rm -rf tests/models/__pycache__
rm -rf reports
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Fetch the tests to run
working-directory: /transformers
# TODO: add `git-python` in the docker images
run: |
pip install --upgrade git-python
python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
- name: Report fetched tests
uses: actions/upload-artifact@v4
with:
name: test_fetched
path: /transformers/test_preparation.txt
- id: set-matrix
name: Organize tests into models
working-directory: /transformers
# The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc.
# The `test_map` is used to get the actual identified test files under each key.
# If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail)
run: |
if [ -f test_map.json ]; then
keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)')
test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)')
else
keys=$(python3 -c 'keys = ["dummy"]; print(keys)')
test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)')
fi
echo $keys
echo $test_map
echo "matrix=$keys" >> $GITHUB_OUTPUT
echo "test_map=$test_map" >> $GITHUB_OUTPUT
run_models_gpu:
name: Model tests
needs: setup_gpu
# `dummy` means there is no test to run
if: contains(fromJson(needs.setup_gpu.outputs.matrix), 'dummy') != true
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
env:
# For the meaning of these environment variables, see the job `Setup`
CI_BRANCH_PUSH: ${{ github.event.ref }}
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
steps:
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
- name: Prepare custom environment variables
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
echo $CI_BRANCH_PUSH
echo $CI_BRANCH_WORKFLOW_RUN
echo $CI_SHA_PUSH
echo $CI_SHA_WORKFLOW_RUN
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
- name: print environment variables
run: |
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
echo "env.CI_SHA = ${{ env.CI_SHA }}"
- name: Update clone using environment variables
working-directory: /transformers
run: |
echo "original branch = $(git branch --show-current)"
git fetch && git checkout ${{ env.CI_BRANCH }}
echo "updated branch = $(git branch --show-current)"
git checkout ${{ env.CI_SHA }}
echo "log = $(git log -n 1)"
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
echo "${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: ROCM-SMI
run: |
rocm-smi
- name: ROCM-INFO
run: |
rocminfo | grep "Agent" -A 14
- name: Show ROCR environment
run: |
echo "ROCR: $ROCR_VISIBLE_DEVICES"
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run all non-slow selected tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
send_results:
name: Send results to webhook
runs-on: ubuntu-22.04
if: always()
needs: [
check_runner_status,
check_runners,
setup_gpu,
run_models_gpu,
# run_tests_torch_cuda_extensions_single_gpu,
# run_tests_torch_cuda_extensions_multi_gpu
]
env:
# For the meaning of these environment variables, see the job `Setup`
CI_BRANCH_PUSH: ${{ github.event.ref }}
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
steps:
- name: Preliminary job status
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Runner availability: ${{ needs.check_runner_status.result }}"
echo "Setup status: ${{ needs.setup_gpu.result }}"
echo "Runner status: ${{ needs.check_runners.result }}"
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
- name: Prepare custom environment variables
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
echo $CI_BRANCH_PUSH
echo $CI_BRANCH_WORKFLOW_RUN
echo $CI_SHA_PUSH
echo $CI_SHA_WORKFLOW_RUN
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
- name: print environment variables
run: |
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
echo "env.CI_SHA = ${{ env.CI_SHA }}"
- uses: actions/checkout@v4
# To avoid failure when multiple commits are merged into `main` in a short period of time.
# Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ...
# (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit)
with:
fetch-depth: 20
- name: Update clone using environment variables
run: |
echo "original branch = $(git branch --show-current)"
git fetch && git checkout ${{ env.CI_BRANCH }}
echo "updated branch = $(git branch --show-current)"
git checkout ${{ env.CI_SHA }}
echo "log = $(git log -n 1)"
- uses: actions/download-artifact@v4
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_SLACK_CHANNEL_ID_AMD: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }}
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
CI_SHA: ${{ env.CI_SHA }}
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup_gpu.result }}
# We pass `needs.setup_gpu.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
pip install huggingface_hub
pip install slack_sdk
pip show slack_sdk
python utils/notification_service.py "${{ needs.setup_gpu.outputs.matrix }}"

54
.github/workflows/self-push-caller.yml vendored Normal file
View File

@ -0,0 +1,54 @@
# Used to trigger self-push CI
name: Self-hosted runner (push-caller)
on:
push:
branches:
- main
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
- "utils/**"
jobs:
check-for-setup:
runs-on: ubuntu-22.04
name: Check if setup was changed
outputs:
changed: ${{ steps.was_changed.outputs.changed }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: "2"
- name: Get changed files
id: changed-files
uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c
- name: Was setup changed
id: was_changed
run: |
for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
if [ `basename "${file}"` = "setup.py" ]; then
echo "changed=1" >> $GITHUB_OUTPUT
fi
done
build-docker-containers:
needs: check-for-setup
if: (github.event_name == 'push') && (needs.check-for-setup.outputs.changed == '1')
uses: ./.github/workflows/build-docker-images.yml
with:
image_postfix: "-push-ci"
secrets: inherit
run_push_ci:
name: Trigger Push CI
runs-on: ubuntu-22.04
if: ${{ always() }}
needs: build-docker-containers
steps:
- name: Trigger push CI via workflow_run
run: echo "Trigger push CI via workflow_run"

652
.github/workflows/self-push.yml vendored Normal file
View File

@ -0,0 +1,652 @@
name: Self-hosted runner (push)
on:
workflow_run:
workflows: ["Self-hosted runner (push-caller)"]
branches: ["main"]
types: [completed]
push:
branches:
- ci_*
- ci-*
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
- "utils/**"
repository_dispatch:
env:
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
PYTEST_TIMEOUT: 60
TF_FORCE_GPU_ALLOW_GROWTH: true
CUDA_VISIBLE_DEVICES: 0,1
jobs:
setup:
name: Setup
strategy:
matrix:
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-all-latest-gpu-push-ci
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
test_map: ${{ steps.set-matrix.outputs.test_map }}
env:
# `CI_BRANCH_PUSH`: The branch name from the push event
# `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event
# `CI_SHA_PUSH`: The commit SHA from the push event
# `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event
CI_BRANCH_PUSH: ${{ github.event.ref }}
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
steps:
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
- name: Prepare custom environment variables
shell: bash
# `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty)
# `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty)
run: |
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
echo $CI_BRANCH_PUSH
echo $CI_BRANCH_WORKFLOW_RUN
echo $CI_SHA_PUSH
echo $CI_SHA_WORKFLOW_RUN
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
- name: print environment variables
run: |
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
echo "env.CI_SHA = ${{ env.CI_SHA }}"
- name: Update clone using environment variables
working-directory: /transformers
run: |
echo "original branch = $(git branch --show-current)"
git fetch && git checkout ${{ env.CI_BRANCH }}
echo "updated branch = $(git branch --show-current)"
git checkout ${{ env.CI_SHA }}
echo "log = $(git log -n 1)"
- name: Cleanup
working-directory: /transformers
run: |
rm -rf tests/__pycache__
rm -rf tests/models/__pycache__
rm -rf reports
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Fetch the tests to run
working-directory: /transformers
# TODO: add `git-python` in the docker images
run: |
pip install --upgrade git-python
python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
- name: Report fetched tests
uses: actions/upload-artifact@v4
with:
name: test_fetched
path: /transformers/test_preparation.txt
- id: set-matrix
name: Organize tests into models
working-directory: /transformers
# The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc.
# The `test_map` is used to get the actual identified test files under each key.
# If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail)
run: |
if [ -f test_map.json ]; then
keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)')
test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)')
else
keys=$(python3 -c 'keys = ["dummy"]; print(keys)')
test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)')
fi
echo $keys
echo $test_map
echo "matrix=$keys" >> $GITHUB_OUTPUT
echo "test_map=$test_map" >> $GITHUB_OUTPUT
run_tests_single_gpu:
name: Model tests
needs: setup
# `dummy` means there is no test to run
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [aws-g5-4xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-all-latest-gpu-push-ci
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
env:
# For the meaning of these environment variables, see the job `Setup`
CI_BRANCH_PUSH: ${{ github.event.ref }}
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
steps:
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
- name: Prepare custom environment variables
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
echo $CI_BRANCH_PUSH
echo $CI_BRANCH_WORKFLOW_RUN
echo $CI_SHA_PUSH
echo $CI_SHA_WORKFLOW_RUN
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
- name: print environment variables
run: |
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
echo "env.CI_SHA = ${{ env.CI_SHA }}"
- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Update clone using environment variables
working-directory: /transformers
run: |
echo "original branch = $(git branch --show-current)"
git fetch && git checkout ${{ env.CI_BRANCH }}
echo "updated branch = $(git branch --show-current)"
git checkout ${{ env.CI_SHA }}
echo "log = $(git log -n 1)"
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run all non-slow selected tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}
run_tests_multi_gpu:
name: Model tests
needs: setup
# `dummy` means there is no test to run
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [aws-g5-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-all-latest-gpu-push-ci
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
env:
# For the meaning of these environment variables, see the job `Setup`
CI_BRANCH_PUSH: ${{ github.event.ref }}
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
steps:
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
- name: Prepare custom environment variables
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
echo $CI_BRANCH_PUSH
echo $CI_BRANCH_WORKFLOW_RUN
echo $CI_SHA_PUSH
echo $CI_SHA_WORKFLOW_RUN
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
- name: print environment variables
run: |
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
echo "env.CI_SHA = ${{ env.CI_SHA }}"
- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Update clone using environment variables
working-directory: /transformers
run: |
echo "original branch = $(git branch --show-current)"
git fetch && git checkout ${{ env.CI_BRANCH }}
echo "updated branch = $(git branch --show-current)"
git checkout ${{ env.CI_SHA }}
echo "log = $(git log -n 1)"
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run all non-slow selected tests on GPU
env:
MKL_SERVICE_FORCE_INTEL: 1
working-directory: /transformers
run: |
python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}
run_tests_torch_cuda_extensions_single_gpu:
name: Torch CUDA extension tests
needs: setup
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
strategy:
fail-fast: false
matrix:
machine_type: [aws-g5-4xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
env:
# For the meaning of these environment variables, see the job `Setup`
CI_BRANCH_PUSH: ${{ github.event.ref }}
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
steps:
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
- name: Prepare custom environment variables
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
echo $CI_BRANCH_PUSH
echo $CI_BRANCH_WORKFLOW_RUN
echo $CI_SHA_PUSH
echo $CI_SHA_WORKFLOW_RUN
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
- name: print environment variables
run: |
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
echo "env.CI_SHA = ${{ env.CI_SHA }}"
- name: Set `machine_type` for report and artifact names
working-directory: /workspace/transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Update clone using environment variables
working-directory: /workspace/transformers
run: |
echo "original branch = $(git branch --show-current)"
git fetch && git checkout ${{ env.CI_BRANCH }}
echo "updated branch = $(git branch --show-current)"
git checkout ${{ env.CI_SHA }}
echo "log = $(git log -n 1)"
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /workspace/transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: Remove cached torch extensions
run: rm -rf /github/home/.cache/torch_extensions/
# To avoid unknown test failures
- name: Pre build DeepSpeed *again*
working-directory: /workspace
run: |
python3 -m pip uninstall -y deepspeed
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /workspace/transformers
run: |
python utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /workspace/transformers
run: pip freeze
- name: Run all non-slow selected tests on GPU
working-directory: /workspace/transformers
# TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
run: |
python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
run_tests_torch_cuda_extensions_multi_gpu:
name: Torch CUDA extension tests
needs: setup
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
strategy:
fail-fast: false
matrix:
machine_type: [aws-g5-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
env:
# For the meaning of these environment variables, see the job `Setup`
CI_BRANCH_PUSH: ${{ github.event.ref }}
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
steps:
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
- name: Prepare custom environment variables
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
echo $CI_BRANCH_PUSH
echo $CI_BRANCH_WORKFLOW_RUN
echo $CI_SHA_PUSH
echo $CI_SHA_WORKFLOW_RUN
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
- name: print environment variables
run: |
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
echo "env.CI_SHA = ${{ env.CI_SHA }}"
- name: Set `machine_type` for report and artifact names
working-directory: /workspace/transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Update clone using environment variables
working-directory: /workspace/transformers
run: |
echo "original branch = $(git branch --show-current)"
git fetch && git checkout ${{ env.CI_BRANCH }}
echo "updated branch = $(git branch --show-current)"
git checkout ${{ env.CI_SHA }}
echo "log = $(git log -n 1)"
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /workspace/transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: Remove cached torch extensions
run: rm -rf /github/home/.cache/torch_extensions/
# To avoid unknown test failures
- name: Pre build DeepSpeed *again*
working-directory: /workspace
run: |
python3 -m pip uninstall -y deepspeed
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /workspace/transformers
run: |
python utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /workspace/transformers
run: pip freeze
- name: Run all non-slow selected tests on GPU
working-directory: /workspace/transformers
# TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
run: |
python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
send_results:
name: Send results to webhook
runs-on: ubuntu-22.04
if: always()
needs: [
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
run_tests_torch_cuda_extensions_single_gpu,
run_tests_torch_cuda_extensions_multi_gpu
]
env:
# For the meaning of these environment variables, see the job `Setup`
CI_BRANCH_PUSH: ${{ github.event.ref }}
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
steps:
- name: Preliminary job status
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Setup status: ${{ needs.setup.result }}"
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
# We also take into account the `push` event (we might want to test some changes in a branch)
- name: Prepare custom environment variables
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
echo $CI_BRANCH_PUSH
echo $CI_BRANCH_WORKFLOW_RUN
echo $CI_SHA_PUSH
echo $CI_SHA_WORKFLOW_RUN
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
- name: print environment variables
run: |
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
echo "env.CI_SHA = ${{ env.CI_SHA }}"
- uses: actions/checkout@v4
# To avoid failure when multiple commits are merged into `main` in a short period of time.
# Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ...
# (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit)
with:
fetch-depth: 20
- name: Update clone using environment variables
run: |
echo "original branch = $(git branch --show-current)"
git fetch && git checkout ${{ env.CI_BRANCH }}
echo "updated branch = $(git branch --show-current)"
git checkout ${{ env.CI_SHA }}
echo "log = $(git log -n 1)"
- uses: actions/download-artifact@v4
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: push
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
CI_SHA: ${{ env.CI_SHA }}
SETUP_STATUS: ${{ needs.setup.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
pip install huggingface_hub
pip install slack_sdk
pip show slack_sdk
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"

View File

@ -2,7 +2,7 @@ name: Self-hosted runner (AMD scheduled CI caller)
on:
schedule:
- cron: "17 5 * * *"
- cron: "17 2 * * *"
jobs:
run_scheduled_amd_ci:

View File

@ -21,7 +21,7 @@ jobs:
job: run_models_gpu
slack_report_channel: "#amd-hf-ci"
runner_group: hfc-amd-mi355
docker: huggingface/transformers-pytorch-amd-gpu
docker: huggingface/testing-rocm7.0-preview
ci_event: Scheduled CI (AMD) - mi355
report_repo_id: hf-transformers-bot/transformers-ci-dummy
secrets: inherit
@ -33,7 +33,7 @@ jobs:
job: run_pipelines_torch_gpu
slack_report_channel: "#amd-hf-ci"
runner_group: hfc-amd-mi355
docker: huggingface/transformers-pytorch-amd-gpu
docker: huggingface/testing-rocm7.0-preview
ci_event: Scheduled CI (AMD) - mi355
report_repo_id: hf-transformers-bot/transformers-ci-dummy
secrets: inherit
@ -45,7 +45,7 @@ jobs:
job: run_examples_gpu
slack_report_channel: "#amd-hf-ci"
runner_group: hfc-amd-mi355
docker: huggingface/transformers-pytorch-amd-gpu
docker: huggingface/testing-rocm7.0-preview
ci_event: Scheduled CI (AMD) - mi355
report_repo_id: hf-transformers-bot/transformers-ci-dummy
secrets: inherit

View File

@ -33,13 +33,10 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Setup
env:
prev_workflow_run_id: ${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}
other_workflow_run_id: ${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}
run: |
mkdir "setup_values"
echo "$prev_workflow_run_id" > "setup_values/prev_workflow_run_id.txt"
echo "$other_workflow_run_id" > "setup_values/other_workflow_run_id.txt"
echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
- name: Upload artifacts
uses: actions/upload-artifact@v4
@ -66,7 +63,7 @@ jobs:
with:
job: run_pipelines_torch_gpu
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
docker: huggingface/transformers-all-latest-gpu
docker: huggingface/transformers-pytorch-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
commit_sha: ${{ github.sha }}
@ -121,15 +118,3 @@ jobs:
report_repo_id: hf-internal-testing/transformers_daily_ci
commit_sha: ${{ github.sha }}
secrets: inherit
kernels-ci:
name: Kernels CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_kernels_gpu
slack_report_channel: "#transformers-ci-daily-kernels"
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
commit_sha: ${{ github.sha }}
secrets: inherit

View File

@ -1,60 +0,0 @@
name: Nvidia CI - Flash Attn
on:
repository_dispatch:
schedule:
- cron: "17 2 * * *"
push:
branches:
- run_nvidia_ci_flash_attn*
workflow_dispatch:
inputs:
prev_workflow_run_id:
description: 'previous workflow run id to compare'
type: string
required: false
default: ""
other_workflow_run_id:
description: 'other workflow run id to compare'
type: string
required: false
default: ""
# Used for `push` to easily modify the target workflow runs to compare against
env:
prev_workflow_run_id: ""
other_workflow_run_id: ""
jobs:
setup:
name: Setup
runs-on: ubuntu-22.04
steps:
- name: Setup
run: |
mkdir "setup_values"
echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: setup_values
path: setup_values
model-ci:
name: Model CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-flash-attn"
docker: huggingface/transformers-all-latest-gpu:flash-attn
ci_event: Daily CI
runner_type: "a10"
report_repo_id: hf-internal-testing/transformers_flash_attn_ci
commit_sha: ${{ github.sha }}
pytest_marker: "flash_attn_test or flash_attn_3_test"
secrets: inherit

View File

@ -34,20 +34,10 @@ on:
runner_type:
required: false
type: string
subdirs:
models:
default: ""
required: false
type: string
pytest_marker:
required: false
type: string
pr_number:
required: false
type: string
outputs:
report:
description: "Content of the report of new failures"
value: ${{ jobs.check_new_failures.outputs.report }}
env:
HF_HOME: /mnt/cache
@ -60,6 +50,7 @@ env:
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
TF_FORCE_GPU_ALLOW_GROWTH: true
CUDA_VISIBLE_DEVICES: 0,1
NUM_SLICES: 2
jobs:
setup:
@ -80,11 +71,8 @@ jobs:
steps:
- name: Update clone
working-directory: /transformers
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: |
git fetch origin $commit_sha
git fetch && git checkout $commit_sha
git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Cleanup
working-directory: /transformers
@ -101,17 +89,11 @@ jobs:
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
name: Identify models to test
working-directory: /transformers/tests
env:
job: ${{ inputs.job }}
subdirs: ${{ inputs.subdirs }}
NUM_SLICES: 2
run: |
if [ "$job" = "run_models_gpu" ]; then
python3 ../utils/split_model_tests.py --subdirs "$subdirs" --num_splits "$NUM_SLICES" > folder_slices.txt
echo "folder_slices=$(cat folder_slices.txt)" >> $GITHUB_OUTPUT
python3 -c "import ast; folder_slices = ast.literal_eval(open('folder_slices.txt').read()); open('slice_ids.txt', 'w').write(str(list(range(len(folder_slices)))))"
echo "slice_ids=$(cat slice_ids.txt)" >> $GITHUB_OUTPUT
elif [ "$job" = "run_trainer_and_fsdp_gpu" ]; then
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
echo "folder_slices=$(python3 ../utils/split_model_tests.py --models '${{ inputs.models }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
fi
@ -120,10 +102,8 @@ jobs:
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
name: Identify quantization method to test
working-directory: /transformers/tests
env:
subdirs: ${{ inputs.subdirs || 'None' }}
run: |
echo "quantization_matrix=$(python3 -c 'import ast; import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); subdirs = ast.literal_eval(os.environ["subdirs"]); quantization_tests = [x.removeprefix("quantization/") for x in subdirs] if subdirs is not None else quantization_tests; d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))); print(d)')" >> $GITHUB_OUTPUT
echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
- name: NVIDIA-SMI
run: |
@ -147,7 +127,6 @@ jobs:
commit_sha: ${{ inputs.commit_sha || github.sha }}
runner_type: ${{ inputs.runner_type }}
report_repo_id: ${{ inputs.report_repo_id }}
pytest_marker: ${{ inputs.pytest_marker }}
secrets: inherit
run_trainer_and_fsdp_gpu:
@ -181,14 +160,12 @@ jobs:
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-all-latest-gpu
image: huggingface/transformers-pytorch-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: git fetch && git checkout "$commit_sha"
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
@ -210,17 +187,15 @@ jobs:
- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
env:
matrix_machine_type: ${{ matrix.machine_type }}
run: |
echo "$matrix_machine_type"
echo "${{ matrix.machine_type }}"
if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type="$matrix_machine_type"
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
@ -229,12 +204,12 @@ jobs:
- name: Run all pipeline tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports="${machine_type}_run_pipelines_torch_gpu_test_reports" tests/pipelines
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat "/transformers/reports/${machine_type}_run_pipelines_torch_gpu_test_reports/failures_short.txt"
run: cat /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
if: ${{ always() }}
@ -258,9 +233,7 @@ jobs:
steps:
- name: Update clone
working-directory: /transformers
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: git fetch && git checkout "$commit_sha"
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
@ -282,17 +255,15 @@ jobs:
- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
env:
matrix_machine_type: ${{ matrix.machine_type }}
run: |
echo "$matrix_machine_type"
echo "${{ matrix.machine_type }}"
if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type="$matrix_machine_type"
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
@ -302,12 +273,12 @@ jobs:
working-directory: /transformers
run: |
pip install -r examples/pytorch/_tests_requirements.txt
python3 -m pytest -v --make-reports="${machine_type}_run_examples_gpu_test_reports" examples/pytorch
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat "/transformers/reports/${machine_type}_run_examples_gpu_test_reports/failures_short.txt"
run: cat /transformers/reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
if: ${{ always() }}
@ -331,9 +302,7 @@ jobs:
steps:
- name: Update clone
working-directory: ${{ inputs.working-directory-prefix }}/transformers
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: git fetch && git checkout "$commit_sha"
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: ${{ inputs.working-directory-prefix }}/transformers
@ -355,7 +324,7 @@ jobs:
working-directory: ${{ inputs.working-directory-prefix }}/
run: |
python3 -m pip uninstall -y deepspeed
DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check
DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
# To avoid unknown test failures
- name: Pre build DeepSpeed *again* (for nightly & Past CI)
@ -365,7 +334,7 @@ jobs:
python3 -m pip uninstall -y deepspeed
rm -rf DeepSpeed
git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |
@ -383,17 +352,15 @@ jobs:
- name: Set `machine_type` for report and artifact names
working-directory: ${{ inputs.working-directory-prefix }}/transformers
shell: bash
env:
matrix_machine_type: ${{ matrix.machine_type }}
run: |
echo "$matrix_machine_type"
echo "${{ matrix.machine_type }}"
if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type="$matrix_machine_type"
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
@ -402,14 +369,12 @@ jobs:
- name: Run all tests on GPU
working-directory: ${{ inputs.working-directory-prefix }}/transformers
run: |
python3 -m pytest -v --make-reports="${machine_type}_run_torch_cuda_extensions_gpu_test_reports" tests/deepspeed tests/extended
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
env:
working_directory_prefix: ${{ inputs.working-directory-prefix }}
run: cat "${working_directory_prefix}/transformers/reports/${machine_type}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt"
run: cat ${{ inputs.working-directory-prefix }}/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
if: ${{ always() }}
@ -436,19 +401,16 @@ jobs:
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
env:
matrix_folders_raw: ${{ matrix.folders }}
run: |
echo "$matrix_folders_raw"
matrix_folders="${matrix_folders_raw/'quantization/'/'quantization_'}"
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: Update clone
working-directory: /transformers
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: git fetch && git checkout "$commit_sha"
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
@ -470,17 +432,15 @@ jobs:
- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
env:
matrix_machine_type: ${{ matrix.machine_type }}
run: |
echo "$matrix_machine_type"
echo "${{ matrix.machine_type }}"
if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type="$matrix_machine_type"
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
@ -488,96 +448,20 @@ jobs:
- name: Run quantization tests on GPU
working-directory: /transformers
env:
folders: ${{ matrix.folders }}
run: |
python3 -m pytest -v --make-reports="${machine_type}_run_quantization_torch_gpu_${matrix_folders}_test_reports" tests/${folders}
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat "/transformers/reports/${machine_type}_run_quantization_torch_gpu_${matrix_folders}_test_reports/failures_short.txt"
run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
run_kernels_gpu:
if: ${{ inputs.job == 'run_kernels_gpu' }}
name: Kernel tests
strategy:
fail-fast: false
matrix:
machine_type: [aws-g5-4xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: ${{ inputs.docker }}
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: git fetch && git checkout "$commit_sha"
- name: Reinstall transformers in edit mode
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[testing]
- name: Install kernels
working-directory: /transformers
run: python3 -m pip install -U kernels
- name: NVIDIA-SMI
run: nvidia-smi
- name: Environment
working-directory: /transformers
run: python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
env:
matrix_machine_type: ${{ matrix.machine_type }}
run: |
echo "$matrix_machine_type"
if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type="$matrix_machine_type"
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Run kernel tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -v --make-reports="${machine_type}_run_kernels_gpu_test_reports" tests/kernels/test_kernels.py
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat "/transformers/reports/${machine_type}_run_kernels_gpu_test_reports/failures_short.txt"
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_kernels_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_kernels_gpu_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_kernels_gpu_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
run_extract_warnings:
# Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic.
@ -586,10 +470,11 @@ jobs:
runs-on: ubuntu-22.04
needs: [setup, run_models_gpu]
steps:
# Checkout in order to run `utils/extract_warnings.py`. Avoid **explicit** checkout (i.e. don't specify `ref`) for
# security reason.
- name: Checkout transformers
uses: actions/checkout@v4
with:
fetch-depth: 2
ref: ${{ inputs.commit_sha || github.sha }}
- name: Install transformers
run: pip install transformers
@ -609,12 +494,9 @@ jobs:
working-directory: warnings_in_ci
- name: Extract warnings in CI artifacts
env:
github_run_id: ${{ github.run_id }}
access_token: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
run: |
python3 utils/extract_warnings.py --workflow_run_id "$github_run_id" --output_dir warnings_in_ci --token "$access_token" --from_gh
echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d); print(d)')"
python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
- name: Upload artifact
if: ${{ always() }}
@ -633,7 +515,6 @@ jobs:
run_examples_gpu,
run_torch_cuda_extensions_gpu,
run_quantization_torch_gpu,
run_kernels_gpu,
run_extract_warnings
]
if: always() && !cancelled()
@ -653,17 +534,16 @@ jobs:
secrets: inherit
check_new_failures:
if: ${{ always() && needs.send_results.result == 'success' }}
if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }}
name: Check new failures
needs: send_results
uses: ./.github/workflows/check_failed_tests.yml
with:
docker: ${{ inputs.docker }}
commit_sha: ${{ inputs.commit_sha || github.sha }}
start_sha: ${{ inputs.commit_sha || github.sha }}
job: ${{ inputs.job }}
slack_report_channel: ${{ inputs.slack_report_channel }}
ci_event: ${{ inputs.ci_event }}
report_repo_id: ${{ inputs.report_repo_id }}
pr_number: ${{ inputs.pr_number }}
secrets: inherit

View File

@ -41,16 +41,13 @@ jobs:
- name: Preliminary job status
shell: bash
# For the meaning of these environment variables, see the job `Setup`
env:
setup_status: ${{ inputs.setup_status }}
run: |
echo "Setup status: $setup_status"
echo "Setup status: ${{ inputs.setup_status }}"
- uses: actions/checkout@v4
with:
fetch-depth: 2
# Security: checkout to the `main` branch for untrusted triggers (issue_comment, pull_request_target), otherwise use the specified ref
ref: ${{ (github.event_name == 'issue_comment' || github.event_name == 'pull_request_target') && 'main' || (inputs.commit_sha || github.sha) }}
ref: ${{ inputs.commit_sha || github.sha }}
- uses: actions/download-artifact@v4
@ -84,8 +81,6 @@ jobs:
CI_TEST_JOB: ${{ inputs.job }}
SETUP_STATUS: ${{ inputs.setup_status }}
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
quantization_matrix: ${{ inputs.quantization_matrix }}
folder_slices: ${{ inputs.folder_slices }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
# For a job that doesn't depend on (i.e. `needs`) `setup`, the value for `inputs.folder_slices` would be an
@ -94,10 +89,10 @@ jobs:
pip install huggingface_hub
pip install slack_sdk
pip show slack_sdk
if [ "$quantization_matrix" != "" ]; then
python utils/notification_service.py "$quantization_matrix"
if [ "${{ inputs.quantization_matrix }}" != "" ]; then
python utils/notification_service.py "${{ inputs.quantization_matrix }}"
else
python utils/notification_service.py "$folder_slices"
python utils/notification_service.py "${{ inputs.folder_slices }}"
fi
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.

View File

@ -4,7 +4,7 @@ on:
workflow_dispatch:
inputs:
runner_type:
description: 'Type of runner to test (a10)'
description: 'Type of runner to test (a10 or t4)'
required: true
docker_image:
description: 'Name of the Docker image'
@ -36,10 +36,14 @@ jobs:
NUM_GPUS: ${{ github.event.inputs.num_gpus }}
RUNNER_TYPE: ${{ github.event.inputs.runner_type }}
run: |
if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
echo "RUNNER=aws-g5-4xlarge-cache-ssh" >> $GITHUB_ENV
if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "t4" ]]; then
echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "t4" ]]; then
echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
elif [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
echo "RUNNER=aws-g5-4xlarge-cache" >> $GITHUB_ENV
elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "a10" ]]; then
echo "RUNNER=aws-g5-12xlarge-cache-ssh" >> $GITHUB_ENV
echo "RUNNER=aws-g5-12xlarge-cache" >> $GITHUB_ENV
else
echo "RUNNER=" >> $GITHUB_ENV
fi
@ -47,8 +51,8 @@ jobs:
- name: Set runner to use
id: set_runner
run: |
echo "$RUNNER"
echo "RUNNER=$RUNNER" >> $GITHUB_OUTPUT
echo ${{ env.RUNNER }}
echo "RUNNER=${{ env.RUNNER }}" >> $GITHUB_OUTPUT
ssh_runner:
name: "SSH"
@ -57,13 +61,13 @@ jobs:
group: ${{ needs.get_runner.outputs.RUNNER }}
container:
image: ${{ github.event.inputs.docker_image }}
options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
env:
commit_sha: ${{ github.sha }}
run: |
git fetch && git checkout "$commit_sha"
git fetch && git checkout ${{ github.sha }}
- name: Cleanup
working-directory: /transformers
@ -95,17 +99,14 @@ jobs:
- name: Store Slack infos
#because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step
shell: bash
env:
user_slack_id: ${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}
default_slack_channel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
run: |
echo "$github_actor"
if [ "$user_slack_id" != "" ]; then
echo "SLACKCHANNEL=$user_slack_id" >> $GITHUB_ENV
echo "${{ env.github_actor }}"
if [ "${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" != "" ]; then
echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" >> $GITHUB_ENV
else
echo "SLACKCHANNEL=$default_slack_channel" >> $GITHUB_ENV
echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV
fi
- name: Tailscale # In order to be able to SSH when a test fails
uses: huggingface/tailscale-action@main
with:

View File

@ -14,7 +14,7 @@ This AGENTS.md file provides guidance for code agents working with this codebase
- PRs should be as brief as possible. Bugfix PRs in particular can often be only one or two lines long, and do not need large comments, docstrings or new functions in this case. Aim to minimize the size of the diff.
- When writing tests, they should be added to an existing file. The only exception is for PRs to add a new model, when a new test directory should be created for that model.
- Code style is enforced in the CI. You can install the style tools with `pip install -e ".[quality]"`. You can then run `make fixup` to apply style and consistency fixes to your code.
- Code style is enforced in the CI. You can install the style tools with `pip install -e .[quality]`. You can then run `make fixup` to apply style and consistency fixes to your code.
## Copying and inheritance
@ -36,4 +36,4 @@ After making changes, you should usually run `make fixup` to ensure any copies a
the model you made the changes in and any other models that were updated by `make fixup`. Tests can be run with `pytest tests/models/[name]/test_modeling_[name].py`
If your changes affect code in other classes like tokenizers or processors, you should run those tests instead, like `test_processing_[name].py` or `test_tokenization_[name].py`.
In order to run tests, you may need to install dependencies. You can do this with `pip install -e ".[testing]"`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.
In order to run tests, you may need to install dependencies. You can do this with `pip install -e .[testing]`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.

View File

@ -64,8 +64,8 @@ limitations under the License.
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
</h3>
Transformers acts as the model-definition framework for state-of-the-art machine learning with text, computer
vision, audio, video, and multimodal models, for both inference and training.
Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
vision, audio, video, and multimodal model, for both inference and training.
It centralizes the model definition so that this definition is agreed upon across the ecosystem. `transformers` is the
pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training

View File

@ -9,12 +9,6 @@ In this list, we showcase incredibly impactful and novel projects that have push
adding other projects to the list. If you believe a project should be here and it's not, then please, open a PR
to add it.
## [◉ Universal Intelligence](https://github.com/blueraai/universal-intelligence)
[Universal Intelligence](https://github.com/blueraai/universal-intelligence) aims to standardize models, tools, and agents —transforming them into simple, composable, portable, interoperable, framework-agnostic, hardware-agnostic interfaces (through auto-negotiation and resource sharing); for fast and accessible development of AI applications.
Keywords: Protocol, Open-source, LLMs, Large Language Models, Agents, Low-code
## [gpt4all](https://github.com/nomic-ai/gpt4all)
[gpt4all](https://github.com/nomic-ai/gpt4all) is an ecosystem of open-source chatbots trained on massive collections of clean assistant data including code, stories and dialogue. It offers open-source, large language models such as LLaMA and GPT-J trained in an assistant-style.

View File

@ -1,5 +1,6 @@
gpustat==1.1.1
psutil==6.0.0
psycopg2==2.9.9
torch>=2.4.0
hf_xet
pandas>=1.5.0
pandas>=1.5.0

View File

@ -1,11 +1,8 @@
import hashlib
import itertools
import json
import logging
from typing import Any
from transformers.utils.import_utils import is_flash_attn_2_available
KERNELIZATION_AVAILABLE = False
try:
@ -21,22 +18,11 @@ logger = logging.getLogger(__name__)
class BenchmarkConfig:
"""Configuration for a single benchmark scenario."""
all_attn_implementations = [
("flash_attention_2", None),
("eager", None),
("sdpa", "math"),
("sdpa", "flash_attention"),
("flex_attention", None),
]
all_compiled_modes = [None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"]
def __init__(
self,
warmup_iterations: int = 5,
measurement_iterations: int = 20,
gpu_monitoring: bool = True, # NOTE: you may want to disable this at times as we have obsvered it could heavily slow down benchmarks on AMD
continuous_batching: bool = False,
batch_size: int = 1,
sequence_length: int = 128,
num_tokens_to_generate: int = 128,
@ -52,7 +38,6 @@ class BenchmarkConfig:
self.warmup_iterations = warmup_iterations
self.measurement_iterations = measurement_iterations
self.gpu_monitoring = gpu_monitoring
self.continuous_batching = continuous_batching
# Input parameters
self.batch_size = batch_size
self.sequence_length = sequence_length
@ -74,35 +59,12 @@ class BenchmarkConfig:
def check_validity(self, skip_validity_check: bool = False) -> None:
if skip_validity_check:
return
# Check FA is installed
if self.attn_implementation == "flash_attention_2" and not is_flash_attn_2_available():
logger.warning(
"Flash attention does not support compile mode. Defaulting to SDPA w/ flash attention backend."
)
self.attn_implementation = "sdpa"
self.sdpa_backend = "flash_attention"
# Flash attention does not support compile mode, so we turn it off # FIXME: it would be better to support it
is_fa = self.attn_implementation == "flash_attention_2"
is_fa |= self.attn_implementation == "sdpa" and self.sdpa_backend == "flash_attention"
if is_fa:
logger.warning("Flash attention does not support compile mode. Turning off compile mode.")
self.compile_mode = None
# Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
if self.attn_implementation == "sdpa" and self.sdpa_backend is None:
default_backend = "flash_attention" # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
self.sdpa_backend = default_backend
if self.continuous_batching:
if self.attn_implementation == "flex_attention":
logger.error(
"disabling continuous batching because of invalid configuration: flex attention is not supported"
)
self.continuous_batching = False
elif self.attn_implementation == "sdpa" and self.sdpa_backend is not None:
logger.warning(
"when continuous batching is enabled, sdpa_backend must be None because of the attention mask, setting it to None"
)
self.sdpa_backend = "math"
@property
def hash(self) -> str:
@ -118,7 +80,6 @@ class BenchmarkConfig:
attn_code += f"_{self.sdpa_backend}" if self.attn_implementation == "sdpa" else ""
compile_str = f"compiled_{self.compile_mode}" if self.compile_mode is not None else "uncompiled"
kernelize_str = "kernelized" if self.kernelize else "unkernelized"
continuous_batching_str = "cb" if self.continuous_batching else "generate"
sep = "-"
else:
iter_str = f"{self.warmup_iterations} warmup, {self.measurement_iterations} iterations"
@ -128,11 +89,8 @@ class BenchmarkConfig:
attn_code += f" with {self.sdpa_backend} backend" if self.attn_implementation == "sdpa" else ""
compile_str = "compiled" if self.compile_mode is not None else "not compiled"
kernelize_str = "kernelized" if self.kernelize else "not kernelized"
continuous_batching_str = "continuous batching" if self.continuous_batching else "regular generate"
sep = ", "
return sep.join(
[iter_str, gpu_monitor_str, dimensions_str, attn_code, compile_str, kernelize_str, continuous_batching_str]
)
return sep.join([iter_str, gpu_monitor_str, dimensions_str, attn_code, compile_str, kernelize_str])
def to_dict(self) -> dict[str, Any]:
return {
@ -140,7 +98,6 @@ class BenchmarkConfig:
"warmup_iterations": self.warmup_iterations,
"measurement_iterations": self.measurement_iterations,
"gpu_monitoring": self.gpu_monitoring,
"continuous_batching": self.continuous_batching,
"batch_size": self.batch_size,
"sequence_length": self.sequence_length,
"num_tokens_to_generate": self.num_tokens_to_generate,
@ -157,7 +114,6 @@ class BenchmarkConfig:
warmup_iterations=data.get("warmup_iterations", 5),
measurement_iterations=data.get("measurement_iterations", 20),
gpu_monitoring=data.get("gpu_monitoring", False),
continuous_batching=data.get("continuous_batching", False),
batch_size=data.get("batch_size", 1),
sequence_length=data.get("sequence_length", 128),
num_tokens_to_generate=data.get("num_tokens_to_generate", 128),
@ -171,72 +127,88 @@ class BenchmarkConfig:
)
def adapt_configs(
configs: list[BenchmarkConfig],
warmup_iterations: int | list[int] = 5,
measurement_iterations: int | list[int] = 20,
batch_size: int | list[int] = 1,
sequence_length: int | list[int] = 128,
num_tokens_to_generate: int | list[int] = 128,
gpu_monitoring: bool | list[bool] = True,
def cross_generate_configs(
attn_impl_and_sdpa_backend: list[tuple[str, str | None]],
compiled_mode: list[str | None],
kernelized: list[bool],
warmup_iterations: int = 5,
measurement_iterations: int = 20,
batch_size: int = 1,
sequence_length: int = 128,
num_tokens_to_generate: int = 128,
gpu_monitoring: bool = True,
) -> list[BenchmarkConfig]:
parameters = (
x if isinstance(x, list) else [x]
for x in [
warmup_iterations,
measurement_iterations,
batch_size,
sequence_length,
num_tokens_to_generate,
gpu_monitoring,
]
)
iterator = itertools.product(*parameters)
adapted_configs = []
for warmup_iters, measurement_iters, bs, seqlen, ntok, monitor in iterator:
for config in configs:
config = config.to_dict()
config["warmup_iterations"] = warmup_iters
config["measurement_iterations"] = measurement_iters
config["batch_size"] = bs
config["sequence_length"] = seqlen
config["num_tokens_to_generate"] = ntok
config["gpu_monitoring"] = monitor
adapted_configs.append(BenchmarkConfig.from_dict(config))
return adapted_configs
def get_config_by_level(level: int) -> list[BenchmarkConfig]:
# Create kwargs common to all configs
kwargs = {
"warmup_iterations": warmup_iterations,
"measurement_iterations": measurement_iterations,
"batch_size": batch_size,
"sequence_length": sequence_length,
"num_tokens_to_generate": num_tokens_to_generate,
"gpu_monitoring": gpu_monitoring,
}
# Cross-generate all combinations of attn_implementation, compiled_mode, and kernelized
configs = []
# Early return if level is greater than 3: we generate all combinations of configs, maybe even w/ all compile modes
if level >= 3:
for attn_implementation, sdpa_backend in BenchmarkConfig.all_attn_implementations:
# Usually there is not much to gain by compiling with other modes, but we allow it for level 4
compile_modes = BenchmarkConfig.all_compiled_modes if level >= 4 else [None, "default"]
for cm in compile_modes:
for kernelize_on in {False, KERNELIZATION_AVAILABLE}:
for cb_on in [False, True]:
configs.append(
BenchmarkConfig(
attn_implementation=attn_implementation,
sdpa_backend=sdpa_backend,
compile_mode=cm,
kernelize=kernelize_on,
continuous_batching=cb_on,
)
)
return configs
# Otherwise, we add the configs for the given level
if level >= 0:
configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default"))
if level >= 1:
configs.append(BenchmarkConfig(attn_implementation="flash_attention_2"))
configs.append(BenchmarkConfig(attn_implementation="eager", compile_mode="default"))
configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", continuous_batching=True))
if level >= 2:
configs.append(BenchmarkConfig(attn_implementation="sdpa", compile_mode="default"))
configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", kernelize=True))
configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", kernelize=True))
configs.append(BenchmarkConfig(attn_implementation="paged|sdpa", continuous_batching=True))
for attn_implementation, sdpa_backend in list(dict.fromkeys(attn_impl_and_sdpa_backend)):
for cm in list(dict.fromkeys(compiled_mode)):
for kernelize_on in list(dict.fromkeys(kernelized)):
config = BenchmarkConfig(
attn_implementation=attn_implementation,
sdpa_backend=sdpa_backend,
compile_mode=cm,
kernelize=kernelize_on,
**kwargs,
)
configs.append(config)
return configs
def generate_all_configs(
warmup_iterations: int = 5,
measurement_iterations: int = 20,
batch_size: int = 1,
sequence_length: int = 128,
num_tokens_to_generate: int = 128,
gpu_monitoring: bool = True,
) -> list[BenchmarkConfig]:
all_attn_implementations = [
("flash_attention_2", None),
("eager", None),
("sdpa", "math"),
("sdpa", "flash_attention"),
("flex_attention", None),
]
return cross_generate_configs(
attn_impl_and_sdpa_backend=all_attn_implementations,
compiled_mode=[None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"],
kernelized=[False, KERNELIZATION_AVAILABLE],
warmup_iterations=warmup_iterations,
measurement_iterations=measurement_iterations,
batch_size=batch_size,
sequence_length=sequence_length,
num_tokens_to_generate=num_tokens_to_generate,
gpu_monitoring=gpu_monitoring,
)
def generate_main_configs(
warmup_iterations: int = 5,
measurement_iterations: int = 20,
batch_size: int = 1,
sequence_length: int = 128,
num_tokens_to_generate: int = 128,
) -> list[BenchmarkConfig]:
# Create kwargs common to all configs
kwargs = {
"warmup_iterations": warmup_iterations,
"measurement_iterations": measurement_iterations,
"batch_size": batch_size,
"sequence_length": sequence_length,
"num_tokens_to_generate": num_tokens_to_generate,
}
return [ # TODO: test max-autotune instead of default
BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=False, **kwargs),
BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=True, **kwargs),
BenchmarkConfig(attn_implementation="eager", compile_mode="default", gpu_monitoring=True, **kwargs),
BenchmarkConfig(attn_implementation="flash_attention_2", gpu_monitoring=True, **kwargs),
]

View File

@ -117,6 +117,8 @@ def flush_memory():
# Clear CUDA cache
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
torch.cuda.synchronize()
gc.collect()
@ -232,9 +234,8 @@ class BenchmarkRunner:
self.logger.info(f"Running benchmark scenario: {config.name}")
# Quick validation: try one measurement first to see if this scenario works
generate_fn = self.time_generate_batch if config.continuous_batching else self.time_generate
flush_memory()
e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
max_new_tokens=1, gpu_monitor=None
)
if e2e_latency < 0:
@ -244,14 +245,14 @@ class BenchmarkRunner:
# Warmup runs
self.logger.info(f"Warming up with {config.warmup_iterations} iterations...")
for _ in trange(config.warmup_iterations):
_ = generate_fn(max_new_tokens=config.num_tokens_to_generate)
_ = self.time_generate(max_new_tokens=config.num_tokens_to_generate)
self.logger.info("Warmup over.")
# Measurement runs
result = BenchmarkResult()
self.logger.info(f"Benchmarking with {config.measurement_iterations} iterations.")
for _ in trange(config.measurement_iterations):
e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
max_new_tokens=config.num_tokens_to_generate,
gpu_monitor=(GPUMonitor(logger=self.logger) if config.gpu_monitoring else None),
)
@ -273,58 +274,6 @@ class BenchmarkRunner:
"config": config,
}
# TODO: refactor `generate_batch` to handle streaming so we can use it here
def time_generate_batch(
self,
max_new_tokens: int,
gpu_monitor: GPUMonitor | None = None,
) -> tuple[float, list[float], str, GPURawMetrics | None]:
if gpu_monitor is not None:
gpu_monitor.start()
config = GenerationConfig(
max_new_tokens=max_new_tokens,
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.pad_token_id,
do_sample=True,
)
manager = self.model.init_continuous_batching(config)
manager.start()
try:
first_req_results = []
timestamps = []
wall_time_0 = time.perf_counter()
inputs = self.inputs["input_ids"].tolist()
manager.add_requests(inputs, max_new_tokens=max_new_tokens, streaming=True)
first_req_id = None
num_requests = len(inputs)
finished_requests = 0
while finished_requests < num_requests:
# NOTE: I don't like having the extra if stmt here, but hopefully won't degrade perf too much
result = manager.get_result()
if result:
timestamps.append(time.perf_counter() - wall_time_0)
if result.is_finished():
finished_requests += 1
if first_req_id is None:
first_req_id = result.request_id
if result.request_id == first_req_id:
first_req_results.append(result)
else:
if not manager.is_running():
raise RuntimeError("Generation thread exited unexpectedly")
wall_time_1 = time.perf_counter()
gpu_metrics = gpu_monitor.stop_and_collect() if gpu_monitor is not None else None
decoded_output = self.tokenizer.decode(
[res.generated_tokens[0] for res in first_req_results], skip_special_tokens=True
)
shape_and_decoded_output = f"{(1, len(first_req_results))} | {decoded_output}"
e2e_latency = wall_time_1 - wall_time_0
return e2e_latency, timestamps, shape_and_decoded_output, gpu_metrics
except Exception as e:
raise e
finally:
manager.stop()
def time_generate(
self,
max_new_tokens: int,
@ -390,6 +339,12 @@ class BenchmarkRunner:
n_configs = len(benchmark_configs)
for i, config in enumerate(benchmark_configs):
# Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
if config.attn_implementation == "sdpa" and config.sdpa_backend is None:
default_backend = "flash_attention" # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
self.logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
config.sdpa_backend = default_backend
# Skip if already run
if config.hash in all_results:
self.logger.info(f"Skipping duplicate config {config.name} for model {model_id} ({i + 1}/{n_configs})")
@ -413,27 +368,21 @@ class BenchmarkRunner:
self.cleanup()
self.save_results(model_id, all_results, timestamp=timestamp)
if len(all_results) < 1:
raise RuntimeError("No benchmark was run succesfully")
if pretty_print_summary:
print()
print("=" * 100)
print(f"Finished benchmarks in {time.perf_counter() - start_time:.2f} seconds")
print(f"Total number of benchmarks: {len(all_results)}")
print("First run metadata:")
first_key = list(all_results.keys())[0]
first_metadata = all_results[first_key]["metadata"].to_dict()
hardware_info = first_metadata.pop("hardware_info")
pretty_print_dict(first_metadata | hardware_info, tabs=1)
if len(all_results) > 0:
print("First run metadata:")
first_key = list(all_results.keys())[0]
first_metadata = all_results[first_key]["metadata"].to_dict()
hardware_info = first_metadata.pop("hardware_info")
pretty_print_dict(first_metadata | hardware_info, tabs=1)
for result in all_results.values():
print("=" * 100)
print(f"Config: {result['config'].infer_name(compact=False)}\n")
result["measurements"].pprint(
batch_size=result["config"].batch_size,
num_generated_tokens=result["config"].num_tokens_to_generate,
tabs=1,
)
result["measurements"].pprint(batch_size=result["config"].batch_size, tabs=1)
print("=" * 100)
return (timestamp, all_results)

View File

@ -36,17 +36,16 @@ def add_unit_to_duration(stats: dict[str, float]) -> dict[str, str]:
return stats
def equalize_lengths_and_collate(stats: dict[str, dict[str, str]]) -> dict[str, str]:
"""Note: This operation is destructive as it will update values in place before returning a new correctly formatted dict"""
def equalize_lengths_and_collate(stats: list[dict[str, str]]) -> list[str]:
keys = ["avg", "std", "min", "med", "max", "p95"]
for key in keys:
max_length = max(len(stat[key]) for stat in stats.values())
for stat in stats.values():
max_length = max(len(stat[key]) for stat in stats)
for stat in stats:
stat[key] = stat[key].ljust(max_length, " ")
return {name: " ".join([f"{key}={stat[key]}" for key in keys]) for name, stat in stats.items()}
return [" ".join([f"{key}={stat[key]}" for key in keys]) for stat in stats]
def pretty_print_dict(data: dict[str, str], tabs: int = 0) -> None:
def pretty_print_dict(data: dict[str, Any], tabs: int = 0) -> None:
max_key_length = max([len(key) for key in data.keys()])
for key, value in data.items():
tabs_str = " " * tabs
@ -142,19 +141,27 @@ class BenchmarkResult:
def get_measured_itl(self) -> list[float]:
return [(dt[-1] - dt[0]) / (len(dt) - 1) for dt in self.token_generation_times if len(dt) > 1]
def get_throughput(self, total_generated_tokens: int) -> list[float]:
return [total_generated_tokens / e2e_latency for e2e_latency in self.e2e_latency]
def get_throughput(self, batch_size: int) -> float:
return [
batch_size * len(dt) / e2e_latency
for e2e_latency, dt in zip(self.e2e_latency, self.token_generation_times)
]
def pprint(self, batch_size: int = 0, num_generated_tokens: int = 0, tabs: int = 0) -> None:
measurements = {
"E2E Latency": add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
"Time to First Token": add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
}
itl_values = self.get_measured_itl()
if len(itl_values) > 0:
measurements["Inter-Token Latency"] = add_unit_to_duration(compute_basic_statistics(itl_values))
def pprint(self, batch_size: int = 0, tabs: int = 0) -> None:
stats_to_collate = [
add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
]
if batch_size > 0:
throughput_stats = compute_basic_statistics(self.get_throughput(batch_size * num_generated_tokens))
measurements["Throughput"] = {key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()}
dict_to_pprint = equalize_lengths_and_collate(measurements)
throughput_stats = compute_basic_statistics(self.get_throughput(batch_size))
stats_to_collate.append({key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()})
collated_stats = equalize_lengths_and_collate(stats_to_collate)
dict_to_pprint = {
"E2E Latency": collated_stats[0],
"Time to First Token": collated_stats[1],
"Inter-Token Latency": collated_stats[2],
}
if batch_size > 0:
dict_to_pprint["Throughput"] = collated_stats[3]
pretty_print_dict(dict_to_pprint, tabs=tabs)

View File

@ -2,5 +2,6 @@ numpy>=1.21.0
psutil>=5.8.0
gpustat>=1.0.0
torch>=2.0.0
transformers>=4.30.0
datasets>=2.10.0
huggingface_hub>=0.16.0

View File

@ -23,7 +23,7 @@ import logging
import sys
import uuid
from framework.benchmark_config import adapt_configs, get_config_by_level
from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
from framework.benchmark_runner import BenchmarkRunner
@ -40,14 +40,7 @@ if __name__ == "__main__":
parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")
parser.add_argument(
"--level",
type=int,
default=1,
help="Level of coverage for the benchmark. 0: only the main config, 1: a few important configs, 2: a config for"
" each attn implementation an option, 3: cross-generate all combinations of configs, 4: cross-generate all"
" combinations of configs w/ all compile modes",
)
parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")
parser.add_argument("--branch-name", type=str, help="Git branch name")
@ -80,34 +73,70 @@ if __name__ == "__main__":
logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
logger.info(f"Output directory: {args.output_dir}")
# We cannot compute ITL if we don't have at least two measurements
if any(n <= 1 for n in args.num_tokens_to_generate):
raise ValueError("--num_tokens_to_generate arguments should be larger than 1")
# Error out if one of the arguments is not provided
if len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 0:
raise ValueError(
"At least one of the arguments --batch-size, --sequence-length, or --num-tokens-to-generate is required"
)
# Get the configs for the given coverage level
configs = get_config_by_level(args.level)
# Adapt the configs to the given arguments
configs = adapt_configs(
configs,
args.warmup,
args.iterations,
args.batch_size,
args.sequence_length,
args.num_tokens_to_generate,
not args.no_gpu_monitoring,
)
# If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
if args.cross_generate:
benchmark_configs = generate_all_configs(
warmup_iterations=args.warmup,
measurement_iterations=args.iterations,
batch_size=args.batch_size[0],
sequence_length=args.sequence_length[0],
num_tokens_to_generate=args.num_tokens_to_generate[0],
gpu_monitoring=not args.no_gpu_monitoring,
)
else:
benchmark_configs = generate_main_configs(
warmup_iterations=args.warmup,
measurement_iterations=args.iterations,
batch_size=args.batch_size[0],
sequence_length=args.sequence_length[0],
num_tokens_to_generate=args.num_tokens_to_generate[0],
)
runner = BenchmarkRunner(logger, args.output_dir, args.branch_name, args.commit_id, args.commit_message)
# Otherwise, we benchmark across all combinations of dimensions
else:
main_config = generate_main_configs(
warmup_iterations=args.warmup,
measurement_iterations=args.iterations,
batch_size=args.batch_size[0],
sequence_length=args.sequence_length[0],
num_tokens_to_generate=args.num_tokens_to_generate[0],
)[0]
benchmark_configs = []
for num_tokens_to_generate in args.num_tokens_to_generate:
for sequence_length in args.sequence_length:
for batch_size in args.batch_size:
cfg_dict = main_config.to_dict()
cfg_dict["batch_size"] = batch_size
cfg_dict["sequence_length"] = sequence_length
cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
cfg_dict.pop("name")
benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))
runner = BenchmarkRunner(
logger,
args.output_dir,
args.branch_name,
args.commit_id,
args.commit_message,
)
timestamp, results = runner.run_benchmarks(
args.model_id, configs, args.num_tokens_to_profile, pretty_print_summary=True
args.model_id,
benchmark_configs,
args.num_tokens_to_profile,
pretty_print_summary=True,
)
dataset_id = args.push_result_to_dataset
if dataset_id is not None and len(results) > 0:
runner.push_results_to_hub(dataset_id, results, timestamp)
runner.push_results_to_hub(
dataset_id,
results,
timestamp,
)

View File

@ -87,8 +87,6 @@ def pytest_configure(config):
config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
config.addinivalue_line("markers", "flash_attn_test: mark test which tests flash attention functionality")
config.addinivalue_line("markers", "flash_attn_3_test: mark test which tests flash attention 3 functionality")
os.environ["DISABLE_SAFETENSORS_CONVERSION"] = "true"

View File

@ -5,7 +5,7 @@ ARG REF=main
RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
ENV UV_PYTHON=/usr/local/bin/python
RUN pip install uv && uv pip install --no-cache-dir -U pip setuptools GitPython
RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --upgrade 'torch<2.9' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir pypi-kenlm
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[quality,testing,torch-speech,vision]"
RUN git lfs install

View File

@ -17,7 +17,7 @@ RUN make install -j 10
WORKDIR /
RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache --upgrade 'torch<2.9' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,spacy,ftfy,rjieba]" unidic unidic-lite
# spacy is not used so not tested. Causes to failures. TODO fix later

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec<0.8' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1 g++ tesseract-ocr git-lfs curl
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --no-deps timm accelerate
RUN uv pip install -U --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
# RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg curl
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec<0.8' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec<0.8' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"

View File

@ -9,15 +9,10 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).
ARG PYTORCH='2.9.0'
ARG PYTORCH='2.8.0'
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu126'
# This needs to be compatible with the above `PYTORCH`.
ARG TORCHCODEC='0.8.0'
ARG FLASH_ATTN='false'
RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
RUN git lfs install
@ -26,48 +21,15 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev]
# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
# 2. For `torchcodec`, use `cpu` as we don't have `libnvcuvid.so` on the host runner. See https://github.com/meta-pytorch/torchcodec/issues/912
# **Important**: We need to specify `torchcodec` version if the torch version is not the latest stable one.
# 3. `set -e` means "exit immediately if any command fails".
RUN set -e; \
# Determine torch version
if [ ${#PYTORCH} -gt 0 ] && [ "$PYTORCH" != "pre" ]; then \
VERSION="torch==${PYTORCH}.*"; \
TORCHCODEC_VERSION="torchcodec==${TORCHCODEC}.*"; \
else \
VERSION="torch"; \
TORCHCODEC_VERSION="torchcodec"; \
fi; \
\
# Log the version being installed
echo "Installing torch version: $VERSION"; \
\
# Install PyTorch packages
if [ "$PYTORCH" != "pre" ]; then \
python3 -m pip install --no-cache-dir -U \
$VERSION \
torchvision \
torchaudio \
--extra-index-url https://download.pytorch.org/whl/$CUDA; \
# We need to specify the version if the torch version is not the latest stable one.
python3 -m pip install --no-cache-dir -U \
$TORCHCODEC_VERSION --extra-index-url https://download.pytorch.org/whl/cpu; \
else \
python3 -m pip install --no-cache-dir -U --pre \
torch \
torchvision \
torchaudio \
--extra-index-url https://download.pytorch.org/whl/nightly/$CUDA; \
python3 -m pip install --no-cache-dir -U --pre \
torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/cpu; \
fi
# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
# 3. For `torchcodec<0.8`: this is quickly added as torch 2.9.0 + torchcodec 0.8.0 fails on our CI env. Need to remove later once they work.
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio "torchcodec<0.8" --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
RUN python3 -m pip install --no-cache-dir -U timm
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir --no-build-isolation git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"
RUN python3 -m pip install --no-cache-dir pytesseract
@ -92,7 +54,7 @@ RUN python3 -m pip install --no-cache-dir bitsandbytes
RUN python3 -m pip install --no-cache-dir quanto
# After using A10 as CI runner, let's run FA2 tests
RUN [ "$FLASH_ATTN" != "false" ] && python3 -m pip uninstall -y ninja && python3 -m pip install --no-cache-dir ninja && python3 -m pip install flash-attn --no-cache-dir --no-build-isolation || echo "Don't install FA2 with nightly torch"
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip uninstall -y ninja && python3 -m pip install --no-cache-dir ninja && python3 -m pip install flash-attn --no-cache-dir --no-build-isolation || echo "Don't install FA2 with nightly torch"
# TODO (ydshieh): check this again
# `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests

View File

@ -10,7 +10,7 @@ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y te
# Torch needs to be installed before deepspeed
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed]
RUN python3 -m pip install --no-cache-dir --no-build-isolation torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
RUN python3 -m pip install -U "itsdangerous<2.1.0"
# Test if the image could successfully build the doc. before publishing the image

View File

@ -1,4 +1,4 @@
FROM rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.7.1
FROM rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
@ -10,8 +10,8 @@ RUN apt update && \
RUN git lfs install
RUN python3 -m pip install --no-cache-dir --upgrade pip numpy importlib-metadata setuptools wheel ninja pytesseract "itsdangerous<2.1.0"
RUN python3 -m pip install --no-cache-dir --no-build-isolation git+https://github.com/facebookresearch/detectron2.git
RUN python3 -m pip install --no-cache-dir --upgrade pip numpy
RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"
ARG REF=main
WORKDIR /
@ -39,7 +39,6 @@ RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
# Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \
cd flash-attention && \
GPU_ARCHS="gfx942" python setup.py install
# GPU_ARCHS builds for MI300, MI325 but not MI355: we would need to add `;gfx950` but it takes too long to build.
GPU_ARCHS="gfx942" python setup.py install
RUN python3 -m pip install --no-cache-dir einops

View File

@ -29,7 +29,7 @@ RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir
# Pre-build DeepSpeed, so it's be ready for testing (to avoid timeout)
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
ARG REF=main
WORKDIR /

View File

@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p
# Install latest release PyTorch
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
RUN python3 -m pip uninstall -y torch torchvision torchaudio torchcodec && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
@ -43,7 +43,7 @@ RUN python3 -m pip uninstall -y deepspeed
# This has to be run (again) inside the GPU VMs running the tests.
# The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
# TODO: Find out why test fail.
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check 2>&1
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels

View File

@ -24,7 +24,7 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch';
RUN echo torch=$VERSION
# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
# Currently, let's just use their latest releases (when `torch` is installed with a release version)
RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
@ -50,7 +50,7 @@ RUN python3 -m pip install --no-cache-dir hqq
RUN python3 -m pip install --no-cache-dir gguf
# Add autoawq for quantization testing
RUN python3 -m pip install --no-cache-dir --no-build-isolation autoawq[kernels]
RUN python3 -m pip install --no-cache-dir autoawq[kernels]
# Add quanto for quantization testing
RUN python3 -m pip install --no-cache-dir optimum-quanto
@ -81,7 +81,7 @@ RUN python3 -m pip uninstall -y flash-attn
RUN cd transformers && python3 setup.py develop
# Add fp-quant for quantization testing
RUN python3 -m pip install --no-cache-dir "fp-quant>=0.3.2"
RUN python3 -m pip install --no-cache-dir "fp-quant>=0.2.0"
# Low usage or incompatible lib, will enable later on

View File

@ -24,7 +24,7 @@ pip install -e ".[dev]"
```
> [!NOTE]
> This command might fail for some OS that are missing dependencies. Check step 4 in [Create a Pull Request](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#create-a-pull-request) to work around it.
> This command might fail for some OS that are missing dependencies. Check step 4 in [Create a Pull Request](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#create-a-pull-request) to workaround it.
Then you need to install our special tool that builds the documentation:
@ -38,7 +38,7 @@ pip install git+https://github.com/huggingface/doc-builder
## Building the documentation
Once you have set up the `doc-builder` and additional packages, you can generate the documentation by
Once you have setup the `doc-builder` and additional packages, you can generate the documentation by
typing the following command:
```bash
@ -295,11 +295,12 @@ Here's an example of a tuple return, comprising several objects:
Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos, and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate them to this dataset.
If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
to this dataset.
## Styling the docstring
We have an automatic script running with the `make style` command that will make sure that:
We have an automatic script running with the `make style` comment that will make sure that:
- the docstrings fully take advantage of the line width
- all code examples are formatted using black, like the code of the Transformers library

View File

@ -258,6 +258,8 @@
# title: النماذج
# - local: main_classes/text_generation
# title: توليد النصوص
# - local: main_classes/onnx
# title: ONNX
# - local: main_classes/optimizer_schedules
# title: التحسين
# - local: main_classes/output

View File

@ -32,7 +32,7 @@
لتصدير نموذج 🤗 Transformers إلى ONNX، قم أولاً بتثبيت اعتماد إضافي:
```bash
pip install optimum-onnx
pip install optimum[exporters]
```
للاطلاع على جميع المعامﻻت المتاحة، يرجى الرجوع إلى [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli)، أو عرض المساعدة في سطر الأوامر:
@ -111,3 +111,60 @@ optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_s
### تصدير نموذج لهندسة غير مدعومة
إذا كنت ترغب في المساهمة من خلال إضافة دعم لنموذج لا يُمكن تصديره حاليًا، فيجب عليك أولاً التحقق مما إذا كان مدعومًا في [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview)، وإذا لم يكن مدعومًا، [فيمكنك المساهمة في 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute) مُباشرةً.
### تصدير نموذج باستخدام `transformers.onnx`
<Tip warning={true}>
لم يعد يتم دعم `transformers.onnx` يُرجى تصدير النماذج باستخدام 🤗 Optimum كما هو موضح أعلاه. سيتم إزالة هذا القسم في الإصدارات القادمة.
</Tip>
لتصدير نموذج 🤗 Transformers إلى ONNX باستخدام `transformers.onnx`، ثبّت التبعيات الإضافية:
```bash
pip install transformers[onnx]
```
استخدم حزمة `transformers.onnx` كنموذج Python لتصدير نقطة حفظ باستخدام تكوين جاهز:
```bash
python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
```
يُصدّر هذا رسمًا بيانيًا ONNX لنقطة الحفظ المُحددة بواسطة وسيطة `--model`. مرر أي نقطة حفظ على 🤗 Hub أو نقطة حفظ مُخزنة محليًا.
يُمكن بعد ذلك تشغيل ملف `model.onnx` الناتج على أحد المُسرعات العديدة التي تدعم معيار ONNX. على سبيل المثال، قم بتحميل وتشغيل النموذج باستخدام ONNX Runtime كما يلي:
```python
>>> from transformers import AutoTokenizer
>>> from onnxruntime import InferenceSession
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
>>> session = InferenceSession("onnx/model.onnx")
>>> # يتوقع ONNX Runtime مصفوفات NumPy كمدخلات
>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
```
يُمكن الحصول على أسماء المخرجات المطلوبة (مثل `["last_hidden_state"]`) من خلال إلقاء نظرة على تكوين ONNX لكل نموذج. على سبيل المثال، بالنسبة لـ DistilBERT، لدينا:
```python
>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
>>> config = DistilBertConfig()
>>> onnx_config = DistilBertOnnxConfig(config)
>>> print(list(onnx_config.outputs.keys()))
["last_hidden_state"]
```
العمليات مُتطابقة لنقاط الحفظ TensorFlow على Hub. على سبيل المثال، صدّر نقطة حفظ TensorFlow خالصة كما يلي:
```bash
python -m transformers.onnx --model=keras-io/transformers-qa onnx/
```
لتصدير نموذج مُخزن محليًا، احفظ أوزان النموذج ومجزىء اللغوى في نفس الدليل (على سبيل المثال `local-pt-checkpoint`)، ثم قم بتصديره إلى ONNX عن طريق توجيه وسيط `--model` لحزمة `transformers.onnx` إلى الدليل المطلوب:
```bash
python -m transformers.onnx --model=local-pt-checkpoint onnx/
```

View File

@ -88,8 +88,6 @@
title: Tool use
- local: chat_templating_writing
title: Writing a chat template
- local: chat_response_parsing
title: Response parsing
title: Chat with models
- sections:
- local: serving
@ -119,8 +117,6 @@
title: Tools
- local: transformers_as_backend
title: Inference server backends
- local: continuous_batching
title: Continuous Batching
title: Inference
- isExpanded: false
sections:
@ -1008,8 +1004,6 @@
title: AltCLIP
- local: model_doc/aria
title: Aria
- local: model_doc/audioflamingo3
title: AudioFlamingo3
- local: model_doc/aya_vision
title: AyaVision
- local: model_doc/blip

View File

@ -95,12 +95,9 @@ print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
The chat model called the `get_current_temperature` tool with the correct parameters from the docstring. It inferred France as the location based on Paris, and that it should use Celsius for the units of temperature.
A model **cannot actually call the tool itself**. It requests a tool call, and it's your job to handle the call and append it and the result to the chat history. For
models that support [response parsing](./chat_response_parsing), the response parsing will be handled automatically, and you can just use
[`~PreTrainedTokenizer.parse_response] to extract the tool call. For other models, you'll need to manually translate the output
string into a tool call dict.
A model **cannot actually call the tool itself**. It requests a tool call, and it's your job to handle the call and append it and the result to the chat history.
Regardless of the approach you use, the tool call should go in the `tool_calls` key of an `assistant` message. This is the recommended API, and should be supported by the chat template of most tool-using models.
Hold the call in the `tool_calls` key of an `assistant` message. This is the recommended API, and should be supported by the chat template of most tool-using models.
> [!WARNING]
> Although `tool_calls` is similar to the OpenAI API, the OpenAI API uses a JSON string as its `tool_calls` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict.

View File

@ -1,233 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Response Parsing
It is increasingly common for chat models to generate structured outputs, rather than just a single reply string.
The most common uses for structured outputs are [tool calling](./chat_extras) and [reasoning models](https://huggingface.co/reasoning-course).
Tool calling models can output tool calls, containing the name of the tool to call and any arguments to be passed to it,
while reasoning models often output reasoning steps as a "chain of thought". Some recent models even use both of these,
and may output reasoning and/or one or more tool calls before their final answer.
Models with structured outputs pose a challenge for chat templating, because the output needs to be parsed before it
can be appended to the chat. For a concrete example, let's say we ask [GPT-OSS](https://huggingface.co/openai/gpt-oss-120b)
what the weather is like, and it thinks and decides to call a tool. Here's what the raw model output might look like:
```txt
<|start|>analysis<|message|>The user asks: "What is the weather like in SF?" We need to get the location of the user? The user explicitly asks about SF (San Francisco).
So we need to get the current weather in San Francisco, CA. We need to call get_current_weather function. But we need to call function to get weather data.
So we should call get_current_weather with location "San Francisco, CA". Let's do that.
We will call function get_current_weather.<|end|><|start|>commentary to=functions.get_current_weather<|channel|>commentary <|constrain|>json<|message|>{"location":"San Francisco, CA"}<|call|>
}
```
But if you want to append this to a chat, you'll need to format it as a chat message dict, like this:
```json
{
"role": "assistant",
"thinking": "The user asks: \"What is the weather like in SF?\" We need to get the location of the user? The user explicitly asks about SF (San Francisco). So we need to get the current weather in San Francisco, CA. We need to call get_current_weather function. But we need to call function to get weather data. So we should call get_current_weather with location \"San Francisco, CA\". Let's do that.",
"tool_calls": [
{
"name": "get_current_weather",
"arguments": {
"location": "San Francisco, CA"
}
}
]
}
```
Chat **templates** give us a way to turn messages into formatted input for a model, but we need something else to
parse model output back into a standard message dict. This is what chat **parsing** is for.
## The [parse_response](~PreTrainedTokenizerBase.parse_response) method
Parsing a chat response on a model that supports it is straightforward. Simply take the raw, decoded output from
[generate](`~generation.GenerationMixin.generate`), and pass it to the tokenizer's [parse_response](~PreTrainedTokenizerBase.parse_response) method:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "HuggingFaceTB/SmolLM3-3B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype="auto", device_map="auto")
messages = [
{
"role": "user",
"content": "Hey! Can you summarize the end of the Cold War as briefly as possible? Like, comically briefly. It should really leave out almost most of the relevant information."
}
]
input_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_tensors="pt"
).to(model.device)
outputs = model.generate(input_ids, max_new_tokens=1024)[0, input_ids.shape[1]:]
out_text = tokenizer.decode(outputs)
parsed = tokenizer.parse_response(out_text)
print(parsed.keys())
```
And you should get:
```text
dict_keys(['thinking', 'content'])
```
And that's all you need to start using response parsing! `parse_response` should return a complete message dict that is ready to be appended to the chat history.
When the tokenizer does not support response parsing, `parse_response` will throw an error. We hope to add support
to more tokenizers over time.
## Developers: Understanding a simple response schema
Under the hood, `parse_response` uses a **JSON schema** to parse the model output. A JSON schema represents
the structure of the output message dict. The schema is augmented with additional fields that indicate how the
output message string should be parsed into the expected format. Let's take a look at the schema for a SmolLM response,
excluding tool calls for now:
```python
{
"x-regex": "(?:<think>\n?(?P<thinking>.+?)\n?</think>)?\s*(?P<content>.+?)?\s*(?:<\|im_end\|>|$)",
"type": "object",
"properties": {
"role": {"const": "assistant"},
"content": {"type": "string"},
"thinking": {"type": "string"}
}
}
```
We can see that the schema describes a JSON "object" (a `dict`, in other words) with three keys: `role`, `content`, and `thinking`.
Because all assistant responses have the role "assistant", the `role` key is a `const`(ant). The other two keys are strings, extracted
from the named groups in the regex in the `x-regex` field.
Like chat templates, response schemas are set as a property of the tokenizer. To enable response parsing, all you need
to do is set `tokenizer.response_schema` to a valid schema dict, and `tokenizer.parse_response()` will work! Again, like
chat templates, this schema will be saved with the processor, so once you set it, you can use `save_pretrained()` or `push_to_hub()` to
save and share the schema.
## Developers: Complex schemas
Now, let's look at a more complex schema, which includes tool calls, to gain more of an understanding of the parser
internals. For this, we'll use the `GPT-OSS` schema. GPT-OSS emits both tool calls and thinking blocks, and it uses
an unusual format where model responses are tagged with one of three "channels": `commentary` for things like
tool calls, `analysis` for chain of thought blocks, and `final` for messages intended to be sent to the user.
A full message where the model calls a tool named `get_current_weather` might look like this, with some extra linebreaks added for clarity:
```text
<|channel|>analysis<|message|>
The user asks: "What is the weather like in SF?" So we need to get the current weather in San Francisco, CA.
We need to call get_current_weather function. So we should call get_current_weather with location "San Francisco, CA".
<|end|>
<|start|>assistant<|channel|>commentary
to=functions.get_current_weather <|constrain|>json<|message|>
{
"location": "San Francisco, CA"
}
<|call|>
```
Parsing proceeds recursively; the output of a regex (or other parser) at one level becomes the input to the nodes below it.
In other words, don't feel like you have to parse the entire output in one enormous regex! Instead, start with the schema,
and then add regexes to extract the relevant chunks as you go. Here's a schema that will parse it, with some
explanatory comments:
```python
{
"type": "object",
"properties": {
"role": {"const": "assistant"},
# "content" and "thinking" are both similar to the previous example, and just extract a single string
# However, rather than using a single regex with named groups to extract both, we use a regex in each subkey.
# When an object node has no parser/regex, the entire input string is passed to all of its children, so
# parsing can either be done with named groups at the object level, or with separate regexes at the property level.
"content": {"type": "string", "x-regex": r"<\|channel\|>final<\|message\|>(.*?)(?:<\|end\|>|$)"},
"thinking": {"type": "string", "x-regex": r"<\|channel\|>analysis<\|message\|>(.*?)<\|end\|>"},
"tool_calls": {
# "x-regex-iterator" uses re.findall to find multiple possible manages, and returns them as an
# array/list. You don't need to worry about array handling, though - each item in the array will be
# parsed by the `items` schema, so just write the schema for a single item.
"x-regex-iterator": r"<\|channel\|>commentary (to=functions\..*?<\|message\|>.*?)(?:<\|call\|>|$)",
"type": "array",
"items": {
"type": "object",
"properties": {
# A const property is a fixed value, and the input has no effect on it.
"type": {"const": "function"},
# Here, we wrap the entire tool call dict in a `{"function": ...}` block. The input string is passed through to it unchanged.
"function": {
"type": "object",
"properties": {
"name": {"type": "string", "x-regex": r"^to=functions\.(\w+)"},
"arguments": {
"type": "object",
"x-regex": "<\|message\|>(.*)",
# The "x-parser" field indicates that the extracted string should be parsed as JSON.
# The output is then passed to the schema nodes below and recursive parsing continues.
"x-parser": "json",
"additionalProperties": {"type": "any"},
},
},
},
},
},
},
},
}
```
## Developers: Understanding the parser logic
The parser follows a few simple rules:
1. Each level of the schema receives input from the level above, applies any regex or parser it has, and then passes the output to its children.
2. The root level receives the entire decoded model output string as input.
3. If a node has structured content after parsing (for example, if the regex has named groups and returns a dict, or if the parser returns a dict or list),
then that structured content is mapped to the node's children, and each child node receives its corresponding value as input.
4. If an `object` (dict) node has unstructured (string) output, then the entire string is passed to all of its children. This allows child nodes
to handle parsing individually rather than requiring a single parent regex to extract all keys at once.
5. If an `array` (list) node has unstructured (string) output, then this throws an error.
There is a small set of allowable `x-` keys that indicate how parsing should be done at each node:
- `x-regex`: A regex string to apply to the input. If the regex has named groups, the output is a dict of group names to values. Named groups should only be used in `object` nodes.
Otherwise, the regex must have exactly one unnamed capturing group, and the output is the value of that group as a string.
- `x-regex-iterator`: A regex string to apply to the input using `re.findall()`. The output is a list of all matches.
This should only be used in `array` nodes, and the regex must have exactly one unnamed capturing group. The output is distributed to
the node's `items` schema.
- `x-parser`: Calls a built-in parser to apply to the input. Currently, the only supported parser is `json`, which parses the input string as JSON.
The output is passed to the child nodes for further parsing. Note that the `json` parser can return deeply nested output - in this case, the output
will be progressively unwrapped as it is passed through child nodes. The child nodes do not need additional `x-parser` or `x-regex` fields in this case,
but their structure must match the structure of the parsed JSON.
- `x-parser-args`: Only allowed in conjunction with `x-parser`. This is a dict of additional arguments that control parsing. Right now, the only supported
argument is `transform`, which specifies a `jmespath` transformation to apply to the output. This is useful when the JSON parser returns a structure
that needs to be modified to match the schema.
- `x-regex-key-value`: This is rarely necessary, but it can be useful when parsing key-value pairs in non-JSON format where the names of the keys are not known
in advance, such as when a model emits XML tool calls with arbitrary argument names. The regex must have exactly two named capturing groups,
`key` and `value`, and the output is a dict mapping keys to values. This should only be used in `object` nodes.
In general, multiple regexes/parsers cannot be combined at the same level. The exception is that `x-regex`, returning a single string, can be combined with the other parsers. In this case,
`x-regex` is applied first, and then the output is passed to the other parser, either `x-regex-iterator`, `x-parser`, or `x-regex-key-value`.
Putting these ideas together, you can see that the input flows through the schema, being parsed at each level and then distributed to child nodes. Each level
only needs to extract the input content that is relevant for that part of the schema, and can then let its child nodes handle the rest. Internally, this is handled
with a parser function that receives input, applies any regexes/parsers at the current level, then maps the result to its child nodes before recursively calling itself on each of them.
Recursion terminates when it reaches leaf nodes, usually primitive types like `string` or `number`, which simply return the input they receive.

View File

@ -1,194 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Continuous Batching
Continuous Batching (CB) is an advanced technique to optimize the inference of transformer models by dynamically grouping multiple requests into batches. This approach maximizes GPU utilization and throughput, specifically for workloads with many variable-length inputs.
We are particularly interested in having Continuous Batching in transformers for the following use cases:
- Evaluation of models on large datasets with variable-length inputs
- Generating outputs for multiple sequences for GRPO policies
CB is what makes inference engines like vLLM or SGLang efficient. That being said, transformers does not aim to be a production-ready inference engine, but a complete framework for model development. For this reason, CB is available in `transformers serve`.
If you are not familiar with some of the core concepts CB is built upon, we invite you to read the associated blog post: [Continuous Batching: Efficient Inference for Large Language Models](https://huggingface.co/blog/continuous-batching). _broken link for now_
## API Reference
## Usage Examples
The main way to use CB in transformers is via the `generate_batch` method.
Unlike `generate`, CB takes already tokenized inputs, known as input IDs. Each sequence of input IDs is represented as a list of integers, in python: `list[int]`. Since
For a more detailed example, please refer to: [examples/continuous_batching](./path/to/example)
### `generate_batch` example
We have created a `ContinuousMixin` that is inherited by the `GenerationMixin` so that all auto regressive text models support CB.
This adds the `generate_batch` method to all models that inherit from `GenerationMixin`.
You can use it as follows:
```py
import datasets
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
model = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen3-4B-Instruct-2507",
attn_implementation="spda_paged",
device_map="cuda", # if you need cuda
dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
# prepare a batch of inputs
dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
dataset = dataset.select(range(args.samples))
tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
generation_config = GenerationConfig(
max_new_tokens=32,
use_cuda_graph=False, # Not supported for simple version
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
do_sample=False,
max_batch_tokens=512, # max number of tokens in a batch, this is just a default value you should tune based on your hardware
)
batch_outputs = model.generate_batch(
inputs=simple_batch_inputs,
generation_config=generation_config,
)
for request_id, output in batch_outputs.items():
generated_text = tokenizer.decode(output.generated_tokens, skip_special_tokens=True)
print(f"Request {request_id} output: {generated_text}")
```
### `ContinuousBatchingManager` example
If you want more control w.r.t. how you want to schedule requests using CB, you can use the `ContinuousBatchingManager` class directly.
This is what we use in `transformers serve` because requests arrive asynchronously and we can leverage the asynchronous nature of the CB process to make things more efficient.
Under the hood, the `ContinuousBatchingManager` creates a background thread that receives inputs from a python `queue.Queue` which it uses to get requests to batch in each forward pass.
Note that the manager is thread safe!
```py
import datasets
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from transformers.generation.continuous_batching import RequestStatus
model = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen3-4B-Instruct-2507",
attn_implementation="spda_paged",
device_map="cuda", # if you need cuda
dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
# prepare a batch of inputs
dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
dataset = dataset.select(range(args.samples))
tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
# initialize the manager, available method thanks to the `ContinuousMixin`
manager = model.init_continuous_batching(generation_config=generation_config)
# start the background thread
manager.start()
# this is for demonstration purposes only, in practice this is most useful to do concurrently
for i, input in enumerate(simple_batch_inputs):
request_id = manager.add_request(input_ids=input, request_id=f"request_{i}") # if you do not specify a request_id, one will be generated for you
# Can be done in an other thread
for id, request in manager.get_result():
generated_text = tokenizer.decode(request.generated_tokens, skip_special_tokens=True)
print(f"Request {id} output: {generated_text}")
# you can also get results for a specific request id
result = manager.get_result(request_id="request_5") # this is blocking and will wait for the result to be ready
# or get results for a request that is streaming
manager.add_request(
input_ids=input,
request_id="streaming_request",
stream=True,
)
for chunk in manager.request_id_iter(request_id="streaming_request"):
generated_text = tokenizer.decode(chunk.generated_tokens, skip_special_tokens=True)
print(generated_text)
# FIXME: stop iteration in `request_id_iter` when finished instead of doing it externally
if chunk.status == RequestStatus.FINISHED:
break
# stop the background thread before exiting the process
manager.stop()
```
## Supported & Unsupported Features
### Supported Features
- Dynamic scheduling of variable-length requests
- Chunked prefill
- Paged Attention Cache
- Sliding window attention
- Chat templates
### Unsupported Features
At the moment, the following features are not supported with CB. We plan to add support to the following:
- Prefix caching
- Beam search
- tool calling
The others are unplanned, but depending on community requests we might consider adding them:
- MTP (multi token prediction)
- Medusa
## Performance Considerations
## Integration with Serving
You can use CB in `transformers serve` by passing the `--continuous-batching` flag when starting the server.
## Monitoring
We have added `opentelemetry` support to Continuous Batching to help you monitor its performance in production. To enable it, you need to install the `opentelemetry` extra when installing `transformers`:
```sh
# this installs `opentelemetry-api`, `opentelemetry-sdk` and `opentelemetry-exporter-otlp`
pip install transformers[open-telemetry]
```
This will enable traces and metrics collection in CB. You will then have to setup the backend to collect and visualize the traces and metrics.

View File

@ -393,9 +393,3 @@ model = AutoModelForCausalLM.from_pretrained(
"mistralai/Mistral-7B-v0.1", quantization_config=quant_config, device_map="auto"
)
```
## Continuous Batching
When serving LLMs for inference, you may have multiple requests arriving at different times. Continuous Batching (CB) is a technique that groups incoming requests into batches to maximize GPU utilization and throughput.
See the [Continuous Batching](./continuous_batching) guide for more details on how to use CB in transformers.

View File

@ -1,402 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
*This model was released on 2025-07-10 and added to Hugging Face Transformers on 2025-11-11.*
# Audio Flamingo 3
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
## Overview
Audio Flamingo 3 (AF3) is a fully open large audiolanguage model designed for robust understanding and reasoning over speech, environmental sounds, and music. AF3 pairs a Whisper-style audio encoder with a causal language model and performs replace-in-place audiotext fusion: the processor aligns post-pool audio frames to a dedicated placeholder token and the model replaces those token slots with projected audio embeddings during the forward pass.
The model checkpoint is available at: [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf)
Highlights:
- Unified audio encoder across speech, sound, and music.
- **Long-audio support via windowing and post-pool alignment (up to 10 minutes maximum).** The model processes audio in 30-second windows with a hard limit of 20 windows (10 minutes total). Audio longer than 10 minutes will be truncated.
- Deterministic fusion that preserves sequence length by replacing audio placeholder tokens with audio embeddings.
This model was contributed by [Lasha Koroshinadze](https://huggingface.co/lashahub) and [Eric Bezzam](https://huggingface.co/bezzam).
### Paper
[Audio Flamingo 3](https://huggingface.co/papers/2507.08128): Advancing Audio Intelligence with Fully Open Large Audio Language Models
A. Goel, S. Ghosh, J. Kim, S. Kumar, Z. Kong, S. Lee, C.-H. H. Yang, R. Duraiswami, D. Manocha, R. Valle, B. Catanzaro
NVIDIA and University of Maryland
Project: https://research.nvidia.com/labs/adlr/AF3/
## Usage
### Audio Instruct Mode
The model supports audio-text instructions, including multi-turn interactions, all processed in batches.
➡️ audio + text instruction
```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
conversation = [
{
"role": "user",
"content": [
{"type": "text", "text": "Transcribe the input speech."},
{"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
],
}
]
inputs = processor.apply_chat_template(
conversation,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs)
```
➡️ multi-turn:
```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
conversation = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Instruction: How does the tone of female speech change throughout the audio? Choose the correct option among the options below: (A) Sad to happy (B) Happy to sad (C) Neutral to happy (D) Happy to neutral.",
},
{"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/000000786159.31.wav"},
],
},
{
"role": "assistant",
"content": [{"type": "text", "text": "(A) Sad to happy"}],
},
{
"role": "user",
"content": [
{"type": "text", "text": "Why do you think so?"},
],
},
]
inputs = processor.apply_chat_template(
conversation,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs)
```
➡️ text only:
```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
conversation = [
{
"role": "user",
"content": [
{"type": "text", "text": "What is the capital of France?"},
],
}
]
inputs = processor.apply_chat_template(
conversation,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs)
```
➡️ audio only:
```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
conversation = [
{
"role": "user",
"content": [
{"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
],
}
]
inputs = processor.apply_chat_template(
conversation,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs)
```
➡️ batched inference!
```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
conversations = [
[
{
"role": "user",
"content": [
{"type": "text", "text": "Transcribe the input speech."},
{
"type": "audio",
"path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
},
],
}
],
[
{
"role": "user",
"content": [
{
"type": "text",
"text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
},
{"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
],
}
],
]
inputs = processor.apply_chat_template(
conversations,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs)
```
➡️ Training:
```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
model.train()
conversation = [
[
{
"role": "user",
"content": [
{"type": "text", "text": "Transcribe the input speech."},
{"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
],
},
{
"role": "assistant",
"content": [{"type": "text", "text": "The transcription of the audio is 'summer follows spring the days grow longer and the nights are warm'."}],
}
],
[
{
"role": "user",
"content": [
{
"type": "text",
"text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
},
{"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
],
},
{
"role": "assistant",
"content": [{"type": "text", "text": "The transcription of the audio is 'some transcription of the audio'."}],
}
]
]
inputs = processor.apply_chat_template(
conversation,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
output_labels=True,
).to(model.device)
loss = model(**inputs).loss
loss.backward()
```
➡️ transcription shortcut
```python
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
model_id = "nvidia/audio-flamingo-3-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
inputs = processor.apply_transcription_request(audio="https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=500)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True, strip_prefix=True)
print(decoded_outputs)
```
The model is trained to emit transcriptions prefixed with assistant framing such as `The spoken content of the audio is "<text>".`. Use `strip_prefix=True` (as shown above) to remove the fixed assistant sentence and surrounding quotes so that only the transcription remains.
## How the model works
### Architecture
* **AudioFlamingo3Encoder**
Whisper-style feature extractor + encoder → average-pool over time (stride 2) → LayerNorm.
Produces per-frame hidden states at the post-pool rate.
* **AudioFlamingo3MultiModalProjector**
A small MLP that maps encoder features to the language models hidden size.
* **AudioFlamingo3ForConditionalGeneration**
A causal language model that accepts text embeddings where each audio placeholder token slot is replaced, in place, by an audio frame embedding. No sequence-length change is introduced by fusion.
### Processor-level alignment
1. Each raw waveform is split into fixed-length windows based on the feature extractors `chunk_length` (seconds) and `sampling_rate` (Hz).
2. For each window, the processor computes the number of post-pool frames `post_pool_len` that the encoder will output (matching the conv/pool schedule).
3. The processor expands the audio placeholder token by the total number of post-pool frames across all windows.
4. The model later replaces those token positions with the corresponding projected audio embeddings.
## Usage patterns
### Transcription shortcut
For automatic speech recognition you can skip writing the default instruction each time and call
[`~transformers.AudioFlamingo3Processor.apply_transcription_request`]:
```python
inputs = processor.apply_transcription_request(audio=audio_array)
```
Pass `prompt="Transcribe the input speech."` (or a list of prompts for batch audio) to customize the instruction while
keeping the audio placeholder handling.
`audio` accepts in-memory arrays, local file paths, or URLs. Any processor kwargs (`text_kwargs`, `audio_kwargs`, etc.)
are forwarded, so you can tweak padding or tensor formats just like when calling `processor(...)`.
## Long audio and windowing
**Important: Maximum audio length is 10 minutes.** Audio longer than this will be truncated.
* The default setup processes 30-second windows at 16 kHz mono.
* **The processor enforces a hard limit of 20 windows per sample, resulting in a maximum of 10 minutes of audio (20 windows × 30 seconds).**
* For each window:
* `mel_len` is the padded mel length.
* A conv stack reduces time as `conv_output_len = (mel_len - 1) // 2 + 1`.
* Post-pool frames per window: `post_pool_len = (conv_output_len - 2) // 2 + 1`.
* An audio placeholder token is expanded to the sum of `post_pool_len` across all windows.
## Padding, attention, and caching
* **Left padding vs right padding**
For generation with mixed prompt lengths in a batch, left padding is usually preferable.
For training, right padding is common; AF3s fusion mechanism itself is padding-agnostic because it replaces in place.
* **Attention masks**
The processor returns `attention_mask` (text) and `input_features_mask` (audio). The model builds an internal 4-D mask on the encoders pre-pool axis with negative infinity at pad positions.
* **Caching**
During generation, `input_features` and `input_features_mask` are only passed on the first step. Subsequent steps use cached keys/values from the language model.
## Troubleshooting
* Empty or truncated outputs when batching
Use left padding for batched generation and decode only the new tokens after the prompt length, as shown in the quickstart.
## AudioFlamingo3Config
[[autodoc]] AudioFlamingo3Config
## AudioFlamingo3EncoderConfig
[[autodoc]] AudioFlamingo3EncoderConfig
## AudioFlamingo3Processor
[[autodoc]] AudioFlamingo3Processor
## AudioFlamingo3Encoder
[[autodoc]] AudioFlamingo3Encoder
- forward
## AudioFlamingo3ForConditionalGeneration
[[autodoc]] AudioFlamingo3ForConditionalGeneration
- forward

View File

@ -158,24 +158,6 @@ print("Retrieval scores (query x image):")
print(scores)
```
You can also use checkpoints for `ColQwen2.5` that are **compatible with the ColQwen2 architecture**. This version of the model uses [Qwen2_5_VL](./qwen2_5_vl) as the backbone.
```python
import torch
from transformers import ColQwen2ForRetrieval, ColQwen2Processor
from transformers.utils.import_utils import is_flash_attn_2_available
model_name = "Sahil-Kabir/colqwen2.5-v0.2-hf" # An existing compatible checkpoint
model = ColQwen2ForRetrieval.from_pretrained(
model_name,
dtype=torch.bfloat16,
device_map="auto",
attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa"
)
processor = ColQwen2Processor.from_pretrained(model_name)
```
## Notes
- [`~ColQwen2Processor.score_retrieval`] returns a 2D tensor where the first dimension is the number of queries and the second dimension is the number of images. A higher score indicates more similarity between the query and image.

View File

@ -169,9 +169,6 @@ print("Pooled output shape:", pooled_output.shape)
[[autodoc]] DINOv3ViTModel
- forward
## DINOv3ViTBackbone
[[autodoc]] DINOv3ViTBackbone
## DINOv3ConvNextModel
[[autodoc]] DINOv3ConvNextModel

View File

@ -75,11 +75,11 @@ A processor requires an image_processor and a tokenizer. Hence, inputs can be lo
from PIL import Image
from transformers import AutoTokenizer
from transformers.models.fuyu.processing_fuyu import FuyuProcessor
from transformers.models.fuyu.image_processing_fuyu_fast import FuyuImageProcessorFast
from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
tokenizer = AutoTokenizer.from_pretrained('adept-hf-collab/fuyu-8b')
image_processor = FuyuImageProcessorFast()
image_processor = FuyuImageProcessor()
processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
@ -118,11 +118,6 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.
[[autodoc]] FuyuImageProcessor
- __call__
## FuyuImageProcessor
[[autodoc]] FuyuImageProcessorFast
- __call__
## FuyuProcessor
[[autodoc]] FuyuProcessor

View File

@ -61,11 +61,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
[[autodoc]] GLPNImageProcessor
- preprocess
## GLPNImageProcessorFast
[[autodoc]] GLPNImageProcessorFast
- preprocess
## GLPNModel
[[autodoc]] GLPNModel

View File

@ -88,16 +88,16 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
import torch
from PIL import Image
import requests
processor = AutoImageProcessor.from_pretrained("ETH-CVG/lightglue_superpoint")
model = AutoModel.from_pretrained("ETH-CVG/lightglue_superpoint")
# LightGlue requires pairs of images
images = [image1, image2]
inputs = processor(images, return_tensors="pt")
with torch.inference_mode():
outputs = model(**inputs)
# Extract matching information
keypoints0 = outputs.keypoints0 # Keypoints in first image
keypoints1 = outputs.keypoints1 # Keypoints in second image
@ -112,7 +112,7 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
# Process outputs for visualization
image_sizes = [[(image.height, image.width) for image in images]]
processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
for i, output in enumerate(processed_outputs):
print(f"For the image pair {i}")
for keypoint0, keypoint1, matching_score in zip(
@ -147,13 +147,6 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
- post_process_keypoint_matching
- visualize_keypoint_matching
## LightGlueImageProcessorFast
[[autodoc]] LightGlueImageProcessorFast
- preprocess
- post_process_keypoint_matching
- visualize_keypoint_matching
## LightGlueForKeypointMatching
[[autodoc]] LightGlueForKeypointMatching

View File

@ -159,7 +159,7 @@ conversation3 = [
conversations = [conversation1, conversation2, conversation3]
inputs = processor.apply_chat_template(
conversations,
conversation,
add_generation_prompt=True,
tokenize=True,
return_dict=True,

View File

@ -154,7 +154,7 @@ pip install schedulefree
[Schedule Free optimizer (SFO)](https://hf.co/papers/2405.15682) replaces the base optimizers momentum with a combination of averaging and interpolation. Unlike a traditional scheduler, SFO completely removes the need to anneal the learning rate.
SFO supports the RAdam (`schedule_free_radam`), AdamW (`schedule_free_adamw`) and SGD (`schedule_free_sgd`) optimizers. The RAdam scheduler doesn't require `warmup_steps`.
SFO supports the RAdam (`schedule_free_radam`), AdamW (`schedule_free_adamw`) and SGD (`schedule_free_sgd`) optimizers. The RAdam scheduler doesn't require `warmup_steps` or `warmup_ratio`.
By default, it is recommended to set `lr_scheduler_type="constant"`. Other `lr_scheduler_type` values may also work, but combining SFO optimizers with other learning rate schedules could affect SFOs intended behavior and performance.

View File

@ -38,7 +38,7 @@ pip install transformers[dev]
or for an editable install:
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
inside the Transformers repo. Since the number of optional dependencies of Transformers has grown a lot, it's possible you don't manage to get all of them. If the dev install fails, make sure to install PyTorch then do
@ -50,7 +50,7 @@ pip install transformers[quality]
or for an editable install:
```bash
pip install -e ".[quality]"
pip install -e .[quality]
```
## Tests

View File

@ -40,7 +40,7 @@ You can choose between MXFP4 and NVFP4 with `FPQuantConfig(forward_dtype="mxfp4"
A **Blackwell-generation GPU is required** to run the kernels. Runtime support for FP-Quant is implemented through the [QuTLASS](https://github.com/IST-DASLab/qutlass) library and a lightweight PyTorch interface lib [`fp_quant`](https://github.com/IST-DASLab/FP-Quant/tree/master/inference_lib). We recommend installing the former **from source** and the latter with `pip install fp_quant`.
Users **without a Blackwell-generation GPU** , can use the method with `quantization_config=FPQuantConfig(pseudoquantization=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
Users **without a Blackwell-generation GPU** , can use the method with `quantization_config=FPQuantConfig(pseudoquant=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
> [!TIP]
> Find models pre-quantized with FP-Quant in the official ISTA-DASLab [collection](https://huggingface.co/collections/ISTA-DASLab/fp-quant-6877c186103a21d3a02568ee).

View File

@ -329,7 +329,7 @@ from torchao.dtypes import Int4XPULayout
from torchao.quantization.quant_primitives import ZeroPointDomain
quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT, int4_packing_format="plain_int32")
quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT)
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
@ -342,7 +342,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device)
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@ -395,7 +395,7 @@ from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int4WeightOnlyConfig
from torchao.dtypes import Int4CPULayout
quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4CPULayout(), int4_packing_format="opaque")
quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4CPULayout())
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
@ -422,7 +422,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
#### 1. Skip quantization for certain layers
With `FqnToConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.
With `ModuleFqnToConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.
```py
import torch
@ -430,11 +430,11 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
model_id = "meta-llama/Llama-3.1-8B-Instruct"
from torchao.quantization import Int4WeightOnlyConfig, FqnToConfig
from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig
config = Int4WeightOnlyConfig(group_size=128)
# set default to int4 (for linears), and skip quantizing `model.layers.0.self_attn.q_proj`
quant_config = FqnToConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
quant_config = ModuleFqnToConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
quantization_config = TorchAoConfig(quant_type=quant_config)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16, quantization_config=quantization_config)
# lm_head is not quantized and model.layers.0.self_attn.q_proj is not quantized
@ -459,7 +459,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
model_id = "facebook/opt-125m"
from torchao.quantization import Int4WeightOnlyConfig, FqnToConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType
from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType
weight_dtype = torch.int8
granularity = PerAxis(0)
@ -470,7 +470,7 @@ embedding_config = IntxWeightOnlyConfig(
mapping_type=mapping_type,
)
linear_config = Int8DynamicActivationInt4WeightConfig(group_size=128)
quant_config = FqnToConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
quant_config = ModuleFqnToConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
# set `include_embedding` to True in order to include embedding in quantization
# when `include_embedding` is True, we'll remove input embedding from `modules_not_to_convert` as well
quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True)
@ -521,7 +521,7 @@ from torchao.quantization import (
IntxWeightOnlyConfig,
PerRow,
PerAxis,
FqnToConfig,
ModuleFqnToConfig,
Float8Tensor,
Int4TilePackedTo4dTensor,
IntxUnpackedToInt8Tensor,
@ -550,7 +550,7 @@ qconfig_dict = {
"_default": intxwo,
}
quant_config = FqnToConfig(qconfig_dict)
quant_config = ModuleFqnToConfig(qconfig_dict)
quantization_config = TorchAoConfig(quant_type=quant_config)
quantized_model = AutoModelForCausalLM.from_pretrained(
model_id,

View File

@ -33,7 +33,7 @@ Export a Transformers model to ONNX with the Optimum CLI or the `optimum.onnxrun
Run the command below to install Optimum and the [exporters](https://huggingface.co/docs/optimum/exporters/overview) module.
```bash
pip install optimum-onnx
pip install optimum[exporters]
```
> [!TIP]

View File

@ -383,30 +383,6 @@ transformers serve \
--attn_implementation "sdpa"
```
### Quantization
transformers serve is compatible with all [quantization methods](https://huggingface.co/docs/transformers/main/quantization/overview) supported in transformers. Quantization can significantly reduce memory usage and improve inference speed, with two main workflows: pre-quantized models and on-the-fly quantization.
#### Pre-quantized Models
For models that are already quantized (e.g., GPTQ, AWQ, bitsandbytes), simply choose a quantized model name for serving.
Make sure to install the required libraries listed in the quantization documentation.
> [!TIP]
> Pre-quantized models generally provide the best balance of performance and accuracy.
#### On the fly quantization
If you want to quantize a model at runtime, you can specify the --quantization flag in the CLI. Note that not all quantization methods support on-the-fly conversion. The full list of supported methods is available in the quantization [overview](https://huggingface.co/docs/transformers/main/quantization/overview).
Currently, with transformers serve, we only supports some methods: ["bnb-4bit", "bnb-8bit"]
For example, to enable 4-bit quantization with bitsandbytes, you need to pass add `--quantization bnb-4bit`:
```sh
transformers serve --quantization bnb-4bit
```
### Performance tips
- Use an efficient attention backend when available:
@ -421,4 +397,6 @@ transformers serve \
- `--dtype {bfloat16|float16}` typically improve throughput and memory use vs. `float32`
- `--load_in_4bit`/`--load_in_8bit` can reduce memory footprint for LoRA setups
- `--force-model <repo_id>` avoids per-request model hints and helps produce stable, repeatable runs

View File

@ -220,7 +220,7 @@ At this point, only three steps remain:
... gradient_accumulation_steps=4,
... per_device_eval_batch_size=32,
... num_train_epochs=10,
... warmup_steps=0.1,
... warmup_ratio=0.1,
... logging_steps=10,
... load_best_model_at_end=True,
... metric_for_best_model="accuracy",

View File

@ -211,7 +211,7 @@ At this point, only three steps remain:
... gradient_accumulation_steps=4,
... per_device_eval_batch_size=16,
... num_train_epochs=3,
... warmup_steps=0.1,
... warmup_ratio=0.1,
... logging_steps=10,
... load_best_model_at_end=True,
... metric_for_best_model="accuracy",

View File

@ -378,7 +378,7 @@ Most of the training arguments are self-explanatory, but one that is quite impor
... learning_rate=5e-5,
... per_device_train_batch_size=batch_size,
... per_device_eval_batch_size=batch_size,
... warmup_steps=0.1,
... warmup_ratio=0.1,
... logging_steps=10,
... load_best_model_at_end=True,
... metric_for_best_model="accuracy",

View File

@ -187,7 +187,7 @@ from torch import nn
from transformers import Trainer
class CustomTrainer(Trainer):
def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False, num_items_in_batch: Optional[torch.Tensor] = None):
def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False num_items_in_batch: Optional[torch.Tensor] = None):
labels = inputs.pop("labels")
# forward pass
outputs = model(**inputs)

View File

@ -37,7 +37,7 @@ pip install transformers[dev]
o una instalación editable:
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
del repositorio de Transformers.

View File

@ -220,7 +220,7 @@ Al llegar a este punto, solo quedan tres pasos:
... gradient_accumulation_steps=4,
... per_device_eval_batch_size=32,
... num_train_epochs=10,
... warmup_steps=0.1,
... warmup_ratio=0.1,
... logging_steps=10,
... load_best_model_at_end=True,
... metric_for_best_model="accuracy",

View File

@ -37,7 +37,7 @@ pip install transformers[dev]
o un'installazione modificabile:
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
all'interno del repo Transformers.

View File

@ -200,6 +200,8 @@
title: モデル
- local: main_classes/text_generation
title: テキストの生成
- local: main_classes/onnx
title: ONNX
- local: main_classes/optimizer_schedules
title: 最適化
- local: main_classes/output

View File

@ -1292,7 +1292,7 @@ DeepSpeed は、`LRRangeTest`、`OneCycle`、`WarmupLR`、および`WarmupDecayL
したがって、スケジューラを設定しない場合、これがデフォルトで設定されるスケジューラになります。
設定ファイルで `scheduler` エントリを設定しない場合、[`Trainer`] は
`--lr_scheduler_type`、`--learning_rate`、および `--warmup_steps` の値を設定します。
`--lr_scheduler_type`、`--learning_rate`、および `--warmup_steps` または `--warmup_ratio` の値を設定します。
🤗 それのトランスフォーマーバージョン。
以下は、`WarmupLR`の自動構成された`scheduler`エントリの例です。
@ -1316,7 +1316,8 @@ DeepSpeed は、`LRRangeTest`、`OneCycle`、`WarmupLR`、および`WarmupDecayL
- `warmup_min_lr` の値は `0` です。
- `warmup_max_lr` と `--learning_rate` の値。
- `warmup_num_steps` と `--warmup_steps` の値 (指定されている場合)
- `warmup_num_steps` と `--warmup_steps` の値 (指定されている場合)。それ以外の場合は `--warmup_ratio` を使用します
トレーニング ステップの数を乗算し、切り上げます。
- `total_num_steps` には `--max_steps` の値を指定するか、指定されていない場合は実行時に自動的に導出されます。
環境、データセットのサイズ、およびその他のコマンド ライン引数 (
`WarmupDecayLR`)。

View File

@ -0,0 +1,50 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Exporting 🤗 Transformers models to ONNX
🤗 Transformers は `transformers.onnx` パッケージを提供します。
設定オブジェクトを利用することで、モデルのチェックポイントをONNXグラフに変換することができます。
詳細は[ガイド](../serialization) を参照してください。
を参照してください。
## ONNX Configurations
以下の3つの抽象クラスを提供しています。
エクスポートしたいモデルアーキテクチャのタイプに応じて、継承すべき3つの抽象クラスを提供します
* エンコーダーベースのモデルは [`~onnx.config.OnnxConfig`] を継承します。
* デコーダーベースのモデルは [`~onnx.config.OnnxConfigWithPast`] を継承します。
* エンコーダー・デコーダーモデルは [`~onnx.config.OnnxSeq2SeqConfigWithPast`] を継承しています。
### OnnxConfig
[[autodoc]] onnx.config.OnnxConfig
### OnnxConfigWithPast
[[autodoc]] onnx.config.OnnxConfigWithPast
### OnnxSeq2SeqConfigWithPast
[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
## ONNX Features
各 ONNX 構成は、次のことを可能にする一連の _機能_ に関連付けられています。
さまざまなタイプのトポロジまたはタスクのモデルをエクスポートします。

View File

@ -40,7 +40,7 @@ pip install transformers[dev]
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
トランスフォーマーズのリポジトリ内で作業しています。トランスフォーマーズのオプションの依存関係の数が増えたため、すべてを取得できない可能性があります。開発用インストールが失敗した場合、作業しているディープラーニングフレームワークPyTorch、TensorFlow、および/またはFlaxをインストールし、次の手順を実行してください。
@ -53,7 +53,7 @@ pip install transformers[quality]
または編集可能なインストールの場合:
```bash
pip install -e ".[quality]"
pip install -e .[quality]
```
## Tests

View File

@ -47,7 +47,7 @@ ONNX形式にエクスポートされたモデルは、以下のように使用
🤗 TransformersモデルをONNXにエクスポートするには、まず追加の依存関係をインストールしてください
```bash
pip install optimum-onnx
pip install optimum[exporters]
```
すべての利用可能な引数を確認するには、[🤗 Optimumドキュメント](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli)を参照してください。または、コマンドラインでヘルプを表示することもできます:
@ -128,3 +128,64 @@ CLIの代わりに、🤗 TransformersモデルをONNXにプログラム的に
### Exporting a model for an unsupported architecture
現在エクスポートできないモデルをサポートするために貢献したい場合、まず[`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview)でサポートされているかどうかを確認し、サポートされていない場合は[🤗 Optimumに貢献](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute)してください。
### Exporting a model with `transformers.onnx`
<Tip warning={true}>
`transformers.onnx`はもはやメンテナンスされていないため、モデルを上記で説明したように🤗 Optimumでエクスポートしてください。このセクションは将来のバージョンで削除されます。
</Tip>
🤗 TransformersモデルをONNXにエクスポートするには、追加の依存関係をインストールしてください
```bash
pip install transformers[onnx]
```
`transformers.onnx`パッケージをPythonモジュールとして使用して、事前に用意された設定を使用してチェックポイントをエクスポートする方法は以下の通りです
```bash
python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
```
この方法は、`--model`引数で定義されたチェックポイントのONNXグラフをエクスポートします。🤗 Hubのいずれかのチェックポイントまたはローカルに保存されたチェックポイントを渡すことができます。エクスポートされた`model.onnx`ファイルは、ONNX標準をサポートする多くのアクセラレータで実行できます。例えば、ONNX Runtimeを使用してモデルを読み込んで実行する方法は以下の通りです
```python
>>> from transformers import AutoTokenizer
>>> from onnxruntime import InferenceSession
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
>>> session = InferenceSession("onnx/model.onnx")
>>> # ONNX Runtime expects NumPy arrays as input
>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
```
必要な出力名(例: `["last_hidden_state"]`は、各モデルのONNX構成を確認することで取得できます。例えば、DistilBERTの場合、次のようになります
```python
>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
>>> config = DistilBertConfig()
>>> onnx_config = DistilBertOnnxConfig(config)
>>> print(list(onnx_config.outputs.keys()))
["last_hidden_state"]
```
ハブから純粋なTensorFlowのチェックポイントをプログラム的にエクスポートするプロセスは、以下のように同様です
```bash
python -m transformers.onnx --model=keras-io/transformers-qa onnx/
```
ローカルに保存されたモデルをエクスポートする場合、モデルの重みとトークナイザのファイルを同じディレクトリに保存してください(例: `local-pt-checkpoint`)。その後、`transformers.onnx`パッケージの `--model`引数を希望するディレクトリに向けて設定して、ONNXにエクスポートします
```bash
python -m transformers.onnx --model=local-pt-checkpoint onnx/
```

View File

@ -219,7 +219,7 @@ MInDS-14 データセットのサンプリング レートは 8khz です (こ
... gradient_accumulation_steps=4,
... per_device_eval_batch_size=32,
... num_train_epochs=10,
... warmup_steps=0.1,
... warmup_ratio=0.1,
... logging_steps=10,
... load_best_model_at_end=True,
... metric_for_best_model="accuracy",

View File

@ -216,7 +216,7 @@ Datasets、🤗 データセット ライブラリから Food-101 データセ
... gradient_accumulation_steps=4,
... per_device_eval_batch_size=16,
... num_train_epochs=3,
... warmup_steps=0.1,
... warmup_ratio=0.1,
... logging_steps=10,
... load_best_model_at_end=True,
... metric_for_best_model="accuracy",

View File

@ -360,7 +360,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it
... learning_rate=5e-5,
... per_device_train_batch_size=batch_size,
... per_device_eval_batch_size=batch_size,
... warmup_steps=0.1,
... warmup_ratio=0.1,
... logging_steps=10,
... load_best_model_at_end=True,
... metric_for_best_model="accuracy",

View File

@ -406,6 +406,8 @@
title: Models
- local: main_classes/text_generation
title: 텍스트 생성
- local: main_classes/onnx
title: ONNX
- local: main_classes/optimizer_schedules
title: 최적화
- local: main_classes/output

View File

@ -0,0 +1,45 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# 🤗 Transformers 모델을 ONNX로 내보내기[[exporting--transformers-models-to-onnx]]
🤗 트랜스포머는 `transformers.onnx` 패키지를 제공하며, 이 패키지는 설정 객체를 활용하여 모델 체크포인트를 ONNX 그래프로 변환할 수 있게 합니다.
🤗 Transformers에 대한 자세한 내용은 [이 가이드](../serialization)를 참조하세요.
## ONNX 설정[[onnx-configurations]]
내보내려는(export) 모델 아키텍처의 유형에 따라 상속받아야 할 세 가지 추상 클래스를 제공합니다:
* 인코더 기반 모델은 [`~onnx.config.OnnxConfig`]을 상속받습니다.
* 디코더 기반 모델은 [`~onnx.config.OnnxConfigWithPast`]을 상속받습니다.
* 인코더-디코더 기반 모델은 [`~onnx.config.OnnxSeq2SeqConfigWithPast`]을 상속받습니다.
### OnnxConfig[[transformers.onnx.OnnxConfig]]
[[autodoc]] onnx.config.OnnxConfig
### OnnxConfigWithPast[[transformers.onnx.OnnxConfigWithPast]]
[[autodoc]] onnx.config.OnnxConfigWithPast
### OnnxSeq2SeqConfigWithPast[[OnnxSeq2SeqConfigWithPast]]
[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
## ONNX 특징[[onnx-features]]
각 ONNX 설정은 다양한 유형의 토폴로지나 작업에 대해 모델을 내보낼 수 있게(exporting) 해주는 _features_ 세트와 연관되어 있습니다.

View File

@ -154,7 +154,7 @@ pip install schedulefree
[Schedule Free optimizer (SFO)](https://hf.co/papers/2405.15682)는 기본 옵티마이저의 모멘텀 대신 평균화(averaging)와 보간(interpolation)을 조합하여 사용합니다. 덕분에 기존의 학습률 스케줄러와 달리, SFO는 학습률을 점진적으로 낮추는 절차가 아예 필요 없습니다.
SFO는 RAdam(`schedule_free_radam`), AdamW(`schedule_free_adamw`), SGD(`schedule_free_sgd`) 옵티마이저를 지원합니다. RAdam 스케줄러는 `warmup_steps`.
SFO는 RAdam(`schedule_free_radam`), AdamW(`schedule_free_adamw`), SGD(`schedule_free_sgd`) 옵티마이저를 지원합니다. RAdam 스케줄러는 `warmup_steps``warmup_ratio` 설정이 필요하지 않습니다.
기본적으로 `lr_scheduler_type="constant"`로 설정하는 것을 권장합니다. 다른 `lr_scheduler_type` 값도 동작할 순 있으나, SFO 옵티마이저와 다른 학습률 스케줄을 함께 사용하면 SFO의 의도된 동작과 성능에 영향을 줄 수 있습니다.

View File

@ -37,7 +37,7 @@ pip install transformers[dev]
또는 Transformers 저장소 내에 편집 가능한 설치가 필요합니다:
```bash
pip install -e ".[dev]"
pip install -e .[dev]
```
Transformers의 선택적 종속성 수가 많이 늘어났기 때문에 개발 설치를 실패할 수도 있습니다. 개발 설치가 실패하는 경우, 작업 중인 Deep Learning 프레임워크 (PyTorch, TensorFlow 및/또는 Flax)를 설치하고 다음 명령을 실행하세요.
@ -49,7 +49,7 @@ pip install transformers[quality]
편집 가능한 설치의 경우는 다음 명령을 실행하세요.
```bash
pip install -e ".[quality]"
pip install -e .[quality]
```

View File

@ -47,7 +47,7 @@ ONNX 형식으로 내보낸 모델은 다음과 같이 사용할 수 있습니
🤗 Transformers 모델을 ONNX로 내보내려면 먼저 추가 종속성을 설치하세요:
```bash
pip install optimum-onnx
pip install optimum[exporters]
```
사용 가능한 모든 인수를 확인하려면 [🤗 Optimum 문서](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli)를 참조하거나 명령줄에서 도움말을 보세요.
@ -123,3 +123,59 @@ CLI 대신에 `optimum.onnxruntime`을 사용하여 프로그래밍 방식으로
### 지원되지 않는 아키텍처의 모델 내보내기 [[exporting-a-model-for-an-unsupported-architecture]]
현재 내보낼 수 없는 모델을 지원하기 위해 기여하려면, 먼저 [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview)에서 지원되는지 확인한 후 지원되지 않는 경우에는 [🤗 Optimum에 기여](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute)하세요.
### `transformers.onnx`를 사용하여 모델 내보내기 [[exporting-a-model-with-transformersonnx]]
<Tip warning={true}>
`tranformers.onnx`는 더 이상 유지되지 않습니다. 위에서 설명한 대로 🤗 Optimum을 사용하여 모델을 내보내세요. 이 섹션은 향후 버전에서 제거될 예정입니다.
</Tip>
🤗 Transformers 모델을 ONNX로 내보내려면 추가 종속성을 설치하세요:
```bash
pip install transformers[onnx]
```
`transformers.onnx` 패키지를 Python 모듈로 사용하여 준비된 구성을 사용하여 체크포인트를 내보냅니다:
```bash
python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
```
이렇게 하면 `--model` 인수에 정의된 체크포인트의 ONNX 그래프가 내보내집니다. 🤗 Hub에서 제공하는 체크포인트나 로컬에 저장된 체크포인트를 전달할 수 있습니다. 결과로 생성된 `model.onnx` 파일은 ONNX 표준을 지원하는 많은 가속기 중 하나에서 실행할 수 있습니다. 예를 들어, 다음과 같이 ONNX Runtime을 사용하여 모델을 로드하고 실행할 수 있습니다:
```python
>>> from transformers import AutoTokenizer
>>> from onnxruntime import InferenceSession
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
>>> session = InferenceSession("onnx/model.onnx")
>>> # ONNX Runtime expects NumPy arrays as input
>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
```
필요한 출력 이름(예: `["last_hidden_state"]`)은 각 모델의 ONNX 구성을 확인하여 얻을 수 있습니다. 예를 들어, DistilBERT의 경우 다음과 같습니다:
```python
>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
>>> config = DistilBertConfig()
>>> onnx_config = DistilBertOnnxConfig(config)
>>> print(list(onnx_config.outputs.keys()))
["last_hidden_state"]
```
Hub의 TensorFlow 체크포인트에 대해서도 동일한 프로세스가 적용됩니다. 예를 들어, 다음과 같이 순수한 TensorFlow 체크포인트를 내보냅니다:
```bash
python -m transformers.onnx --model=keras-io/transformers-qa onnx/
```
로컬에 저장된 모델을 내보내려면 모델의 가중치 파일과 토크나이저 파일을 동일한 디렉토리에 저장한 다음, transformers.onnx 패키지의 --model 인수를 원하는 디렉토리로 지정하여 ONNX로 내보냅니다:
```bash
python -m transformers.onnx --model=local-pt-checkpoint onnx/
```

View File

@ -221,7 +221,7 @@ MinDS-14 데이터 세트의 샘플링 속도는 8khz이므로(이 정보는 [
... gradient_accumulation_steps=4,
... per_device_eval_batch_size=32,
... num_train_epochs=10,
... warmup_steps=0.1,
... warmup_ratio=0.1,
... logging_steps=10,
... load_best_model_at_end=True,
... metric_for_best_model="accuracy",

View File

@ -212,7 +212,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티에
... gradient_accumulation_steps=4,
... per_device_eval_batch_size=16,
... num_train_epochs=3,
... warmup_steps=0.1,
... warmup_ratio=0.1,
... logging_steps=10,
... load_best_model_at_end=True,
... metric_for_best_model="accuracy",

View File

@ -357,7 +357,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it
... learning_rate=5e-5,
... per_device_train_batch_size=batch_size,
... per_device_eval_batch_size=batch_size,
... warmup_steps=0.1,
... warmup_ratio=0.1,
... logging_steps=10,
... load_best_model_at_end=True,
... metric_for_best_model="accuracy",

View File

@ -107,6 +107,8 @@
title: 模型
- local: main_classes/text_generation
title: 文本生成
- local: main_classes/onnx
title: ONNX
- local: main_classes/optimizer_schedules
title: Optimization
- local: main_classes/output

View File

@ -1206,7 +1206,7 @@ DeepSpeed支持`LRRangeTest`、`OneCycle`、`WarmupLR`和`WarmupDecayLR`学习
- 通过 `--lr_scheduler_type constant_with_warmup` 实现 `WarmupLR`
- 通过 `--lr_scheduler_type linear` 实现 `WarmupDecayLR`。这也是 `--lr_scheduler_type` 的默认值,因此,如果不配置调度器,这将是默认配置的调度器。
如果在配置文件中不配置 `scheduler` 条目,[`Trainer`] 将使用 `--lr_scheduler_type`、`--learning_rate` 和 `--warmup_steps` 的值来配置其🤗 Transformers 版本。
如果在配置文件中不配置 `scheduler` 条目,[`Trainer`] 将使用 `--lr_scheduler_type`、`--learning_rate` 和 `--warmup_steps` 或 `--warmup_ratio` 的值来配置其🤗 Transformers 版本。
以下是 `WarmupLR` 的自动配置示例:
@ -1227,7 +1227,7 @@ DeepSpeed支持`LRRangeTest`、`OneCycle`、`WarmupLR`和`WarmupDecayLR`学习
- `warmup_min_lr` 的值为 `0`。
- `warmup_max_lr` 的值为 `--learning_rate`。
- `warmup_num_steps` 的值为 `--warmup_steps`(如果提供)。
- `warmup_num_steps` 的值为 `--warmup_steps`(如果提供)。否则,将使用 `--warmup_ratio` 乘以训练步骤的数量,并四舍五入。
- `total_num_steps` 的值为 `--max_steps` 或者如果没有提供,将在运行时根据环境、数据集的大小和其他命令行参数(对于 `WarmupDecayLR` 来说需要)自动推导。
当然,您可以接管任何或所有的配置值,并自行设置这些值:

View File

@ -0,0 +1,45 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# 导出 🤗 Transformers 模型到 ONNX
🤗 Transformers提供了一个`transformers.onnx`通过利用配置对象您可以将模型checkpoints转换为ONNX图。
有关更多详细信息,请参阅导出 🤗 Transformers 模型的[指南](../serialization)。
## ONNX Configurations
我们提供了三个抽象类,取决于您希望导出的模型架构类型:
* 基于编码器的模型继承 [`~onnx.config.OnnxConfig`]
* 基于解码器的模型继承 [`~onnx.config.OnnxConfigWithPast`]
* 编码器-解码器模型继承 [`~onnx.config.OnnxSeq2SeqConfigWithPast`]
### OnnxConfig
[[autodoc]] onnx.config.OnnxConfig
### OnnxConfigWithPast
[[autodoc]] onnx.config.OnnxConfigWithPast
### OnnxSeq2SeqConfigWithPast
[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
## ONNX Features
每个ONNX配置与一组 _特性_ 相关联,使您能够为不同类型的拓扑结构或任务导出模型。

View File

@ -47,7 +47,7 @@ rendered properly in your Markdown viewer.
要将 🤗 Transformers 模型导出为 ONNX首先需要安装额外的依赖项
```bash
pip install optimum-onnx
pip install optimum[exporters]
```
请参阅 [🤗 Optimum 文档](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) 以查看所有可用参数,或者在命令行中查看帮助:
@ -117,3 +117,53 @@ optimum-cli export onnx --model local_path --task question-answering distilbert_
### 导出尚未支持的架构的模型
如果你想要为当前无法导出的模型添加支持,请先检查 [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview) 是否支持该模型,如果不支持,你可以 [直接为 🤗 Optimum 贡献代码](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute)。
### 使用 `transformers.onnx` 导出模型
<Tip warning={true}>
`transformers.onnx` 不再进行维护,请如上所述,使用 🤗 Optimum 导出模型。这部分内容将在未来版本中删除。
</Tip>
要使用 `transformers.onnx` 将 🤗 Transformers 模型导出为 ONNX请安装额外的依赖项
```bash
pip install transformers[onnx]
```
`transformers.onnx` 包作为 Python 模块使用,以使用现成的配置导出检查点:
```bash
python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
```
以上代码将导出由 `--model` 参数定义的检查点的 ONNX 图。传入任何 🤗 Hub 上或者存储与本地的检查点。生成的 `model.onnx` 文件可以在支持 ONNX 标准的众多加速引擎上运行。例如,使用 ONNX Runtime 加载并运行模型,如下所示:
```python
>>> from transformers import AutoTokenizer
>>> from onnxruntime import InferenceSession
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
>>> session = InferenceSession("onnx/model.onnx")
>>> # ONNX Runtime expects NumPy arrays as input
>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
```
可以通过查看每个模型的 ONNX 配置来获取所需的输出名(例如 `["last_hidden_state"]`)。例如,对于 DistilBERT可以用以下代码获取输出名称
```python
>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
>>> config = DistilBertConfig()
>>> onnx_config = DistilBertOnnxConfig(config)
>>> print(list(onnx_config.outputs.keys()))
["last_hidden_state"]
```
要导出本地存储的模型,请将模型的权重和分词器文件保存在同一目录中(例如 `local-pt-checkpoint`),然后通过将 `transformers.onnx` 包的 `--model` 参数指向该目录,将其导出为 ONNX
```bash
python -m transformers.onnx --model=local-pt-checkpoint onnx/
```

21
examples/legacy/README.md Normal file
View File

@ -0,0 +1,21 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Legacy examples
This folder contains examples which are not actively maintained (mostly contributed by the community).
Using these examples together with a recent version of the library usually requires to make small (sometimes big) adaptations to get the scripts working.

View File

@ -0,0 +1,26 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# 🤗 Benchmark results
Here, you can find a list of the different benchmark results created by the community.
If you would like to list benchmark results on your favorite models of the [model hub](https://huggingface.co/models) here, please open a Pull Request and add it below.
| Benchmark description | Results | Environment info | Author |
|:----------|:-------------|:-------------|------:|
| PyTorch Benchmark on inference for `google-bert/bert-base-cased` |[memory](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_memory.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) |
| PyTorch Benchmark on inference for `google-bert/bert-base-cased` |[time](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_time.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) |

View File

@ -0,0 +1,178 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Optional
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import ScalarFormatter
from transformers import HfArgumentParser
def list_field(default=None, metadata=None):
return field(default_factory=lambda: default, metadata=metadata)
@dataclass
class PlotArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
"""
csv_file: str = field(
metadata={"help": "The csv file to plot."},
)
plot_along_batch: bool = field(
default=False,
metadata={"help": "Whether to plot along batch size or sequence length. Defaults to sequence length."},
)
is_time: bool = field(
default=False,
metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
)
no_log_scale: bool = field(
default=False,
metadata={"help": "Disable logarithmic scale when plotting"},
)
is_train: bool = field(
default=False,
metadata={
"help": "Whether the csv file has training results or inference results. Defaults to inference results."
},
)
figure_png_file: Optional[str] = field(
default=None,
metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
)
short_model_names: Optional[list[str]] = list_field(
default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
)
def can_convert_to_int(string):
try:
int(string)
return True
except ValueError:
return False
def can_convert_to_float(string):
try:
float(string)
return True
except ValueError:
return False
class Plot:
def __init__(self, args):
self.args = args
self.result_dict = defaultdict(lambda: {"bsz": [], "seq_len": [], "result": {}})
with open(self.args.csv_file, newline="") as csv_file:
reader = csv.DictReader(csv_file)
for row in reader:
model_name = row["model"]
self.result_dict[model_name]["bsz"].append(int(row["batch_size"]))
self.result_dict[model_name]["seq_len"].append(int(row["sequence_length"]))
if can_convert_to_int(row["result"]):
# value is not None
self.result_dict[model_name]["result"][(int(row["batch_size"]), int(row["sequence_length"]))] = (
int(row["result"])
)
elif can_convert_to_float(row["result"]):
# value is not None
self.result_dict[model_name]["result"][(int(row["batch_size"]), int(row["sequence_length"]))] = (
float(row["result"])
)
def plot(self):
fig, ax = plt.subplots()
title_str = "Time usage" if self.args.is_time else "Memory usage"
title_str = title_str + " for training" if self.args.is_train else title_str + " for inference"
if not self.args.no_log_scale:
# set logarithm scales
ax.set_xscale("log")
ax.set_yscale("log")
for axis in [ax.xaxis, ax.yaxis]:
axis.set_major_formatter(ScalarFormatter())
for model_name_idx, model_name in enumerate(self.result_dict.keys()):
batch_sizes = sorted(set(self.result_dict[model_name]["bsz"]))
sequence_lengths = sorted(set(self.result_dict[model_name]["seq_len"]))
results = self.result_dict[model_name]["result"]
(x_axis_array, inner_loop_array) = (
(batch_sizes, sequence_lengths) if self.args.plot_along_batch else (sequence_lengths, batch_sizes)
)
label_model_name = (
model_name if self.args.short_model_names is None else self.args.short_model_names[model_name_idx]
)
for inner_loop_value in inner_loop_array:
if self.args.plot_along_batch:
y_axis_array = np.asarray(
[results[(x, inner_loop_value)] for x in x_axis_array if (x, inner_loop_value) in results],
dtype=int,
)
else:
y_axis_array = np.asarray(
[results[(inner_loop_value, x)] for x in x_axis_array if (inner_loop_value, x) in results],
dtype=np.float32,
)
(x_axis_label, inner_loop_label) = (
("batch_size", "len") if self.args.plot_along_batch else ("in #tokens", "bsz")
)
x_axis_array = np.asarray(x_axis_array, int)[: len(y_axis_array)]
plt.scatter(
x_axis_array, y_axis_array, label=f"{label_model_name} - {inner_loop_label}: {inner_loop_value}"
)
plt.plot(x_axis_array, y_axis_array, "--")
title_str += f" {label_model_name} vs."
title_str = title_str[:-4]
y_axis_label = "Time in s" if self.args.is_time else "Memory in MB"
# plot
plt.title(title_str)
plt.xlabel(x_axis_label)
plt.ylabel(y_axis_label)
plt.legend()
if self.args.figure_png_file is not None:
plt.savefig(self.args.figure_png_file)
else:
plt.show()
def main():
parser = HfArgumentParser(PlotArguments)
plot_args = parser.parse_args_into_dataclasses()[0]
plot = Plot(args=plot_args)
plot.plot()
if __name__ == "__main__":
main()

View File

@ -0,0 +1 @@
torch >= 1.3

Some files were not shown because too many files have changed in this diff Show More