mirror of
https://github.com/huggingface/transformers.git
synced 2025-11-12 01:04:36 +08:00
Compare commits
10 Commits
copilot/su
...
remove-use
| Author | SHA1 | Date | |
|---|---|---|---|
| e1f5ad7297 | |||
| 22137bf322 | |||
| 61560b295a | |||
| 74410d8c78 | |||
| 863cc2e234 | |||
| a2eab57312 | |||
| de014f92c2 | |||
| 16a25c15ff | |||
| 0911196aa6 | |||
| 19166645f3 |
1
.github/scripts/codeowners_for_review_action
vendored
1
.github/scripts/codeowners_for_review_action
vendored
@ -22,6 +22,7 @@ tests/generation/ @gante
|
||||
/src/transformers/models/auto/ @ArthurZucker
|
||||
/src/transformers/utils/ @ArthurZucker @Rocketknight1
|
||||
/src/transformers/loss/ @ArthurZucker
|
||||
/src/transformers/onnx/ @michaelbenayoun
|
||||
|
||||
# Specific files come after the sections/globs, so they take priority
|
||||
/.circleci/config.yml @ArthurZucker @ydshieh
|
||||
|
||||
33
.github/workflows/benchmark.yml
vendored
33
.github/workflows/benchmark.yml
vendored
@ -1,10 +1,7 @@
|
||||
name: Self-hosted runner (benchmark)
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
types: [ opened, labeled, reopened, synchronize ]
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
@ -12,8 +9,6 @@ concurrency:
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
DATASET_ID: hf-benchmarks/transformers
|
||||
MODEL_ID: meta-llama/Llama-3.1-8B-Instruct
|
||||
|
||||
jobs:
|
||||
benchmark:
|
||||
@ -28,7 +23,7 @@ jobs:
|
||||
(github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark') )||
|
||||
(github.event_name == 'push' && github.ref == 'refs/heads/main')
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
image: huggingface/transformers-pytorch-gpu
|
||||
options: --gpus all --privileged --ipc host
|
||||
steps:
|
||||
- name: Get repo
|
||||
@ -36,12 +31,26 @@ jobs:
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
|
||||
- name: Install libpq-dev & psql
|
||||
run: |
|
||||
apt update
|
||||
apt install -y libpq-dev postgresql-client
|
||||
|
||||
- name: Install benchmark script dependencies
|
||||
run: python3 -m pip install -r benchmark_v2/requirements.txt kernels
|
||||
run: python3 -m pip install -r benchmark/requirements.txt
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]" && python3 -m pip uninstall -y torchvision # temp fix
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]"
|
||||
|
||||
- name: Run database init script
|
||||
run: |
|
||||
psql -f benchmark/utils/init_db.sql
|
||||
env:
|
||||
PGDATABASE: metrics
|
||||
PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
|
||||
PGUSER: transformers_benchmarks
|
||||
PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }}
|
||||
|
||||
- name: Run benchmark
|
||||
run: |
|
||||
@ -52,11 +61,13 @@ jobs:
|
||||
commit_id=$GITHUB_SHA
|
||||
fi
|
||||
commit_msg=$(git show -s --format=%s | cut -c1-70)
|
||||
python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --level 2 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
|
||||
python3 benchmark/benchmarks_entrypoint.py "huggingface/transformers" "$BRANCH_NAME" "$commit_id" "$commit_msg"
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
PUSH_TO_HUB_TOKEN: ${{ secrets.PUSH_TO_HUB_TOKEN }}
|
||||
# Enable this to see debug logs
|
||||
# HF_HUB_VERBOSITY: debug
|
||||
# TRANSFORMERS_VERBOSITY: debug
|
||||
PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
|
||||
PGUSER: transformers_benchmarks
|
||||
PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }}
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
@ -9,7 +9,7 @@ jobs:
|
||||
uses: ./.github/workflows/benchmark_v2.yml
|
||||
with:
|
||||
runner: aws-g5-4xlarge-cache-use1-public-80
|
||||
container_image: huggingface/transformers-all-latest-gpu
|
||||
container_image: huggingface/transformers-pytorch-gpu
|
||||
container_options: --gpus all --privileged --ipc host --shm-size "16gb"
|
||||
commit_sha: ${{ github.sha }}
|
||||
run_id: ${{ github.run_id }}
|
||||
|
||||
184
.github/workflows/build-docker-images.yml
vendored
184
.github/workflows/build-docker-images.yml
vendored
@ -45,59 +45,33 @@ jobs:
|
||||
REF=main
|
||||
push: true
|
||||
tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
with:
|
||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||
title: 🤗 Results of the transformers-all-latest-gpu docker build
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
flash-attn-ci-image:
|
||||
name: "PyTorch with Flash Attn [dev]"
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
# Push CI images still need to be re-built daily
|
||||
-
|
||||
name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
-
|
||||
name: Check out code
|
||||
uses: actions/checkout@v4
|
||||
-
|
||||
name: Login to DockerHub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
-
|
||||
name: Build and push
|
||||
name: Build and push (for Push CI) in a daily basis
|
||||
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
|
||||
# The later case is useful for manual image building for debugging purpose. Use another tag in this case!
|
||||
if: inputs.image_postfix != '-push-ci'
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./docker/transformers-all-latest-gpu
|
||||
build-args: |
|
||||
REF=main
|
||||
PYTORCH=2.8.0
|
||||
TORCHCODEC=0.7.0
|
||||
FLASH_ATTN=yes
|
||||
push: true
|
||||
tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}:flash-attn
|
||||
tags: huggingface/transformers-all-latest-gpu-push-ci
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
with:
|
||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||
title: 🤗 Results of the transformers-all-latest-gpu docker build
|
||||
title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
latest-torch-deepspeed-docker:
|
||||
name: "Latest PyTorch + DeepSpeed"
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
group: aws-g4dn-2xlarge-cache
|
||||
steps:
|
||||
-
|
||||
name: Set up Docker Buildx
|
||||
@ -130,8 +104,51 @@ jobs:
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
# Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
|
||||
latest-torch-deepspeed-docker-for-push-ci-daily-build:
|
||||
name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
-
|
||||
name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
-
|
||||
name: Check out code
|
||||
uses: actions/checkout@v4
|
||||
-
|
||||
name: Login to DockerHub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
# Push CI images still need to be re-built daily
|
||||
-
|
||||
name: Build and push (for Push CI) in a daily basis
|
||||
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
|
||||
# The later case is useful for manual image building for debugging purpose. Use another tag in this case!
|
||||
if: inputs.image_postfix != '-push-ci'
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./docker/transformers-pytorch-deepspeed-latest-gpu
|
||||
build-args: |
|
||||
REF=main
|
||||
push: true
|
||||
tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
with:
|
||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
doc-builder:
|
||||
name: "Doc builder"
|
||||
# Push CI doesn't need this image
|
||||
if: inputs.image_postfix != '-push-ci'
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
@ -164,6 +181,44 @@ jobs:
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
latest-pytorch:
|
||||
name: "Latest PyTorch [dev]"
|
||||
# Push CI doesn't need this image
|
||||
if: inputs.image_postfix != '-push-ci'
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
-
|
||||
name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
-
|
||||
name: Check out code
|
||||
uses: actions/checkout@v4
|
||||
-
|
||||
name: Login to DockerHub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
-
|
||||
name: Build and push
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./docker/transformers-pytorch-gpu
|
||||
build-args: |
|
||||
REF=main
|
||||
push: true
|
||||
tags: huggingface/transformers-pytorch-gpu
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
with:
|
||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||
title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
latest-pytorch-amd:
|
||||
name: "Latest PyTorch (AMD) [dev]"
|
||||
runs-on:
|
||||
@ -190,47 +245,29 @@ jobs:
|
||||
REF=main
|
||||
push: true
|
||||
tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
|
||||
# Push CI images still need to be re-built daily
|
||||
-
|
||||
name: Build and push (for Push CI) in a daily basis
|
||||
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
|
||||
# The later case is useful for manual image building for debugging purpose. Use another tag in this case!
|
||||
if: inputs.image_postfix != '-push-ci'
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./docker/transformers-pytorch-amd-gpu
|
||||
build-args: |
|
||||
REF=main
|
||||
push: true
|
||||
tags: huggingface/transformers-pytorch-amd-gpu-push-ci
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
with:
|
||||
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
|
||||
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu build
|
||||
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
|
||||
status: ${{ job.status }}
|
||||
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
|
||||
cache-latest-pytorch-amd:
|
||||
name: "Cache Latest Pytorch (AMD) Image"
|
||||
needs: latest-pytorch-amd
|
||||
runs-on:
|
||||
group: amd-mi325-1gpu
|
||||
steps:
|
||||
-
|
||||
name: Login to DockerHub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
|
||||
-
|
||||
name: Pull and save docker image to cache
|
||||
run: |
|
||||
image="huggingface/transformers-pytorch-amd-gpu"
|
||||
final_path="/mnt/image-cache/transformers-pytorch-amd-gpu.tar"
|
||||
tmp_path="${final_path}.tmp"
|
||||
|
||||
echo "Pulling image: ${image}"
|
||||
docker pull "${image}"
|
||||
|
||||
echo "Saving to temp file: ${tmp_path}"
|
||||
docker save "${image}" -o "${tmp_path}"
|
||||
|
||||
echo "Moving to final path: ${final_path}"
|
||||
mv -f "${tmp_path}" "${final_path}"
|
||||
|
||||
echo "Cache populated successfully at ${final_path}"
|
||||
|
||||
latest-pytorch-deepspeed-amd:
|
||||
name: "PyTorch + DeepSpeed (AMD) [dev]"
|
||||
runs-on:
|
||||
@ -257,6 +294,19 @@ jobs:
|
||||
REF=main
|
||||
push: true
|
||||
tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
|
||||
# Push CI images still need to be re-built daily
|
||||
-
|
||||
name: Build and push (for Push CI) in a daily basis
|
||||
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
|
||||
# The later case is useful for manual image building for debugging purpose. Use another tag in this case!
|
||||
if: inputs.image_postfix != '-push-ci'
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./docker/transformers-pytorch-deepspeed-amd-gpu
|
||||
build-args: |
|
||||
REF=main
|
||||
push: true
|
||||
tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
@ -269,6 +319,8 @@ jobs:
|
||||
|
||||
latest-quantization-torch-docker:
|
||||
name: "Latest Pytorch + Quantization [dev]"
|
||||
# Push CI doesn't need this image
|
||||
if: inputs.image_postfix != '-push-ci'
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
|
||||
171
.github/workflows/check_failed_tests.yml
vendored
171
.github/workflows/check_failed_tests.yml
vendored
@ -6,6 +6,9 @@ on:
|
||||
docker:
|
||||
required: true
|
||||
type: string
|
||||
start_sha:
|
||||
required: true
|
||||
type: string
|
||||
job:
|
||||
required: true
|
||||
type: string
|
||||
@ -21,13 +24,7 @@ on:
|
||||
commit_sha:
|
||||
required: false
|
||||
type: string
|
||||
pr_number:
|
||||
required: false
|
||||
type: string
|
||||
outputs:
|
||||
report:
|
||||
description: "Content of the report of new failures"
|
||||
value: ${{ jobs.process_new_failures_with_commit_info.outputs.report }}
|
||||
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
@ -44,14 +41,9 @@ env:
|
||||
|
||||
jobs:
|
||||
check_new_failures:
|
||||
name: "Find commits for new failing tests"
|
||||
strategy:
|
||||
matrix:
|
||||
run_idx: [1]
|
||||
name: " "
|
||||
runs-on:
|
||||
group: aws-g5-4xlarge-cache
|
||||
outputs:
|
||||
process: ${{ steps.check_file.outputs.process }}
|
||||
container:
|
||||
image: ${{ inputs.docker }}
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
@ -62,17 +54,14 @@ jobs:
|
||||
path: /transformers/ci_results_${{ inputs.job }}
|
||||
|
||||
- name: Check file
|
||||
id: check_file
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then
|
||||
echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..."
|
||||
echo "process=true" >> $GITHUB_ENV
|
||||
echo "process=true" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort."
|
||||
echo "process=false" >> $GITHUB_ENV
|
||||
echo "process=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- uses: actions/download-artifact@v4
|
||||
@ -91,55 +80,27 @@ jobs:
|
||||
echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
if [ -f setup_values/other_workflow_run_id.txt ]; then
|
||||
echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV
|
||||
else
|
||||
echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
if: ${{ env.process == 'true' }}
|
||||
run: |
|
||||
git fetch origin ${{ inputs.commit_sha || github.sha }}
|
||||
git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
- name: Get `START_SHA`
|
||||
- name: Get target commit
|
||||
working-directory: /transformers/utils
|
||||
if: ${{ env.process == 'true' }}
|
||||
run: |
|
||||
echo "START_SHA=${{ inputs.commit_sha || github.sha }}" >> $GITHUB_ENV
|
||||
|
||||
# This is used if the CI is triggered from a pull request `self-comment-ci.yml` (after security check is verified)
|
||||
- name: Extract the base commit on `main` (of the merge commit created by Github) if it is a PR
|
||||
id: pr_info
|
||||
if: ${{ env.process == 'true' && inputs.pr_number != '' }}
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
script: |
|
||||
const { data: pr } = await github.rest.pulls.get({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
pull_number: ${{ inputs.pr_number }}
|
||||
});
|
||||
|
||||
const { data: merge_commit } = await github.rest.repos.getCommit({
|
||||
owner: pr.base.repo.owner.login,
|
||||
repo: pr.base.repo.name,
|
||||
ref: pr.merge_commit_sha,
|
||||
});
|
||||
|
||||
core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);
|
||||
|
||||
# Usually, `END_SHA` should be the commit of the last previous workflow run of the **SAME** (scheduled) workflow.
|
||||
# (This is why we don't need to specify `workflow_id` which would be fetched automatically in the python script.)
|
||||
- name: Get `END_SHA` from previous CI runs of the same workflow
|
||||
working-directory: /transformers/utils
|
||||
if: ${{ env.process == 'true' && inputs.pr_number == '' }}
|
||||
run: |
|
||||
echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV
|
||||
|
||||
# However, for workflow runs triggered by `issue_comment` (for pull requests), we want to check against the
|
||||
# parent commit (on `main`) of the `merge_commit` (dynamically created by GitHub). In this case, the goal is to
|
||||
# see if a reported failing test is actually ONLY failing on the `merge_commit`.
|
||||
- name: Set `END_SHA`
|
||||
if: ${{ env.process == 'true' && inputs.pr_number != '' }}
|
||||
run: |
|
||||
echo "END_SHA=${{ steps.pr_info.outputs.merge_commit_base_sha }}" >> $GITHUB_ENV
|
||||
- name: Checkout to `start_sha`
|
||||
working-directory: /transformers
|
||||
if: ${{ env.process == 'true' }}
|
||||
run: git fetch && git checkout ${{ inputs.start_sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
@ -157,10 +118,6 @@ jobs:
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Install pytest-flakefinder
|
||||
if: ${{ env.process == 'true' }}
|
||||
run: python3 -m pip install pytest-flakefinder
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
if: ${{ env.process == 'true' }}
|
||||
@ -169,67 +126,37 @@ jobs:
|
||||
- name: Check failed tests
|
||||
working-directory: /transformers
|
||||
if: ${{ env.process == 'true' }}
|
||||
run: python3 utils/check_bad_commit.py --start_commit ${{ env.START_SHA }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
|
||||
run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit.json
|
||||
|
||||
- name: Show results
|
||||
working-directory: /transformers
|
||||
if: ${{ env.process == 'true' }}
|
||||
run: |
|
||||
ls -l new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
|
||||
cat new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
|
||||
ls -l new_failures_with_bad_commit.json
|
||||
cat new_failures_with_bad_commit.json
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}
|
||||
path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
|
||||
|
||||
process_new_failures_with_commit_info:
|
||||
name: "process bad commit reports"
|
||||
needs: check_new_failures
|
||||
if: needs.check_new_failures.outputs.process == 'true'
|
||||
runs-on:
|
||||
group: aws-g5-4xlarge-cache
|
||||
outputs:
|
||||
report: ${{ steps.set_output.outputs.report }}
|
||||
container:
|
||||
image: ${{ inputs.docker }}
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: ci_results_${{ inputs.job }}
|
||||
path: /transformers/ci_results_${{ inputs.job }}
|
||||
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
pattern: new_failures_with_bad_commit_${{ inputs.job }}*
|
||||
path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}
|
||||
merge-multiple: true
|
||||
|
||||
- name: Check files
|
||||
- name: Checkout back
|
||||
working-directory: /transformers
|
||||
if: ${{ env.process == 'true' }}
|
||||
run: |
|
||||
ls -la /transformers
|
||||
ls -la /transformers/new_failures_with_bad_commit_${{ inputs.job }}
|
||||
|
||||
# Currently, we only run with a single runner by using `run_idx: [1]`. We might try to run with multiple runners
|
||||
# to further reduce the false positive caused by flaky tests, which requires further processing to merge reports.
|
||||
- name: Merge files
|
||||
shell: bash
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
cp /transformers/new_failures_with_bad_commit_${{ inputs.job }}/new_failures_with_bad_commit_${{ inputs.job }}_1.json new_failures_with_bad_commit.json
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
git fetch origin ${{ inputs.commit_sha || github.sha }}
|
||||
git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
git checkout ${{ inputs.start_sha }}
|
||||
|
||||
- name: Process report
|
||||
shell: bash
|
||||
working-directory: /transformers
|
||||
if: ${{ env.process == 'true' }}
|
||||
env:
|
||||
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
|
||||
JOB_NAME: ${{ inputs.job }}
|
||||
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
|
||||
run: |
|
||||
python3 utils/process_bad_commit_report.py
|
||||
|
||||
- name: Process report
|
||||
shell: bash
|
||||
working-directory: /transformers
|
||||
if: ${{ env.process == 'true' }}
|
||||
env:
|
||||
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
|
||||
@ -242,37 +169,15 @@ jobs:
|
||||
echo EOF
|
||||
} >> "$GITHUB_ENV"
|
||||
|
||||
# The output is useful if a caller needs more processing, for example, we have a chain
|
||||
# self-comment-ci.yml -> self-scheduled.yml -> this one (check_failed_tests.yml),
|
||||
# and `self-comment-ci.yml` needs further processing before sending a GitHub comment to the pull request page.
|
||||
- name: Show results & Set outputs
|
||||
id: set_output
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
ls -l new_failures_with_bad_commit.json
|
||||
cat new_failures_with_bad_commit.json
|
||||
|
||||
{
|
||||
echo 'report<<EOF'
|
||||
cat new_failures_with_bad_commit.json
|
||||
echo '' # Force a newline
|
||||
echo EOF
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: new_failures_with_bad_commit_${{ inputs.job }}
|
||||
path: /transformers/new_failures_with_bad_commit.json
|
||||
|
||||
- name: Prepare Slack report title
|
||||
working-directory: /transformers
|
||||
if: ${{ env.process == 'true' }}
|
||||
run: |
|
||||
pip install slack_sdk
|
||||
echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV
|
||||
|
||||
- name: Send processed report
|
||||
if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}
|
||||
if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }}
|
||||
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
|
||||
with:
|
||||
# Slack channel id, channel name, or user id to post message.
|
||||
|
||||
9
.github/workflows/get-pr-info.yml
vendored
9
.github/workflows/get-pr-info.yml
vendored
@ -39,9 +39,6 @@ on:
|
||||
PR_MERGE_COMMIT_SHA:
|
||||
description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
|
||||
PR_MERGE_COMMIT_BASE_SHA:
|
||||
description: "The sha of the parent commit of the the merge commit on the target branch in the base repository"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_BASE_SHA }}
|
||||
PR_HEAD_COMMIT_DATE:
|
||||
description: "The date of the head sha of the pull request branch in the head repository"
|
||||
value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }}
|
||||
@ -77,7 +74,6 @@ jobs:
|
||||
PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }}
|
||||
PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }}
|
||||
PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }}
|
||||
PR_MERGE_COMMIT_BASE_SHA: ${{ steps.pr_info.outputs.merge_commit_base_sha }}
|
||||
PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }}
|
||||
PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
|
||||
PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}
|
||||
@ -126,7 +122,6 @@ jobs:
|
||||
core.setOutput('base_ref', pr.base.ref);
|
||||
core.setOutput('head_sha', pr.head.sha);
|
||||
core.setOutput('base_sha', pr.base.sha);
|
||||
core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);
|
||||
core.setOutput('merge_commit_sha', pr.merge_commit_sha);
|
||||
core.setOutput('pr', pr);
|
||||
|
||||
@ -147,10 +142,6 @@ jobs:
|
||||
date: merge_commit.commit.committer.date
|
||||
});
|
||||
|
||||
console.log('PR Info:', {
|
||||
pr_info: pr
|
||||
});
|
||||
|
||||
- name: Convert dates to timestamps
|
||||
id: get_timestamps
|
||||
run: |
|
||||
|
||||
11
.github/workflows/model_jobs.yml
vendored
11
.github/workflows/model_jobs.yml
vendored
@ -28,9 +28,6 @@ on:
|
||||
report_repo_id:
|
||||
required: false
|
||||
type: string
|
||||
pytest_marker:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
@ -80,9 +77,7 @@ jobs:
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
git fetch origin ${{ inputs.commit_sha || github.sha }}
|
||||
git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
@ -142,7 +137,7 @@ jobs:
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v -m '${{ inputs.pytest_marker }}' --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
|
||||
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
|
||||
ls -la
|
||||
# Extract the exit code from the output file
|
||||
EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)
|
||||
@ -176,7 +171,7 @@ jobs:
|
||||
|
||||
collated_reports:
|
||||
name: Collated Reports
|
||||
if: ${{ always() && inputs.runner_type != '' }}
|
||||
if: ${{ always() }}
|
||||
needs: run_models_gpu
|
||||
uses: huggingface/transformers/.github/workflows/collated-reports.yml@main
|
||||
with:
|
||||
|
||||
4
.github/workflows/push-important-models.yml
vendored
4
.github/workflows/push-important-models.yml
vendored
@ -149,9 +149,9 @@ jobs:
|
||||
with:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#transformers-ci-push"
|
||||
docker: huggingface/transformers-all-latest-gpu:flash-attn
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: push
|
||||
report_repo_id: hf-internal-testing/transformers_ci_push
|
||||
commit_sha: ${{ github.sha }}
|
||||
subdirs: ${{ needs.get_modified_models.outputs.matrix }}
|
||||
models: ${{ needs.get_modified_models.outputs.matrix }}
|
||||
secrets: inherit
|
||||
|
||||
481
.github/workflows/self-comment-ci.yml
vendored
481
.github/workflows/self-comment-ci.yml
vendored
@ -23,34 +23,62 @@ env:
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
|
||||
|
||||
jobs:
|
||||
get-pr-number:
|
||||
runs-on: ubuntu-22.04
|
||||
name: Get PR number
|
||||
# For security: only allow team members to run
|
||||
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
|
||||
uses: ./.github/workflows/get-pr-number.yml
|
||||
outputs:
|
||||
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
|
||||
steps:
|
||||
- name: Get PR number
|
||||
shell: bash
|
||||
run: |
|
||||
if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
|
||||
echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
|
||||
else
|
||||
echo "PR_NUMBER=" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
get-pr-info:
|
||||
name: Get PR commit SHA
|
||||
- name: Check PR number
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ env.PR_NUMBER }}"
|
||||
|
||||
- name: Set PR number
|
||||
id: set_pr_number
|
||||
run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
get-sha:
|
||||
runs-on: ubuntu-22.04
|
||||
needs: get-pr-number
|
||||
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
|
||||
uses: ./.github/workflows/get-pr-info.yml
|
||||
with:
|
||||
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
|
||||
|
||||
check-timestamps:
|
||||
name: Check timestamps (security check)
|
||||
runs-on: ubuntu-22.04
|
||||
needs: get-pr-info
|
||||
outputs:
|
||||
PR_HEAD_SHA: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}
|
||||
PR_MERGE_SHA: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
|
||||
PR_HEAD_SHA: ${{ steps.get_sha.outputs.PR_HEAD_SHA }}
|
||||
PR_MERGE_SHA: ${{ steps.get_sha.outputs.PR_MERGE_SHA }}
|
||||
steps:
|
||||
- name: Verify `merge_commit` timestamp is older than the issue comment timestamp
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: "0"
|
||||
ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
|
||||
|
||||
- name: Get SHA (and verify timestamps against the issue comment date)
|
||||
id: get_sha
|
||||
env:
|
||||
PR_NUMBER: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
|
||||
COMMENT_DATE: ${{ github.event.comment.created_at }}
|
||||
PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
|
||||
run: |
|
||||
git fetch origin refs/pull/$PR_NUMBER/head:refs/remotes/pull/$PR_NUMBER/head
|
||||
git checkout refs/remotes/pull/$PR_NUMBER/head
|
||||
echo "PR_HEAD_SHA: $(git log -1 --format=%H)"
|
||||
echo "PR_HEAD_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
|
||||
git fetch origin refs/pull/$PR_NUMBER/merge:refs/remotes/pull/$PR_NUMBER/merge
|
||||
git checkout refs/remotes/pull/$PR_NUMBER/merge
|
||||
echo "PR_MERGE_SHA: $(git log -1 --format=%H)"
|
||||
echo "PR_MERGE_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
|
||||
PR_MERGE_COMMIT_TIMESTAMP=$(git log -1 --date=unix --format=%cd)
|
||||
echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
|
||||
COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
|
||||
echo "COMMENT_DATE: $COMMENT_DATE"
|
||||
echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
|
||||
@ -59,10 +87,13 @@ jobs:
|
||||
exit -1;
|
||||
fi
|
||||
|
||||
# use a python script to handle this complex logic.
|
||||
# use a python script to handle this complex logic
|
||||
# case 1: `run-slow` (auto. infer with limited number of models, but in particular, new model)
|
||||
# case 2: `run-slow model_1, model_2`
|
||||
get-tests:
|
||||
runs-on: ubuntu-22.04
|
||||
needs: [get-pr-number, check-timestamps]
|
||||
needs: [get-pr-number, get-sha]
|
||||
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
|
||||
outputs:
|
||||
models: ${{ steps.models_to_run.outputs.models }}
|
||||
quantizations: ${{ steps.models_to_run.outputs.quantizations }}
|
||||
@ -70,11 +101,11 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: "0"
|
||||
ref: "refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge"
|
||||
ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
|
||||
|
||||
- name: Verify merge commit SHA
|
||||
env:
|
||||
VERIFIED_PR_MERGE_SHA: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
|
||||
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
|
||||
run: |
|
||||
PR_MERGE_SHA=$(git log -1 --format=%H)
|
||||
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
|
||||
@ -88,39 +119,19 @@ jobs:
|
||||
run: |
|
||||
python -m pip install GitPython
|
||||
python utils/pr_slow_ci_models.py --message "$PR_COMMENT" | tee output.txt
|
||||
echo 'models=$(tail -n 1 output.txt)' >> $GITHUB_ENV
|
||||
echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV
|
||||
python utils/pr_slow_ci_models.py --message "$PR_COMMENT" --quantization | tee output2.txt
|
||||
echo 'quantizations=$(tail -n 1 output2.txt)' >> $GITHUB_ENV
|
||||
echo "quantizations=$(tail -n 1 output2.txt)" >> $GITHUB_ENV
|
||||
|
||||
- name: Show models to test
|
||||
id: models_to_run
|
||||
run: |
|
||||
echo "${{ env.models }}"
|
||||
echo "models=${{ env.models }}" >> $GITHUB_ENV
|
||||
echo "models=${{ env.models }}" >> $GITHUB_OUTPUT
|
||||
echo "${{ env.quantizations }}"
|
||||
echo "quantizations=${{ env.quantizations }}" >> $GITHUB_OUTPUT
|
||||
|
||||
# Report back if we are not able to get the tests (for example, security check is failing)
|
||||
report_error_earlier:
|
||||
name: Report error earlier
|
||||
if: ${{ always() && needs.get-pr-info.result == 'success' && needs.get-tests.result != 'success' }}
|
||||
needs: [get-pr-number, get-pr-info, get-tests]
|
||||
permissions:
|
||||
pull-requests: write
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Reply to the comment
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
run: |
|
||||
gh api \
|
||||
--method POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
|
||||
-f body="💔 This comment contains \`run-slow\`, but unknown error occurred and [the workflow run]($GITHUB_RUN_URL) aborted!"
|
||||
|
||||
reply_to_comment:
|
||||
name: Reply to the comment
|
||||
if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }}
|
||||
@ -132,18 +143,20 @@ jobs:
|
||||
- name: Reply to the comment
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
BODY: '\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}'
|
||||
MODELS: ${{ needs.get-tests.outputs.models }}
|
||||
BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
|
||||
run: |
|
||||
gh api \
|
||||
--method POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
|
||||
-f body="This comment contains \`run-slow\`, running the specified jobs: $(echo -e '${{ env.BODY }}')"
|
||||
-f "body=This comment contains run-slow, running the specified jobs: ${{ env.BODY }} ..."
|
||||
|
||||
create_run:
|
||||
name: Create run
|
||||
needs: [check-timestamps, reply_to_comment]
|
||||
if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }}
|
||||
needs: [get-sha, get-tests, reply_to_comment]
|
||||
permissions:
|
||||
statuses: write
|
||||
runs-on: ubuntu-22.04
|
||||
@ -160,179 +173,243 @@ jobs:
|
||||
--method POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
repos/${{ github.repository }}/statuses/${{ needs.check-timestamps.outputs.PR_HEAD_SHA }} \
|
||||
repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
|
||||
-f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests"
|
||||
|
||||
model-ci:
|
||||
name: Model CI
|
||||
run_models_gpu:
|
||||
name: Run all tests for the model
|
||||
if: ${{ needs.get-tests.outputs.models != '[]' }}
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
needs: [get-pr-number, check-timestamps, get-tests, create_run]
|
||||
with:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#transformers-ci-pr"
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: PR Comment CI
|
||||
report_repo_id: hf-internal-testing/transformers_pr_ci
|
||||
commit_sha: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
|
||||
subdirs: ${{ needs.get-tests.outputs.models }}
|
||||
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
|
||||
secrets: inherit
|
||||
needs: [get-pr-number, get-sha, get-tests, create_run]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.get-tests.outputs.models) }}
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Echo input and matrix info
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
|
||||
quantization-ci:
|
||||
name: Quantization CI
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Checkout to PR merge commit
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
|
||||
git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
|
||||
git log -1 --format=%H
|
||||
|
||||
- name: Verify merge commit SHA
|
||||
env:
|
||||
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
PR_MERGE_SHA=$(git log -1 --format=%H)
|
||||
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
|
||||
echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
|
||||
exit -1;
|
||||
fi
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: /transformers
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "$machine_type"
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})"
|
||||
echo $CUDA_VISIBLE_DEVICES
|
||||
python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
|
||||
- name: Make sure report directory exists
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
||||
echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
|
||||
echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
||||
|
||||
run_quantization_torch_gpu:
|
||||
name: Run all tests for a quantization
|
||||
if: ${{ needs.get-tests.outputs.quantizations != '[]' }}
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
needs: [get-pr-number, check-timestamps, get-tests, create_run]
|
||||
with:
|
||||
job: run_quantization_torch_gpu
|
||||
slack_report_channel: "#transformers-ci-pr"
|
||||
docker: huggingface/transformers-quantization-latest-gpu
|
||||
ci_event: PR Comment CI
|
||||
report_repo_id: hf-internal-testing/transformers_pr_ci
|
||||
commit_sha: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
|
||||
subdirs: ${{ needs.get-tests.outputs.quantizations }}
|
||||
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
|
||||
secrets: inherit
|
||||
needs: [get-pr-number, get-sha, get-tests, create_run]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-quantization-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
report:
|
||||
name: Check & Report
|
||||
needs: [get-pr-number, check-timestamps, create_run, model-ci, quantization-ci]
|
||||
- name: Checkout to PR merge commit
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
|
||||
git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
|
||||
git log -1 --format=%H
|
||||
|
||||
- name: Verify merge commit SHA
|
||||
env:
|
||||
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
PR_MERGE_SHA=$(git log -1 --format=%H)
|
||||
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
|
||||
echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
|
||||
exit -1;
|
||||
fi
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: /transformers
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "$machine_type"
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run quantization tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
|
||||
- name: Make sure report directory exists
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports
|
||||
echo "hello" > /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports/hello.txt
|
||||
echo "${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
|
||||
|
||||
update_run_status:
|
||||
name: Update Check Run Status
|
||||
needs: [get-sha, create_run, run_models_gpu, run_quantization_torch_gpu]
|
||||
permissions:
|
||||
pull-requests: write
|
||||
statuses: write
|
||||
if: ${{ always() && needs.create_run.result == 'success' }}
|
||||
runs-on: ubuntu-22.04
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.run_models_gpu.result) && contains(fromJSON('["skipped", "success"]'), needs.run_quantization_torch_gpu.result) }}
|
||||
steps:
|
||||
- name: Show reports from jobs
|
||||
- name: Get `run_models_gpu` job status
|
||||
run: |
|
||||
echo "${{ needs.model-ci.outputs.report }}"
|
||||
echo "${{ needs.quantization-ci.outputs.report }}"
|
||||
|
||||
- name: Process and filter reports
|
||||
env:
|
||||
MODEL_REPORT: ${{ needs.model-ci.outputs.report }}
|
||||
QUANT_REPORT: ${{ needs.quantization-ci.outputs.report }}
|
||||
run: |
|
||||
# Preprocess with Python
|
||||
python3 << 'PYTHON_SCRIPT'
|
||||
import json
|
||||
import os
|
||||
|
||||
def filter_and_format_report(data):
|
||||
"""
|
||||
Filter out entries where commit is `None` (failing tests who status is not certain) and format as text
|
||||
"""
|
||||
lines = []
|
||||
|
||||
for model, model_result in data.items():
|
||||
model_lines = []
|
||||
for device, failures in model_result.items():
|
||||
|
||||
# Filter out None commits and extract just the test names
|
||||
test_names = [
|
||||
failure['test']
|
||||
for failure in failures
|
||||
if isinstance(failure, dict) and failure.get('commit') is not None
|
||||
]
|
||||
|
||||
# Add tests to model lines
|
||||
for idx, test_name in enumerate(test_names):
|
||||
if idx == 0:
|
||||
job_link = failures[idx]['job_link']
|
||||
model_lines.append(f"- [{model}]({job_link}):")
|
||||
|
||||
model_lines.append(f" {test_name}")
|
||||
|
||||
# Only add model section if it has tests
|
||||
if len(model_lines) > 0:
|
||||
lines.extend(model_lines)
|
||||
lines.append("") # Empty line between models
|
||||
|
||||
return "\n".join(lines).strip()
|
||||
|
||||
# Load and filter reports
|
||||
model_report_str = os.environ.get('MODEL_REPORT', '{}')
|
||||
quant_report_str = os.environ.get('QUANT_REPORT', '{}')
|
||||
|
||||
model_report = json.loads(model_report_str) if model_report_str else {}
|
||||
quant_report = json.loads(quant_report_str) if quant_report_str else {}
|
||||
|
||||
formatted_model = filter_and_format_report(model_report)
|
||||
formatted_quant = filter_and_format_report(quant_report)
|
||||
|
||||
# Write to files
|
||||
with open('model_ci.txt', 'w') as f:
|
||||
f.write(formatted_model)
|
||||
if formatted_model:
|
||||
f.write('\n')
|
||||
|
||||
with open('quantization_ci.txt', 'w') as f:
|
||||
f.write(formatted_quant)
|
||||
if formatted_quant:
|
||||
f.write('\n')
|
||||
PYTHON_SCRIPT
|
||||
|
||||
- name: Post results as PR comment
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
run: |
|
||||
{
|
||||
echo '## CI Results'
|
||||
echo "[Workflow Run ⚙️]($GITHUB_RUN_URL)"
|
||||
echo ''
|
||||
|
||||
# Check if both jobs were skipped or cancelled
|
||||
if [[ "${{ needs.model-ci.result }}" == "skipped" || "${{ needs.model-ci.result }}" == "cancelled" ]] && \
|
||||
[[ "${{ needs.quantization-ci.result }}" == "skipped" || "${{ needs.quantization-ci.result }}" == "cancelled" ]]; then
|
||||
echo '⚠️ No test being reported (jobs are skipped or cancelled)!'
|
||||
echo "STATUS=error" >> $GITHUB_ENV
|
||||
|
||||
# Check if either file has content
|
||||
elif [ -s model_ci.txt ] || [ -s quantization_ci.txt ]; then
|
||||
echo "STATUS=failure" >> $GITHUB_ENV
|
||||
|
||||
# Check if model_ci.txt has content
|
||||
if [ -s model_ci.txt ]; then
|
||||
echo '### Model CI Report'
|
||||
echo ''
|
||||
echo '#### ❌ Failed tests'
|
||||
echo ''
|
||||
cat model_ci.txt
|
||||
echo ''
|
||||
fi
|
||||
|
||||
# Check if quantization_ci.txt has content
|
||||
if [ -s quantization_ci.txt ]; then
|
||||
echo '### Quantization CI Report'
|
||||
echo ''
|
||||
echo '#### ❌ Failed tests'
|
||||
echo ''
|
||||
cat quantization_ci.txt
|
||||
echo ''
|
||||
fi
|
||||
else
|
||||
echo "STATUS=success" >> $GITHUB_ENV
|
||||
echo '✅ No failing test specific to this PR 🎉 !'
|
||||
fi
|
||||
} > comment_body.txt
|
||||
|
||||
gh api \
|
||||
--method POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
|
||||
-F body=@comment_body.txt
|
||||
echo "${{ needs.run_models_gpu.result }}"
|
||||
echo "${{ needs.run_quantization_torch_gpu.result }}"
|
||||
echo $STATUS_OK
|
||||
if [ "$STATUS_OK" = "true" ]; then
|
||||
echo "STATUS=success" >> $GITHUB_ENV
|
||||
else
|
||||
echo "STATUS=failure" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
- name: Update PR commit statuses
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
run: |
|
||||
echo "${{ needs.run_models_gpu.result }}"
|
||||
echo "${{ env.STATUS }}"
|
||||
gh api \
|
||||
--method POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
repos/${{ github.repository }}/statuses/${{ needs.check-timestamps.outputs.PR_HEAD_SHA }} \
|
||||
repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
|
||||
-f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Slow CI job" -f "context=pytest/custom-tests"
|
||||
|
||||
1
.github/workflows/self-nightly-caller.yml
vendored
1
.github/workflows/self-nightly-caller.yml
vendored
@ -51,7 +51,6 @@ jobs:
|
||||
slack_report_channel: "#transformers-ci-past-future"
|
||||
docker: huggingface/transformers-all-latest-torch-nightly-gpu
|
||||
ci_event: Nightly CI
|
||||
runner_type: "a10"
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly
|
||||
commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
25
.github/workflows/self-push-amd-mi210-caller.yml
vendored
Normal file
25
.github/workflows/self-push-amd-mi210-caller.yml
vendored
Normal file
@ -0,0 +1,25 @@
|
||||
name: Self-hosted runner (AMD mi210 CI caller)
|
||||
|
||||
on:
|
||||
#workflow_run:
|
||||
# workflows: ["Self-hosted runner (push-caller)"]
|
||||
# branches: ["main"]
|
||||
# types: [completed]
|
||||
push:
|
||||
branches:
|
||||
- run_amd_push_ci_caller*
|
||||
paths:
|
||||
- "src/**"
|
||||
- "tests/**"
|
||||
- ".github/**"
|
||||
- "templates/**"
|
||||
- "utils/**"
|
||||
|
||||
jobs:
|
||||
run_amd_ci:
|
||||
name: AMD mi210
|
||||
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
|
||||
uses: ./.github/workflows/self-push-amd.yml
|
||||
with:
|
||||
gpu_flavor: mi210
|
||||
secrets: inherit
|
||||
25
.github/workflows/self-push-amd-mi250-caller.yml
vendored
Normal file
25
.github/workflows/self-push-amd-mi250-caller.yml
vendored
Normal file
@ -0,0 +1,25 @@
|
||||
name: Self-hosted runner (AMD mi250 CI caller)
|
||||
|
||||
on:
|
||||
#workflow_run:
|
||||
# workflows: ["Self-hosted runner (push-caller)"]
|
||||
# branches: ["main"]
|
||||
# types: [completed]
|
||||
push:
|
||||
branches:
|
||||
- run_amd_push_ci_caller*
|
||||
paths:
|
||||
- "src/**"
|
||||
- "tests/**"
|
||||
- ".github/**"
|
||||
- "templates/**"
|
||||
- "utils/**"
|
||||
|
||||
jobs:
|
||||
run_amd_ci:
|
||||
name: AMD mi250
|
||||
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
|
||||
uses: ./.github/workflows/self-push-amd.yml
|
||||
with:
|
||||
gpu_flavor: mi250
|
||||
secrets: inherit
|
||||
334
.github/workflows/self-push-amd.yml
vendored
Normal file
334
.github/workflows/self-push-amd.yml
vendored
Normal file
@ -0,0 +1,334 @@
|
||||
name: Self-hosted runner AMD GPU (push)
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
gpu_flavor:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
PYTEST_TIMEOUT: 60
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
|
||||
jobs:
|
||||
check_runner_status:
|
||||
name: Check Runner Status
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check Runner Status
|
||||
run: python utils/check_self_hosted_runner.py --target_runners amd-mi210-single-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
|
||||
check_runners:
|
||||
name: Check Runners
|
||||
needs: check_runner_status
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: ROCM-SMI
|
||||
run: |
|
||||
rocm-smi
|
||||
- name: ROCM-INFO
|
||||
run: |
|
||||
rocminfo | grep "Agent" -A 14
|
||||
- name: Show ROCR environment
|
||||
run: |
|
||||
echo "ROCR: $ROCR_VISIBLE_DEVICES"
|
||||
|
||||
setup_gpu:
|
||||
name: Setup
|
||||
needs: check_runners
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
test_map: ${{ steps.set-matrix.outputs.test_map }}
|
||||
env:
|
||||
# `CI_BRANCH_PUSH`: The branch name from the push event
|
||||
# `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event
|
||||
# `CI_SHA_PUSH`: The commit SHA from the push event
|
||||
# `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event
|
||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
|
||||
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
|
||||
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
|
||||
steps:
|
||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||
- name: Prepare custom environment variables
|
||||
shell: bash
|
||||
# `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty)
|
||||
# `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty)
|
||||
run: |
|
||||
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
|
||||
echo $CI_BRANCH_PUSH
|
||||
echo $CI_BRANCH_WORKFLOW_RUN
|
||||
echo $CI_SHA_PUSH
|
||||
echo $CI_SHA_WORKFLOW_RUN
|
||||
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
|
||||
- name: Update clone using environment variables
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
echo "original branch = $(git branch --show-current)"
|
||||
git fetch && git checkout ${{ env.CI_BRANCH }}
|
||||
echo "updated branch = $(git branch --show-current)"
|
||||
git checkout ${{ env.CI_SHA }}
|
||||
echo "log = $(git log -n 1)"
|
||||
|
||||
- name: Cleanup
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
rm -rf tests/__pycache__
|
||||
rm -rf tests/models/__pycache__
|
||||
rm -rf reports
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Fetch the tests to run
|
||||
working-directory: /transformers
|
||||
# TODO: add `git-python` in the docker images
|
||||
run: |
|
||||
pip install --upgrade git-python
|
||||
python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
|
||||
|
||||
- name: Report fetched tests
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: test_fetched
|
||||
path: /transformers/test_preparation.txt
|
||||
|
||||
- id: set-matrix
|
||||
name: Organize tests into models
|
||||
working-directory: /transformers
|
||||
# The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc.
|
||||
# The `test_map` is used to get the actual identified test files under each key.
|
||||
# If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail)
|
||||
run: |
|
||||
if [ -f test_map.json ]; then
|
||||
keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)')
|
||||
test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)')
|
||||
else
|
||||
keys=$(python3 -c 'keys = ["dummy"]; print(keys)')
|
||||
test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)')
|
||||
fi
|
||||
echo $keys
|
||||
echo $test_map
|
||||
echo "matrix=$keys" >> $GITHUB_OUTPUT
|
||||
echo "test_map=$test_map" >> $GITHUB_OUTPUT
|
||||
|
||||
run_models_gpu:
|
||||
name: Model tests
|
||||
needs: setup_gpu
|
||||
# `dummy` means there is no test to run
|
||||
if: contains(fromJson(needs.setup_gpu.outputs.matrix), 'dummy') != true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
env:
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
|
||||
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
|
||||
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
|
||||
steps:
|
||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||
- name: Prepare custom environment variables
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
|
||||
echo $CI_BRANCH_PUSH
|
||||
echo $CI_BRANCH_WORKFLOW_RUN
|
||||
echo $CI_SHA_PUSH
|
||||
echo $CI_SHA_WORKFLOW_RUN
|
||||
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
|
||||
- name: Update clone using environment variables
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
echo "original branch = $(git branch --show-current)"
|
||||
git fetch && git checkout ${{ env.CI_BRANCH }}
|
||||
echo "updated branch = $(git branch --show-current)"
|
||||
git checkout ${{ env.CI_SHA }}
|
||||
echo "log = $(git log -n 1)"
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
echo "${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: ROCM-SMI
|
||||
run: |
|
||||
rocm-smi
|
||||
- name: ROCM-INFO
|
||||
run: |
|
||||
rocminfo | grep "Agent" -A 14
|
||||
- name: Show ROCR environment
|
||||
run: |
|
||||
echo "ROCR: $ROCR_VISIBLE_DEVICES"
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all non-slow selected tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
||||
|
||||
send_results:
|
||||
name: Send results to webhook
|
||||
runs-on: ubuntu-22.04
|
||||
if: always()
|
||||
needs: [
|
||||
check_runner_status,
|
||||
check_runners,
|
||||
setup_gpu,
|
||||
run_models_gpu,
|
||||
# run_tests_torch_cuda_extensions_single_gpu,
|
||||
# run_tests_torch_cuda_extensions_multi_gpu
|
||||
]
|
||||
env:
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
|
||||
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
|
||||
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
|
||||
steps:
|
||||
- name: Preliminary job status
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
echo "Runner availability: ${{ needs.check_runner_status.result }}"
|
||||
echo "Setup status: ${{ needs.setup_gpu.result }}"
|
||||
echo "Runner status: ${{ needs.check_runners.result }}"
|
||||
|
||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||
- name: Prepare custom environment variables
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
|
||||
echo $CI_BRANCH_PUSH
|
||||
echo $CI_BRANCH_WORKFLOW_RUN
|
||||
echo $CI_SHA_PUSH
|
||||
echo $CI_SHA_WORKFLOW_RUN
|
||||
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
# To avoid failure when multiple commits are merged into `main` in a short period of time.
|
||||
# Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ...
|
||||
# (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit)
|
||||
with:
|
||||
fetch-depth: 20
|
||||
|
||||
- name: Update clone using environment variables
|
||||
run: |
|
||||
echo "original branch = $(git branch --show-current)"
|
||||
git fetch && git checkout ${{ env.CI_BRANCH }}
|
||||
echo "updated branch = $(git branch --show-current)"
|
||||
git checkout ${{ env.CI_SHA }}
|
||||
echo "log = $(git log -n 1)"
|
||||
|
||||
- uses: actions/download-artifact@v4
|
||||
- name: Send message to Slack
|
||||
env:
|
||||
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
||||
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
||||
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_SLACK_CHANNEL_ID_AMD: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
|
||||
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }}
|
||||
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
|
||||
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
|
||||
CI_SHA: ${{ env.CI_SHA }}
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
||||
SETUP_STATUS: ${{ needs.setup_gpu.result }}
|
||||
|
||||
# We pass `needs.setup_gpu.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
pip install huggingface_hub
|
||||
pip install slack_sdk
|
||||
pip show slack_sdk
|
||||
python utils/notification_service.py "${{ needs.setup_gpu.outputs.matrix }}"
|
||||
54
.github/workflows/self-push-caller.yml
vendored
Normal file
54
.github/workflows/self-push-caller.yml
vendored
Normal file
@ -0,0 +1,54 @@
|
||||
# Used to trigger self-push CI
|
||||
name: Self-hosted runner (push-caller)
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/**"
|
||||
- "tests/**"
|
||||
- ".github/**"
|
||||
- "templates/**"
|
||||
- "utils/**"
|
||||
|
||||
jobs:
|
||||
check-for-setup:
|
||||
runs-on: ubuntu-22.04
|
||||
name: Check if setup was changed
|
||||
outputs:
|
||||
changed: ${{ steps.was_changed.outputs.changed }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: "2"
|
||||
|
||||
- name: Get changed files
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c
|
||||
|
||||
- name: Was setup changed
|
||||
id: was_changed
|
||||
run: |
|
||||
for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
|
||||
if [ `basename "${file}"` = "setup.py" ]; then
|
||||
echo "changed=1" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
done
|
||||
|
||||
build-docker-containers:
|
||||
needs: check-for-setup
|
||||
if: (github.event_name == 'push') && (needs.check-for-setup.outputs.changed == '1')
|
||||
uses: ./.github/workflows/build-docker-images.yml
|
||||
with:
|
||||
image_postfix: "-push-ci"
|
||||
secrets: inherit
|
||||
|
||||
run_push_ci:
|
||||
name: Trigger Push CI
|
||||
runs-on: ubuntu-22.04
|
||||
if: ${{ always() }}
|
||||
needs: build-docker-containers
|
||||
steps:
|
||||
- name: Trigger push CI via workflow_run
|
||||
run: echo "Trigger push CI via workflow_run"
|
||||
652
.github/workflows/self-push.yml
vendored
Normal file
652
.github/workflows/self-push.yml
vendored
Normal file
@ -0,0 +1,652 @@
|
||||
name: Self-hosted runner (push)
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows: ["Self-hosted runner (push-caller)"]
|
||||
branches: ["main"]
|
||||
types: [completed]
|
||||
push:
|
||||
branches:
|
||||
- ci_*
|
||||
- ci-*
|
||||
paths:
|
||||
- "src/**"
|
||||
- "tests/**"
|
||||
- ".github/**"
|
||||
- "templates/**"
|
||||
- "utils/**"
|
||||
repository_dispatch:
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
PYTEST_TIMEOUT: 60
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
name: Setup
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu-push-ci
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
test_map: ${{ steps.set-matrix.outputs.test_map }}
|
||||
env:
|
||||
# `CI_BRANCH_PUSH`: The branch name from the push event
|
||||
# `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event
|
||||
# `CI_SHA_PUSH`: The commit SHA from the push event
|
||||
# `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event
|
||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
|
||||
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
|
||||
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
|
||||
steps:
|
||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||
- name: Prepare custom environment variables
|
||||
shell: bash
|
||||
# `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty)
|
||||
# `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty)
|
||||
run: |
|
||||
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
|
||||
echo $CI_BRANCH_PUSH
|
||||
echo $CI_BRANCH_WORKFLOW_RUN
|
||||
echo $CI_SHA_PUSH
|
||||
echo $CI_SHA_WORKFLOW_RUN
|
||||
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
|
||||
- name: Update clone using environment variables
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
echo "original branch = $(git branch --show-current)"
|
||||
git fetch && git checkout ${{ env.CI_BRANCH }}
|
||||
echo "updated branch = $(git branch --show-current)"
|
||||
git checkout ${{ env.CI_SHA }}
|
||||
echo "log = $(git log -n 1)"
|
||||
|
||||
- name: Cleanup
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
rm -rf tests/__pycache__
|
||||
rm -rf tests/models/__pycache__
|
||||
rm -rf reports
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Fetch the tests to run
|
||||
working-directory: /transformers
|
||||
# TODO: add `git-python` in the docker images
|
||||
run: |
|
||||
pip install --upgrade git-python
|
||||
python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
|
||||
|
||||
- name: Report fetched tests
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: test_fetched
|
||||
path: /transformers/test_preparation.txt
|
||||
|
||||
- id: set-matrix
|
||||
name: Organize tests into models
|
||||
working-directory: /transformers
|
||||
# The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc.
|
||||
# The `test_map` is used to get the actual identified test files under each key.
|
||||
# If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail)
|
||||
run: |
|
||||
if [ -f test_map.json ]; then
|
||||
keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)')
|
||||
test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)')
|
||||
else
|
||||
keys=$(python3 -c 'keys = ["dummy"]; print(keys)')
|
||||
test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)')
|
||||
fi
|
||||
echo $keys
|
||||
echo $test_map
|
||||
echo "matrix=$keys" >> $GITHUB_OUTPUT
|
||||
echo "test_map=$test_map" >> $GITHUB_OUTPUT
|
||||
|
||||
run_tests_single_gpu:
|
||||
name: Model tests
|
||||
needs: setup
|
||||
# `dummy` means there is no test to run
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu-push-ci
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
env:
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
|
||||
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
|
||||
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
|
||||
steps:
|
||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||
- name: Prepare custom environment variables
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
|
||||
echo $CI_BRANCH_PUSH
|
||||
echo $CI_BRANCH_WORKFLOW_RUN
|
||||
echo $CI_SHA_PUSH
|
||||
echo $CI_SHA_WORKFLOW_RUN
|
||||
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: /transformers
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
|
||||
echo "$machine_type"
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone using environment variables
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
echo "original branch = $(git branch --show-current)"
|
||||
git fetch && git checkout ${{ env.CI_BRANCH }}
|
||||
echo "updated branch = $(git branch --show-current)"
|
||||
git checkout ${{ env.CI_SHA }}
|
||||
echo "log = $(git log -n 1)"
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all non-slow selected tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}
|
||||
|
||||
run_tests_multi_gpu:
|
||||
name: Model tests
|
||||
needs: setup
|
||||
# `dummy` means there is no test to run
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
||||
machine_type: [aws-g5-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu-push-ci
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
env:
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
|
||||
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
|
||||
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
|
||||
steps:
|
||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||
- name: Prepare custom environment variables
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
|
||||
echo $CI_BRANCH_PUSH
|
||||
echo $CI_BRANCH_WORKFLOW_RUN
|
||||
echo $CI_SHA_PUSH
|
||||
echo $CI_SHA_WORKFLOW_RUN
|
||||
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: /transformers
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
|
||||
echo "$machine_type"
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone using environment variables
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
echo "original branch = $(git branch --show-current)"
|
||||
git fetch && git checkout ${{ env.CI_BRANCH }}
|
||||
echo "updated branch = $(git branch --show-current)"
|
||||
git checkout ${{ env.CI_SHA }}
|
||||
echo "log = $(git log -n 1)"
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all non-slow selected tests on GPU
|
||||
env:
|
||||
MKL_SERVICE_FORCE_INTEL: 1
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}
|
||||
|
||||
run_tests_torch_cuda_extensions_single_gpu:
|
||||
name: Torch CUDA extension tests
|
||||
needs: setup
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
env:
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
|
||||
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
|
||||
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
|
||||
steps:
|
||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||
- name: Prepare custom environment variables
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
|
||||
echo $CI_BRANCH_PUSH
|
||||
echo $CI_BRANCH_WORKFLOW_RUN
|
||||
echo $CI_SHA_PUSH
|
||||
echo $CI_SHA_WORKFLOW_RUN
|
||||
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: /workspace/transformers
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
|
||||
echo "$machine_type"
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone using environment variables
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
echo "original branch = $(git branch --show-current)"
|
||||
git fetch && git checkout ${{ env.CI_BRANCH }}
|
||||
echo "updated branch = $(git branch --show-current)"
|
||||
git checkout ${{ env.CI_SHA }}
|
||||
echo "log = $(git log -n 1)"
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /workspace/transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: Remove cached torch extensions
|
||||
run: rm -rf /github/home/.cache/torch_extensions/
|
||||
|
||||
# To avoid unknown test failures
|
||||
- name: Pre build DeepSpeed *again*
|
||||
working-directory: /workspace
|
||||
run: |
|
||||
python3 -m pip uninstall -y deepspeed
|
||||
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /workspace/transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all non-slow selected tests on GPU
|
||||
working-directory: /workspace/transformers
|
||||
# TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
|
||||
run: |
|
||||
python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
|
||||
path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
|
||||
|
||||
run_tests_torch_cuda_extensions_multi_gpu:
|
||||
name: Torch CUDA extension tests
|
||||
needs: setup
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
env:
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
|
||||
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
|
||||
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
|
||||
steps:
|
||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||
- name: Prepare custom environment variables
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
|
||||
echo $CI_BRANCH_PUSH
|
||||
echo $CI_BRANCH_WORKFLOW_RUN
|
||||
echo $CI_SHA_PUSH
|
||||
echo $CI_SHA_WORKFLOW_RUN
|
||||
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: /workspace/transformers
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
|
||||
echo "$machine_type"
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone using environment variables
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
echo "original branch = $(git branch --show-current)"
|
||||
git fetch && git checkout ${{ env.CI_BRANCH }}
|
||||
echo "updated branch = $(git branch --show-current)"
|
||||
git checkout ${{ env.CI_SHA }}
|
||||
echo "log = $(git log -n 1)"
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /workspace/transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: Remove cached torch extensions
|
||||
run: rm -rf /github/home/.cache/torch_extensions/
|
||||
|
||||
# To avoid unknown test failures
|
||||
- name: Pre build DeepSpeed *again*
|
||||
working-directory: /workspace
|
||||
run: |
|
||||
python3 -m pip uninstall -y deepspeed
|
||||
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /workspace/transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all non-slow selected tests on GPU
|
||||
working-directory: /workspace/transformers
|
||||
# TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
|
||||
run: |
|
||||
python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
|
||||
path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
|
||||
|
||||
send_results:
|
||||
name: Send results to webhook
|
||||
runs-on: ubuntu-22.04
|
||||
if: always()
|
||||
needs: [
|
||||
setup,
|
||||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
run_tests_torch_cuda_extensions_single_gpu,
|
||||
run_tests_torch_cuda_extensions_multi_gpu
|
||||
]
|
||||
env:
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||
CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
|
||||
CI_SHA_PUSH: ${{ github.event.head_commit.id }}
|
||||
CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
|
||||
steps:
|
||||
- name: Preliminary job status
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
echo "Setup status: ${{ needs.setup.result }}"
|
||||
|
||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||
- name: Prepare custom environment variables
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
|
||||
echo $CI_BRANCH_PUSH
|
||||
echo $CI_BRANCH_WORKFLOW_RUN
|
||||
echo $CI_SHA_PUSH
|
||||
echo $CI_SHA_WORKFLOW_RUN
|
||||
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
|
||||
|
||||
- name: print environment variables
|
||||
run: |
|
||||
echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
|
||||
echo "env.CI_SHA = ${{ env.CI_SHA }}"
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
# To avoid failure when multiple commits are merged into `main` in a short period of time.
|
||||
# Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ...
|
||||
# (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit)
|
||||
with:
|
||||
fetch-depth: 20
|
||||
|
||||
- name: Update clone using environment variables
|
||||
run: |
|
||||
echo "original branch = $(git branch --show-current)"
|
||||
git fetch && git checkout ${{ env.CI_BRANCH }}
|
||||
echo "updated branch = $(git branch --show-current)"
|
||||
git checkout ${{ env.CI_SHA }}
|
||||
echo "log = $(git log -n 1)"
|
||||
|
||||
- uses: actions/download-artifact@v4
|
||||
- name: Send message to Slack
|
||||
env:
|
||||
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
||||
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
||||
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
||||
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
CI_EVENT: push
|
||||
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
|
||||
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
|
||||
CI_SHA: ${{ env.CI_SHA }}
|
||||
SETUP_STATUS: ${{ needs.setup.result }}
|
||||
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
pip install huggingface_hub
|
||||
pip install slack_sdk
|
||||
pip show slack_sdk
|
||||
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
|
||||
@ -21,7 +21,7 @@ jobs:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_group: hfc-amd-mi355
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
docker: huggingface/testing-rocm7.0-preview
|
||||
ci_event: Scheduled CI (AMD) - mi355
|
||||
report_repo_id: hf-transformers-bot/transformers-ci-dummy
|
||||
secrets: inherit
|
||||
@ -33,7 +33,7 @@ jobs:
|
||||
job: run_pipelines_torch_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_group: hfc-amd-mi355
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
docker: huggingface/testing-rocm7.0-preview
|
||||
ci_event: Scheduled CI (AMD) - mi355
|
||||
report_repo_id: hf-transformers-bot/transformers-ci-dummy
|
||||
secrets: inherit
|
||||
@ -45,7 +45,7 @@ jobs:
|
||||
job: run_examples_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
runner_group: hfc-amd-mi355
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
docker: huggingface/testing-rocm7.0-preview
|
||||
ci_event: Scheduled CI (AMD) - mi355
|
||||
report_repo_id: hf-transformers-bot/transformers-ci-dummy
|
||||
secrets: inherit
|
||||
|
||||
14
.github/workflows/self-scheduled-caller.yml
vendored
14
.github/workflows/self-scheduled-caller.yml
vendored
@ -63,7 +63,7 @@ jobs:
|
||||
with:
|
||||
job: run_pipelines_torch_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
docker: huggingface/transformers-pytorch-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
@ -118,15 +118,3 @@ jobs:
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
kernels-ci:
|
||||
name: Kernels CI
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_kernels_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-kernels"
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
@ -1,60 +0,0 @@
|
||||
name: Nvidia CI - Flash Attn
|
||||
|
||||
on:
|
||||
repository_dispatch:
|
||||
schedule:
|
||||
- cron: "17 2 * * *"
|
||||
push:
|
||||
branches:
|
||||
- run_nvidia_ci_flash_attn*
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
prev_workflow_run_id:
|
||||
description: 'previous workflow run id to compare'
|
||||
type: string
|
||||
required: false
|
||||
default: ""
|
||||
other_workflow_run_id:
|
||||
description: 'other workflow run id to compare'
|
||||
type: string
|
||||
required: false
|
||||
default: ""
|
||||
|
||||
|
||||
# Used for `push` to easily modify the target workflow runs to compare against
|
||||
env:
|
||||
prev_workflow_run_id: ""
|
||||
other_workflow_run_id: ""
|
||||
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
name: Setup
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Setup
|
||||
run: |
|
||||
mkdir "setup_values"
|
||||
echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
|
||||
echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: setup_values
|
||||
path: setup_values
|
||||
|
||||
|
||||
model-ci:
|
||||
name: Model CI
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#transformers-ci-flash-attn"
|
||||
docker: huggingface/transformers-all-latest-gpu:flash-attn
|
||||
ci_event: Daily CI
|
||||
runner_type: "a10"
|
||||
report_repo_id: hf-internal-testing/transformers_flash_attn_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
pytest_marker: "flash_attn_test or flash_attn_3_test"
|
||||
secrets: inherit
|
||||
98
.github/workflows/self-scheduled.yml
vendored
98
.github/workflows/self-scheduled.yml
vendored
@ -34,20 +34,10 @@ on:
|
||||
runner_type:
|
||||
required: false
|
||||
type: string
|
||||
subdirs:
|
||||
models:
|
||||
default: ""
|
||||
required: false
|
||||
type: string
|
||||
pytest_marker:
|
||||
required: false
|
||||
type: string
|
||||
pr_number:
|
||||
required: false
|
||||
type: string
|
||||
outputs:
|
||||
report:
|
||||
description: "Content of the report of new failures"
|
||||
value: ${{ jobs.check_new_failures.outputs.report }}
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
@ -82,7 +72,6 @@ jobs:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
git fetch origin ${{ inputs.commit_sha || github.sha }}
|
||||
git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
- name: Cleanup
|
||||
@ -102,10 +91,8 @@ jobs:
|
||||
working-directory: /transformers/tests
|
||||
run: |
|
||||
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
||||
python3 ../utils/split_model_tests.py --subdirs '${{ inputs.subdirs }}' --num_splits ${{ env.NUM_SLICES }} > folder_slices.txt
|
||||
echo "folder_slices=$(cat folder_slices.txt)" >> $GITHUB_OUTPUT
|
||||
python3 -c "import ast; folder_slices = ast.literal_eval(open('folder_slices.txt').read()); open('slice_ids.txt', 'w').write(str(list(range(len(folder_slices)))))"
|
||||
echo "slice_ids=$(cat slice_ids.txt)" >> $GITHUB_OUTPUT
|
||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --models '${{ inputs.models }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
||||
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
||||
@ -116,7 +103,7 @@ jobs:
|
||||
name: Identify quantization method to test
|
||||
working-directory: /transformers/tests
|
||||
run: |
|
||||
echo "quantization_matrix=$(python3 -c 'import ast; import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); subdirs = ast.literal_eval(${{ inputs.subdirs || '"None"' }}); quantization_tests = [x.removeprefix("quantization/") for x in subdirs] if subdirs is not None else quantization_tests; d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
|
||||
echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
@ -140,7 +127,6 @@ jobs:
|
||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
runner_type: ${{ inputs.runner_type }}
|
||||
report_repo_id: ${{ inputs.report_repo_id }}
|
||||
pytest_marker: ${{ inputs.pytest_marker }}
|
||||
secrets: inherit
|
||||
|
||||
run_trainer_and_fsdp_gpu:
|
||||
@ -174,7 +160,7 @@ jobs:
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
image: huggingface/transformers-pytorch-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Update clone
|
||||
@ -338,7 +324,7 @@ jobs:
|
||||
working-directory: ${{ inputs.working-directory-prefix }}/
|
||||
run: |
|
||||
python3 -m pip uninstall -y deepspeed
|
||||
DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check
|
||||
DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
|
||||
|
||||
# To avoid unknown test failures
|
||||
- name: Pre build DeepSpeed *again* (for nightly & Past CI)
|
||||
@ -348,7 +334,7 @@ jobs:
|
||||
python3 -m pip uninstall -y deepspeed
|
||||
rm -rf DeepSpeed
|
||||
git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build
|
||||
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check
|
||||
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
@ -477,70 +463,6 @@ jobs:
|
||||
name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
|
||||
|
||||
run_kernels_gpu:
|
||||
if: ${{ inputs.job == 'run_kernels_gpu' }}
|
||||
name: Kernel tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: ${{ inputs.docker }}
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[testing]
|
||||
|
||||
- name: Install kernels
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip install -U kernels
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: /transformers
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run kernel tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_kernels_gpu_test_reports tests/kernels/test_kernels.py
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ env.machine_type }}_run_kernels_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_kernels_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_kernels_gpu_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_run_kernels_gpu_test_reports
|
||||
|
||||
run_extract_warnings:
|
||||
# Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic.
|
||||
if: ${{ always() && inputs.job == 'run_models_gpu' }}
|
||||
@ -593,7 +515,6 @@ jobs:
|
||||
run_examples_gpu,
|
||||
run_torch_cuda_extensions_gpu,
|
||||
run_quantization_torch_gpu,
|
||||
run_kernels_gpu,
|
||||
run_extract_warnings
|
||||
]
|
||||
if: always() && !cancelled()
|
||||
@ -613,17 +534,16 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
check_new_failures:
|
||||
if: ${{ always() && needs.send_results.result == 'success' }}
|
||||
if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }}
|
||||
name: Check new failures
|
||||
needs: send_results
|
||||
uses: ./.github/workflows/check_failed_tests.yml
|
||||
with:
|
||||
docker: ${{ inputs.docker }}
|
||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
start_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
job: ${{ inputs.job }}
|
||||
slack_report_channel: ${{ inputs.slack_report_channel }}
|
||||
ci_event: ${{ inputs.ci_event }}
|
||||
report_repo_id: ${{ inputs.report_repo_id }}
|
||||
pr_number: ${{ inputs.pr_number }}
|
||||
|
||||
secrets: inherit
|
||||
|
||||
16
.github/workflows/ssh-runner.yml
vendored
16
.github/workflows/ssh-runner.yml
vendored
@ -4,7 +4,7 @@ on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
runner_type:
|
||||
description: 'Type of runner to test (a10)'
|
||||
description: 'Type of runner to test (a10 or t4)'
|
||||
required: true
|
||||
docker_image:
|
||||
description: 'Name of the Docker image'
|
||||
@ -36,10 +36,14 @@ jobs:
|
||||
NUM_GPUS: ${{ github.event.inputs.num_gpus }}
|
||||
RUNNER_TYPE: ${{ github.event.inputs.runner_type }}
|
||||
run: |
|
||||
if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
|
||||
echo "RUNNER=aws-g5-4xlarge-cache-ssh" >> $GITHUB_ENV
|
||||
if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "t4" ]]; then
|
||||
echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
|
||||
elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "t4" ]]; then
|
||||
echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
|
||||
elif [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
|
||||
echo "RUNNER=aws-g5-4xlarge-cache" >> $GITHUB_ENV
|
||||
elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "a10" ]]; then
|
||||
echo "RUNNER=aws-g5-12xlarge-cache-ssh" >> $GITHUB_ENV
|
||||
echo "RUNNER=aws-g5-12xlarge-cache" >> $GITHUB_ENV
|
||||
else
|
||||
echo "RUNNER=" >> $GITHUB_ENV
|
||||
fi
|
||||
@ -57,6 +61,8 @@ jobs:
|
||||
group: ${{ needs.get_runner.outputs.RUNNER }}
|
||||
container:
|
||||
image: ${{ github.event.inputs.docker_image }}
|
||||
options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@ -100,7 +106,7 @@ jobs:
|
||||
else
|
||||
echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
|
||||
- name: Tailscale # In order to be able to SSH when a test fails
|
||||
uses: huggingface/tailscale-action@main
|
||||
with:
|
||||
|
||||
@ -14,7 +14,7 @@ This AGENTS.md file provides guidance for code agents working with this codebase
|
||||
|
||||
- PRs should be as brief as possible. Bugfix PRs in particular can often be only one or two lines long, and do not need large comments, docstrings or new functions in this case. Aim to minimize the size of the diff.
|
||||
- When writing tests, they should be added to an existing file. The only exception is for PRs to add a new model, when a new test directory should be created for that model.
|
||||
- Code style is enforced in the CI. You can install the style tools with `pip install -e ".[quality]"`. You can then run `make fixup` to apply style and consistency fixes to your code.
|
||||
- Code style is enforced in the CI. You can install the style tools with `pip install -e .[quality]`. You can then run `make fixup` to apply style and consistency fixes to your code.
|
||||
|
||||
## Copying and inheritance
|
||||
|
||||
@ -36,4 +36,4 @@ After making changes, you should usually run `make fixup` to ensure any copies a
|
||||
the model you made the changes in and any other models that were updated by `make fixup`. Tests can be run with `pytest tests/models/[name]/test_modeling_[name].py`
|
||||
If your changes affect code in other classes like tokenizers or processors, you should run those tests instead, like `test_processing_[name].py` or `test_tokenization_[name].py`.
|
||||
|
||||
In order to run tests, you may need to install dependencies. You can do this with `pip install -e ".[testing]"`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.
|
||||
In order to run tests, you may need to install dependencies. You can do this with `pip install -e .[testing]`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.
|
||||
120
CONTRIBUTING.md
120
CONTRIBUTING.md
@ -112,125 +112,7 @@ New models are constantly released and if you want to implement a new model, ple
|
||||
|
||||
If you are willing to contribute the model yourself, let us know so we can help you add it to 🤗 Transformers!
|
||||
|
||||
We have a technical guide for [how to add a model to 🤗 Transformers](https://huggingface.co/docs/transformers/modular_transformers).
|
||||
|
||||
### Vision-Language Model Contribution Checklist
|
||||
|
||||
If you're contributing a **vision-language model** (or any multimodal model that processes images/videos), please follow this checklist. Maintainers will use this to review your PR, and completing these steps will significantly increase the likelihood of your PR being merged quickly.
|
||||
|
||||
**Required checklist for all vision-language model contributions:**
|
||||
|
||||
☐ **1. Implement a modular file**
|
||||
|
||||
All new models should use the modular architecture pattern. Create a `modular_<model_name>.py` file using the modular model converter:
|
||||
|
||||
- Use the CLI, [`transformers add-new-model-like`](https://github.com/huggingface/transformers/blob/main/src/transformers/cli/add_new_model_like.py) to generate a modular skeleton and get started
|
||||
- All code should be in the modular file if possible. Modeling must be in it, it's better if configuration is in it as well.
|
||||
- Reuse existing patterns from similar models as much as possible
|
||||
|
||||
To verify your modular file is correct, run:
|
||||
|
||||
```bash
|
||||
python utils/modular_model_converter.py <model_name>
|
||||
```
|
||||
|
||||
This will generate the separate files (`modeling_*.py`, `configuration_*.py`, etc.) from your modular file. The CI will enforce that these generated files match your modular file.
|
||||
|
||||
☐ **2. Add a fast image processor (for image models)**
|
||||
|
||||
If your model processes images, implement a fast image processor that uses `torch` and `torchvision` instead of PIL/numpy for better inference performance:
|
||||
|
||||
- See the detailed guide in [#36978](https://github.com/huggingface/transformers/issues/36978)
|
||||
- Fast processors inherit from `BaseImageProcessorFast`
|
||||
- Examples: `LlavaOnevisionImageProcessorFast`, `Idefics2ImageProcessorFast`
|
||||
|
||||
☐ **3. Create a weight conversion script**
|
||||
|
||||
Add a `convert_<model_name>_to_hf.py` script that converts the original model weights to the HuggingFace format:
|
||||
|
||||
- Script should handle checkpoint loading, key mapping, and saving in HF format
|
||||
- Include usage examples and documentation in the script
|
||||
- Examples: [`convert_llava_onevision_weights_to_hf.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py), [`convert_idefics2_weights_to_hf.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py)
|
||||
|
||||
☐ **4. Add integration tests with exact output matching**
|
||||
|
||||
At minimum, add an `IntegrationTest` class that tests end-to-end generation (processing and modelling) with **exact** output matching:
|
||||
|
||||
- For generative models: test that generated text matches expected output exactly
|
||||
- For non-generative models: test that output logits match expected values
|
||||
- Tests should use real checkpoints (load in 4-bit or half precision if the checkpoint is too big to fit in our CI runners) and real inputs
|
||||
- Example pattern:
|
||||
|
||||
```python
|
||||
class MyModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_model_integration(self):
|
||||
model = MyModelForConditionalGeneration.from_pretrained("org/model-name")
|
||||
processor = AutoProcessor.from_pretrained("org/model-name")
|
||||
|
||||
inputs = processor(images=image, text=prompt, return_tensors="pt")
|
||||
output = model.generate(**inputs, max_new_tokens=20)
|
||||
|
||||
EXPECTED_TEXT = "exact expected output"
|
||||
self.assertEqual(processor.decode(output[0]), EXPECTED_TEXT)
|
||||
```
|
||||
|
||||
See `tests/models/llava_onevision/test_modeling_llava_onevision.py` for complete examples.
|
||||
|
||||
☐ **5. Update documentation**
|
||||
|
||||
Add or update model documentation:
|
||||
|
||||
- Create if the cli hasn't `docs/source/en/model_doc/<model_name>.md` with usage examples
|
||||
- Include model description, paper link, and basic usage with `Pipeline` and `AutoModel`
|
||||
- Add the model to the appropriate TOC files
|
||||
|
||||
☐ **6. Look for reusable patterns**
|
||||
|
||||
The library has 400+ models with many established patterns:
|
||||
|
||||
- Search for similar models (e.g., other vision-language models)
|
||||
- Reuse attention mechanisms, layer implementations, and processing patterns
|
||||
- Check models like LLaVA, Idefics2, Fuyu for vision-language patterns
|
||||
- Use provided decorators like (`auto_docstring`, `can_return_tuple`, `check_model_inputs` and `_can_record_outputs`) where relevant.
|
||||
- Don't reinvent the wheel
|
||||
|
||||
☐ **7. Run quality checks and read the output**
|
||||
|
||||
Before submitting your PR, install quality dependencies and run the full check suite:
|
||||
|
||||
```bash
|
||||
pip install -e ".[quality]"
|
||||
make fixup
|
||||
```
|
||||
|
||||
**Important**: Take time to read the output of `make fixup`. It will:
|
||||
- Lint and format your code automatically
|
||||
- Run consistency checks (imports, docstrings, etc.)
|
||||
- Show any remaining issues that need manual fixes
|
||||
|
||||
All checks must pass before your PR can be merged.
|
||||
|
||||
**If this checklist is complete, your PR has a very high likelihood of being merged!** Following these steps makes the maintainers' work much easier and will reduce the number of review iterations, getting your important work out there faster.
|
||||
|
||||
#### Copy-pastable checklist for maintainers
|
||||
|
||||
Here's a condensed version maintainers can copy into PRs:
|
||||
|
||||
```markdown
|
||||
## Multimodal Model Addition Checklist
|
||||
|
||||
Please ensure your PR completes all following items. See the [full checklist](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#vision-language-model-contribution-checklist) for details.
|
||||
|
||||
- [ ] **Modular file**: `modular_<model_name>.py` implemented and verified with `python utils/modular_model_converter.py <model_name>`
|
||||
- [ ] **Fast image processor**: Implemented using `BaseImageProcessorFast` (see [#36978](https://github.com/huggingface/transformers/issues/36978))
|
||||
- [ ] **Conversion script**: `convert_<model_name>_to_hf.py` added with usage examples
|
||||
- [ ] **Integration tests**: End-to-end tests with exact output matching (text or logits)
|
||||
- [ ] **Documentation**: Model docs added/updated in `docs/source/en/model_doc/`
|
||||
- [ ] **Pattern reuse**: Verified against similar models (LLaVA, Idefics2, etc.)
|
||||
- [ ] **Quality checks**: `make fixup` passes with no errors
|
||||
|
||||
```
|
||||
We have a technical guide for [how to add a model to 🤗 Transformers](https://huggingface.co/docs/transformers/add_new_model).
|
||||
|
||||
## Do you want to add documentation?
|
||||
|
||||
|
||||
@ -64,8 +64,8 @@ limitations under the License.
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
|
||||
</h3>
|
||||
|
||||
Transformers acts as the model-definition framework for state-of-the-art machine learning with text, computer
|
||||
vision, audio, video, and multimodal models, for both inference and training.
|
||||
Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
|
||||
vision, audio, video, and multimodal model, for both inference and training.
|
||||
|
||||
It centralizes the model definition so that this definition is agreed upon across the ecosystem. `transformers` is the
|
||||
pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training
|
||||
|
||||
@ -9,12 +9,6 @@ In this list, we showcase incredibly impactful and novel projects that have push
|
||||
adding other projects to the list. If you believe a project should be here and it's not, then please, open a PR
|
||||
to add it.
|
||||
|
||||
## [◉ Universal Intelligence](https://github.com/blueraai/universal-intelligence)
|
||||
|
||||
[Universal Intelligence](https://github.com/blueraai/universal-intelligence) aims to standardize models, tools, and agents —transforming them into simple, composable, portable, interoperable, framework-agnostic, hardware-agnostic interfaces (through auto-negotiation and resource sharing); for fast and accessible development of AI applications.
|
||||
|
||||
Keywords: Protocol, Open-source, LLMs, Large Language Models, Agents, Low-code
|
||||
|
||||
## [gpt4all](https://github.com/nomic-ai/gpt4all)
|
||||
|
||||
[gpt4all](https://github.com/nomic-ai/gpt4all) is an ecosystem of open-source chatbots trained on massive collections of clean assistant data including code, stories and dialogue. It offers open-source, large language models such as LLaMA and GPT-J trained in an assistant-style.
|
||||
|
||||
@ -1,11 +1,8 @@
|
||||
import hashlib
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from transformers.utils.import_utils import is_flash_attn_2_available
|
||||
|
||||
|
||||
KERNELIZATION_AVAILABLE = False
|
||||
try:
|
||||
@ -21,22 +18,11 @@ logger = logging.getLogger(__name__)
|
||||
class BenchmarkConfig:
|
||||
"""Configuration for a single benchmark scenario."""
|
||||
|
||||
all_attn_implementations = [
|
||||
("flash_attention_2", None),
|
||||
("eager", None),
|
||||
("sdpa", "math"),
|
||||
("sdpa", "flash_attention"),
|
||||
("flex_attention", None),
|
||||
]
|
||||
|
||||
all_compiled_modes = [None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
warmup_iterations: int = 5,
|
||||
measurement_iterations: int = 20,
|
||||
gpu_monitoring: bool = True, # NOTE: you may want to disable this at times as we have obsvered it could heavily slow down benchmarks on AMD
|
||||
continuous_batching: bool = False,
|
||||
gpu_monitoring: bool = False, # False by default because it slows down the benchmark by a lot
|
||||
batch_size: int = 1,
|
||||
sequence_length: int = 128,
|
||||
num_tokens_to_generate: int = 128,
|
||||
@ -52,7 +38,6 @@ class BenchmarkConfig:
|
||||
self.warmup_iterations = warmup_iterations
|
||||
self.measurement_iterations = measurement_iterations
|
||||
self.gpu_monitoring = gpu_monitoring
|
||||
self.continuous_batching = continuous_batching
|
||||
# Input parameters
|
||||
self.batch_size = batch_size
|
||||
self.sequence_length = sequence_length
|
||||
@ -74,35 +59,12 @@ class BenchmarkConfig:
|
||||
def check_validity(self, skip_validity_check: bool = False) -> None:
|
||||
if skip_validity_check:
|
||||
return
|
||||
# Check FA is installed
|
||||
if self.attn_implementation == "flash_attention_2" and not is_flash_attn_2_available():
|
||||
logger.warning(
|
||||
"Flash attention does not support compile mode. Defaulting to SDPA w/ flash attention backend."
|
||||
)
|
||||
self.attn_implementation = "sdpa"
|
||||
self.sdpa_backend = "flash_attention"
|
||||
# Flash attention does not support compile mode, so we turn it off # FIXME: it would be better to support it
|
||||
is_fa = self.attn_implementation == "flash_attention_2"
|
||||
is_fa |= self.attn_implementation == "sdpa" and self.sdpa_backend == "flash_attention"
|
||||
if is_fa:
|
||||
logger.warning("Flash attention does not support compile mode. Turning off compile mode.")
|
||||
self.compile_mode = None
|
||||
# Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
|
||||
if self.attn_implementation == "sdpa" and self.sdpa_backend is None:
|
||||
default_backend = "flash_attention" # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
|
||||
logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
|
||||
self.sdpa_backend = default_backend
|
||||
if self.continuous_batching:
|
||||
if self.attn_implementation == "flex_attention":
|
||||
logger.error(
|
||||
"disabling continuous batching because of invalid configuration: flex attention is not supported"
|
||||
)
|
||||
self.continuous_batching = False
|
||||
elif self.attn_implementation == "sdpa" and self.sdpa_backend is not None:
|
||||
logger.warning(
|
||||
"when continuous batching is enabled, sdpa_backend must be None because of the attention mask, setting it to None"
|
||||
)
|
||||
self.sdpa_backend = "math"
|
||||
|
||||
@property
|
||||
def hash(self) -> str:
|
||||
@ -118,7 +80,6 @@ class BenchmarkConfig:
|
||||
attn_code += f"_{self.sdpa_backend}" if self.attn_implementation == "sdpa" else ""
|
||||
compile_str = f"compiled_{self.compile_mode}" if self.compile_mode is not None else "uncompiled"
|
||||
kernelize_str = "kernelized" if self.kernelize else "unkernelized"
|
||||
continuous_batching_str = "cb" if self.continuous_batching else "generate"
|
||||
sep = "-"
|
||||
else:
|
||||
iter_str = f"{self.warmup_iterations} warmup, {self.measurement_iterations} iterations"
|
||||
@ -128,11 +89,8 @@ class BenchmarkConfig:
|
||||
attn_code += f" with {self.sdpa_backend} backend" if self.attn_implementation == "sdpa" else ""
|
||||
compile_str = "compiled" if self.compile_mode is not None else "not compiled"
|
||||
kernelize_str = "kernelized" if self.kernelize else "not kernelized"
|
||||
continuous_batching_str = "continuous batching" if self.continuous_batching else "regular generate"
|
||||
sep = ", "
|
||||
return sep.join(
|
||||
[iter_str, gpu_monitor_str, dimensions_str, attn_code, compile_str, kernelize_str, continuous_batching_str]
|
||||
)
|
||||
return sep.join([iter_str, gpu_monitor_str, dimensions_str, attn_code, compile_str, kernelize_str])
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
@ -140,7 +98,6 @@ class BenchmarkConfig:
|
||||
"warmup_iterations": self.warmup_iterations,
|
||||
"measurement_iterations": self.measurement_iterations,
|
||||
"gpu_monitoring": self.gpu_monitoring,
|
||||
"continuous_batching": self.continuous_batching,
|
||||
"batch_size": self.batch_size,
|
||||
"sequence_length": self.sequence_length,
|
||||
"num_tokens_to_generate": self.num_tokens_to_generate,
|
||||
@ -157,7 +114,6 @@ class BenchmarkConfig:
|
||||
warmup_iterations=data.get("warmup_iterations", 5),
|
||||
measurement_iterations=data.get("measurement_iterations", 20),
|
||||
gpu_monitoring=data.get("gpu_monitoring", False),
|
||||
continuous_batching=data.get("continuous_batching", False),
|
||||
batch_size=data.get("batch_size", 1),
|
||||
sequence_length=data.get("sequence_length", 128),
|
||||
num_tokens_to_generate=data.get("num_tokens_to_generate", 128),
|
||||
@ -171,79 +127,89 @@ class BenchmarkConfig:
|
||||
)
|
||||
|
||||
|
||||
def adapt_configs(
|
||||
configs: list[BenchmarkConfig],
|
||||
warmup_iterations: int | list[int] = 5,
|
||||
measurement_iterations: int | list[int] = 20,
|
||||
batch_size: int | list[int] = 1,
|
||||
sequence_length: int | list[int] = 128,
|
||||
num_tokens_to_generate: int | list[int] = 128,
|
||||
gpu_monitoring: bool | list[bool] = True,
|
||||
def cross_generate_configs(
|
||||
attn_impl_and_sdpa_backend: list[tuple[str, str | None]],
|
||||
compiled_mode: list[str | None],
|
||||
kernelized: list[bool],
|
||||
warmup_iterations: int = 5,
|
||||
measurement_iterations: int = 20,
|
||||
batch_size: int = 1,
|
||||
sequence_length: int = 128,
|
||||
num_tokens_to_generate: int = 128,
|
||||
gpu_monitoring: bool = False, # this slows down the benchmark by a lot so we disable it by default
|
||||
) -> list[BenchmarkConfig]:
|
||||
parameters = (
|
||||
x if isinstance(x, list) else [x]
|
||||
for x in [
|
||||
warmup_iterations,
|
||||
measurement_iterations,
|
||||
batch_size,
|
||||
sequence_length,
|
||||
num_tokens_to_generate,
|
||||
gpu_monitoring,
|
||||
]
|
||||
)
|
||||
iterator = itertools.product(*parameters)
|
||||
|
||||
adapted_configs = []
|
||||
for warmup_iters, measurement_iters, bs, seqlen, ntok, monitor in iterator:
|
||||
for config in configs:
|
||||
config = config.to_dict()
|
||||
config["warmup_iterations"] = warmup_iters
|
||||
config["measurement_iterations"] = measurement_iters
|
||||
config["batch_size"] = bs
|
||||
config["sequence_length"] = seqlen
|
||||
config["num_tokens_to_generate"] = ntok
|
||||
config["gpu_monitoring"] = monitor
|
||||
adapted_configs.append(BenchmarkConfig.from_dict(config))
|
||||
return adapted_configs
|
||||
|
||||
|
||||
def get_config_by_level(level: int) -> list[BenchmarkConfig]:
|
||||
# Create kwargs common to all configs
|
||||
kwargs = {
|
||||
"warmup_iterations": warmup_iterations,
|
||||
"measurement_iterations": measurement_iterations,
|
||||
"batch_size": batch_size,
|
||||
"sequence_length": sequence_length,
|
||||
"num_tokens_to_generate": num_tokens_to_generate,
|
||||
"gpu_monitoring": gpu_monitoring,
|
||||
}
|
||||
# Cross-generate all combinations of attn_implementation, compiled_mode, and kernelized
|
||||
configs = []
|
||||
# Early return if level is greater than 3: we generate all combinations of configs, maybe even w/ all compile modes
|
||||
if level >= 3:
|
||||
for attn_implementation, sdpa_backend in BenchmarkConfig.all_attn_implementations:
|
||||
# Usually there is not much to gain by compiling with other modes, but we allow it for level 4
|
||||
compile_modes = BenchmarkConfig.all_compiled_modes if level >= 4 else [None, "default"]
|
||||
for cm in compile_modes:
|
||||
for kernelize_on in {False, KERNELIZATION_AVAILABLE}:
|
||||
for cb_on in [False, True]:
|
||||
configs.append(
|
||||
BenchmarkConfig(
|
||||
attn_implementation=attn_implementation,
|
||||
sdpa_backend=sdpa_backend,
|
||||
compile_mode=cm,
|
||||
kernelize=kernelize_on,
|
||||
continuous_batching=cb_on,
|
||||
)
|
||||
)
|
||||
return configs
|
||||
# Otherwise, we add the configs for the given level
|
||||
if level >= 0:
|
||||
configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default"))
|
||||
if level >= 1:
|
||||
configs.append(BenchmarkConfig(attn_implementation="flash_attention_2"))
|
||||
configs.append(BenchmarkConfig(attn_implementation="eager", compile_mode="default"))
|
||||
configs.append(
|
||||
BenchmarkConfig(attn_implementation="paged|sdpa", compile_mode="default", continuous_batching=True)
|
||||
)
|
||||
if level >= 2:
|
||||
configs.append(BenchmarkConfig(attn_implementation="sdpa", compile_mode="default"))
|
||||
configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", kernelize=True))
|
||||
configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", kernelize=True))
|
||||
configs.append(
|
||||
BenchmarkConfig(attn_implementation="paged|sdpa", compile_mode="default", continuous_batching=True)
|
||||
)
|
||||
configs.append(
|
||||
BenchmarkConfig(attn_implementation="flash_attention_2", kernelize=True, continuous_batching=True)
|
||||
)
|
||||
for attn_implementation, sdpa_backend in list(dict.fromkeys(attn_impl_and_sdpa_backend)):
|
||||
for cm in list(dict.fromkeys(compiled_mode)):
|
||||
for kernelize_on in list(dict.fromkeys(kernelized)):
|
||||
config = BenchmarkConfig(
|
||||
attn_implementation=attn_implementation,
|
||||
sdpa_backend=sdpa_backend,
|
||||
compile_mode=cm,
|
||||
kernelize=kernelize_on,
|
||||
**kwargs,
|
||||
)
|
||||
configs.append(config)
|
||||
return configs
|
||||
|
||||
|
||||
def generate_all_configs(
|
||||
warmup_iterations: int = 5,
|
||||
measurement_iterations: int = 20,
|
||||
batch_size: int = 1,
|
||||
sequence_length: int = 128,
|
||||
num_tokens_to_generate: int = 128,
|
||||
gpu_monitoring: bool = False,
|
||||
) -> list[BenchmarkConfig]:
|
||||
all_attn_implementations = [
|
||||
("flash_attention_2", None),
|
||||
("eager", None),
|
||||
("sdpa", "math"),
|
||||
("sdpa", "flash_attention"),
|
||||
("flex_attention", None),
|
||||
]
|
||||
return cross_generate_configs(
|
||||
attn_impl_and_sdpa_backend=all_attn_implementations,
|
||||
compiled_mode=[None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"],
|
||||
kernelized=[False, KERNELIZATION_AVAILABLE],
|
||||
warmup_iterations=warmup_iterations,
|
||||
measurement_iterations=measurement_iterations,
|
||||
batch_size=batch_size,
|
||||
sequence_length=sequence_length,
|
||||
num_tokens_to_generate=num_tokens_to_generate,
|
||||
gpu_monitoring=gpu_monitoring,
|
||||
)
|
||||
|
||||
|
||||
def generate_main_configs(
|
||||
warmup_iterations: int = 5,
|
||||
measurement_iterations: int = 20,
|
||||
batch_size: int = 1,
|
||||
sequence_length: int = 128,
|
||||
num_tokens_to_generate: int = 128,
|
||||
gpu_monitoring: bool = False,
|
||||
) -> list[BenchmarkConfig]:
|
||||
# Create kwargs common to all configs
|
||||
kwargs = {
|
||||
"warmup_iterations": warmup_iterations,
|
||||
"measurement_iterations": measurement_iterations,
|
||||
"batch_size": batch_size,
|
||||
"sequence_length": sequence_length,
|
||||
"num_tokens_to_generate": num_tokens_to_generate,
|
||||
"gpu_monitoring": gpu_monitoring,
|
||||
}
|
||||
return [ # TODO: test max-autotune instead of default
|
||||
BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", **kwargs),
|
||||
BenchmarkConfig(attn_implementation="eager", compile_mode="default", **kwargs),
|
||||
BenchmarkConfig(attn_implementation="flash_attention_2", **kwargs),
|
||||
]
|
||||
|
||||
@ -4,7 +4,6 @@ import logging
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
from contextlib import nullcontext
|
||||
from datetime import datetime
|
||||
@ -12,8 +11,6 @@ from queue import Queue
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
from datasets import Dataset
|
||||
from huggingface_hub import HfApi
|
||||
from tqdm import trange
|
||||
|
||||
from transformers import (
|
||||
@ -53,8 +50,6 @@ DEFAULT_PROMPT = "\n".join([
|
||||
"Its instability ended in the coup of 18 Brumaire and the establishment of the Consulate, with Napoleon Bonaparte as First Consul.",
|
||||
]) # fmt: skip
|
||||
|
||||
PUSH_TO_HUB_TOKEN = os.getenv("PUSH_TO_HUB_TOKEN", None)
|
||||
|
||||
|
||||
def compact_json_numeric_arrays(data: dict):
|
||||
# Match arrays that contain only numbers (ints/floats), whitespace, commas, and newlines
|
||||
@ -125,19 +120,15 @@ def flush_memory():
|
||||
|
||||
class BenchmarkStreamer(BaseStreamer):
|
||||
def __init__(self, **kwargs) -> None:
|
||||
self.timeout = kwargs.pop("timeout", 10)
|
||||
self.timestamps = []
|
||||
self.text_queue = Queue()
|
||||
self.stop_signal = None
|
||||
|
||||
def put(self, value):
|
||||
"""Receives tokens and logs the timestamp of the generation."""
|
||||
self.timestamps.append(time.perf_counter())
|
||||
self.text_queue.put(value)
|
||||
|
||||
def end(self):
|
||||
self.timestamps.append(time.perf_counter())
|
||||
self.text_queue.put(self.stop_signal)
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
@ -153,22 +144,13 @@ class BenchmarkStreamer(BaseStreamer):
|
||||
class BenchmarkRunner:
|
||||
"""Main benchmark runner that coordinates benchmark execution."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
logger: logging.Logger,
|
||||
output_dir: str | None = None,
|
||||
branch_name: str | None = None,
|
||||
commit_id: str | None = None,
|
||||
commit_message: str | None = None,
|
||||
) -> None:
|
||||
def __init__(self, logger: logging.Logger, output_dir: str | None = None, commit_id: str | None = None) -> None:
|
||||
# Those stay constant for the whole run
|
||||
self.logger = logger
|
||||
if output_dir is None:
|
||||
output_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "benchmark_results")
|
||||
self.output_dir = output_dir
|
||||
self.branch_name = branch_name
|
||||
self.commit_id = get_git_revision() if commit_id is None else commit_id
|
||||
self.commit_message = commit_message
|
||||
os.makedirs(self.output_dir, exist_ok=True)
|
||||
self.profile_dir = None
|
||||
# Attributes that are reset for each model
|
||||
@ -181,7 +163,7 @@ class BenchmarkRunner:
|
||||
self.model = None
|
||||
flush_memory()
|
||||
|
||||
def setup_benchmark(self, model_id: str, config: BenchmarkConfig) -> None:
|
||||
def setup_one_run(self, model_id: str, config: BenchmarkConfig) -> None:
|
||||
# Some attributes only need to be set once per model
|
||||
if self._setup_for != model_id:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
@ -218,13 +200,10 @@ class BenchmarkRunner:
|
||||
self.model = self.model.eval().to(config.device)
|
||||
|
||||
# Kernelize the model if needed
|
||||
if config.kernelize and kernelize is not None and Mode is not None:
|
||||
if config.kernelize:
|
||||
self.model = kernelize(self.model, mode=Mode.INFERENCE)
|
||||
|
||||
def run_benchmark(
|
||||
self, model_id: str, config: BenchmarkConfig, num_tokens_to_profile: int = 0
|
||||
) -> dict[str, Any] | None:
|
||||
"""Run a single benchmark with the given model ID and config."""
|
||||
def run_one_benchmark(self, model_id: str, config: BenchmarkConfig, num_tokens_to_profile: int = 0) -> None:
|
||||
sdpa_ctx = nullcontext()
|
||||
if config.attn_implementation == "sdpa":
|
||||
sdpa_backend = get_sdpa_backend(config.sdpa_backend)
|
||||
@ -234,9 +213,8 @@ class BenchmarkRunner:
|
||||
self.logger.info(f"Running benchmark scenario: {config.name}")
|
||||
|
||||
# Quick validation: try one measurement first to see if this scenario works
|
||||
generate_fn = self.time_generate_batch if config.continuous_batching else self.time_generate
|
||||
flush_memory()
|
||||
e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
|
||||
e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
|
||||
max_new_tokens=1, gpu_monitor=None
|
||||
)
|
||||
if e2e_latency < 0:
|
||||
@ -246,14 +224,14 @@ class BenchmarkRunner:
|
||||
# Warmup runs
|
||||
self.logger.info(f"Warming up with {config.warmup_iterations} iterations...")
|
||||
for _ in trange(config.warmup_iterations):
|
||||
_ = generate_fn(max_new_tokens=config.num_tokens_to_generate)
|
||||
_ = self.time_generate(max_new_tokens=config.num_tokens_to_generate)
|
||||
self.logger.info("Warmup over.")
|
||||
|
||||
# Measurement runs
|
||||
result = BenchmarkResult()
|
||||
self.logger.info(f"Benchmarking with {config.measurement_iterations} iterations.")
|
||||
for _ in trange(config.measurement_iterations):
|
||||
e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
|
||||
e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
|
||||
max_new_tokens=config.num_tokens_to_generate,
|
||||
gpu_monitor=(GPUMonitor(logger=self.logger) if config.gpu_monitoring else None),
|
||||
)
|
||||
@ -265,68 +243,11 @@ class BenchmarkRunner:
|
||||
self.profile_generate(num_tokens_to_profile, config.name)
|
||||
|
||||
return {
|
||||
"metadata": BenchmarkMetadata(
|
||||
model_id=model_id,
|
||||
branch_name=self.branch_name,
|
||||
commit_id=self.commit_id,
|
||||
commit_message=self.commit_message,
|
||||
),
|
||||
"metadata": BenchmarkMetadata(model_id=model_id, commit_id=self.commit_id),
|
||||
"measurements": result,
|
||||
"config": config,
|
||||
}
|
||||
|
||||
# TODO: refactor `generate_batch` to handle streaming so we can use it here
|
||||
def time_generate_batch(
|
||||
self,
|
||||
max_new_tokens: int,
|
||||
gpu_monitor: GPUMonitor | None = None,
|
||||
) -> tuple[float, list[float], str, GPURawMetrics | None]:
|
||||
if gpu_monitor is not None:
|
||||
gpu_monitor.start()
|
||||
config = GenerationConfig(
|
||||
max_new_tokens=max_new_tokens,
|
||||
eos_token_id=self.tokenizer.eos_token_id,
|
||||
pad_token_id=self.tokenizer.pad_token_id,
|
||||
do_sample=True,
|
||||
)
|
||||
manager = self.model.init_continuous_batching(config)
|
||||
manager.start()
|
||||
try:
|
||||
first_req_results = []
|
||||
timestamps = []
|
||||
wall_time_0 = time.perf_counter()
|
||||
inputs = self.inputs["input_ids"].tolist()
|
||||
manager.add_requests(inputs, max_new_tokens=max_new_tokens, streaming=True)
|
||||
first_req_id = None
|
||||
num_requests = len(inputs)
|
||||
finished_requests = 0
|
||||
while finished_requests < num_requests:
|
||||
# NOTE: I don't like having the extra if stmt here, but hopefully won't degrade perf too much
|
||||
result = manager.get_result()
|
||||
if result:
|
||||
timestamps.append(time.perf_counter() - wall_time_0)
|
||||
if result.is_finished():
|
||||
finished_requests += 1
|
||||
if first_req_id is None:
|
||||
first_req_id = result.request_id
|
||||
if result.request_id == first_req_id:
|
||||
first_req_results.append(result)
|
||||
else:
|
||||
if not manager.is_running():
|
||||
raise RuntimeError("Generation thread exited unexpectedly")
|
||||
wall_time_1 = time.perf_counter()
|
||||
gpu_metrics = gpu_monitor.stop_and_collect() if gpu_monitor is not None else None
|
||||
decoded_output = self.tokenizer.decode(
|
||||
[res.generated_tokens[0] for res in first_req_results], skip_special_tokens=True
|
||||
)
|
||||
shape_and_decoded_output = f"{(1, len(first_req_results))} | {decoded_output}"
|
||||
e2e_latency = wall_time_1 - wall_time_0
|
||||
return e2e_latency, timestamps, shape_and_decoded_output, gpu_metrics
|
||||
except Exception as e:
|
||||
raise e
|
||||
finally:
|
||||
manager.stop()
|
||||
|
||||
def time_generate(
|
||||
self,
|
||||
max_new_tokens: int,
|
||||
@ -384,28 +305,33 @@ class BenchmarkRunner:
|
||||
benchmark_configs: list[BenchmarkConfig],
|
||||
num_tokens_to_profile: int = 0,
|
||||
pretty_print_summary: bool = True,
|
||||
) -> tuple[str, dict[str, Any]]:
|
||||
"""Run multiple benchmarks for the given model ID and list of benchmark configs."""
|
||||
) -> dict[str, Any]:
|
||||
all_results = {}
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
start_time = time.perf_counter()
|
||||
|
||||
n_configs = len(benchmark_configs)
|
||||
for i, config in enumerate(benchmark_configs):
|
||||
# Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
|
||||
if config.attn_implementation == "sdpa" and config.sdpa_backend is None:
|
||||
default_backend = "flash_attention" # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
|
||||
self.logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
|
||||
config.sdpa_backend = default_backend
|
||||
|
||||
# Skip if already run
|
||||
if config.hash in all_results:
|
||||
self.logger.info(f"Skipping duplicate config {config.name} for model {model_id} ({i + 1}/{n_configs})")
|
||||
continue
|
||||
|
||||
# Otherwise, run the benchmark
|
||||
self.setup_benchmark(model_id, config)
|
||||
self.setup_one_run(model_id, config)
|
||||
self.logger.info(
|
||||
f"Running benchmark of model {model_id} with scenario: {config.name} ({i + 1}/{n_configs})"
|
||||
)
|
||||
|
||||
# Launch benchmark in a try/except block to avoid stopping the whole run if one benchmark fails
|
||||
try:
|
||||
results = self.run_benchmark(model_id, config, num_tokens_to_profile)
|
||||
results = self.run_one_benchmark(model_id, config, num_tokens_to_profile)
|
||||
if results is not None:
|
||||
all_results[config.hash] = results
|
||||
|
||||
@ -429,14 +355,10 @@ class BenchmarkRunner:
|
||||
for result in all_results.values():
|
||||
print("=" * 100)
|
||||
print(f"Config: {result['config'].infer_name(compact=False)}\n")
|
||||
result["measurements"].pprint(
|
||||
batch_size=result["config"].batch_size,
|
||||
num_generated_tokens=result["config"].num_tokens_to_generate,
|
||||
tabs=1,
|
||||
)
|
||||
result["measurements"].pprint(batch_size=result["config"].batch_size, tabs=1)
|
||||
print("=" * 100)
|
||||
|
||||
return (timestamp, all_results)
|
||||
return all_results
|
||||
|
||||
def save_results(self, model_name: str, results: dict, timestamp: str = "") -> str:
|
||||
"""Save benchmark results to JSON file."""
|
||||
@ -465,43 +387,3 @@ class BenchmarkRunner:
|
||||
|
||||
self.logger.info(f"Results saved to {filepath}")
|
||||
return filepath
|
||||
|
||||
def push_results_to_hub(self, dataset_id: str, results: dict[Any, Any], timestamp: str) -> None:
|
||||
if PUSH_TO_HUB_TOKEN is None:
|
||||
raise ValueError(
|
||||
"PUSH_TO_HUB_TOKEN is not set, cannot push results to the Hub. When setting dataset_id, please also set the PUSH_TO_HUB_TOKEN environment variable."
|
||||
)
|
||||
|
||||
n_results = len(results)
|
||||
self.logger.info(f"Pushing {n_results} results to: {dataset_id}")
|
||||
rows = []
|
||||
for cfg_hash, entry in results.items():
|
||||
row = {
|
||||
"benchmark_config_hash": cfg_hash,
|
||||
"config": entry["config"].to_dict(),
|
||||
"measurements": entry["measurements"].to_dict(),
|
||||
"metadata": entry["metadata"].to_dict(),
|
||||
}
|
||||
rows.append(row)
|
||||
|
||||
ds = Dataset.from_list(rows)
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
jsonl_path = os.path.join(tmp, "data.jsonl")
|
||||
with open(jsonl_path, "w") as f:
|
||||
json_lines = []
|
||||
for ex in ds:
|
||||
json_lines.append(json.dumps(ex, ensure_ascii=False))
|
||||
f.write("\n".join(json_lines))
|
||||
|
||||
api = HfApi()
|
||||
# NOTE: we expect the repository to already exist
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if not timestamp else timestamp
|
||||
file_name = f"benchmark_run_{timestamp}.jsonl"
|
||||
api.upload_file(
|
||||
path_or_fileobj=jsonl_path,
|
||||
path_in_repo=file_name,
|
||||
repo_id=dataset_id,
|
||||
repo_type="dataset",
|
||||
token=PUSH_TO_HUB_TOKEN,
|
||||
)
|
||||
self.logger.info(f"Succesfully uploaded results to: {dataset_id}")
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
@ -36,17 +36,16 @@ def add_unit_to_duration(stats: dict[str, float]) -> dict[str, str]:
|
||||
return stats
|
||||
|
||||
|
||||
def equalize_lengths_and_collate(stats: dict[str, dict[str, str]]) -> dict[str, str]:
|
||||
"""Note: This operation is destructive as it will update values in place before returning a new correctly formatted dict"""
|
||||
def equalize_lengths_and_collate(stats: list[dict[str, str]]) -> list[str]:
|
||||
keys = ["avg", "std", "min", "med", "max", "p95"]
|
||||
for key in keys:
|
||||
max_length = max(len(stat[key]) for stat in stats.values())
|
||||
for stat in stats.values():
|
||||
max_length = max(len(stat[key]) for stat in stats)
|
||||
for stat in stats:
|
||||
stat[key] = stat[key].ljust(max_length, " ")
|
||||
return {name: " ".join([f"{key}={stat[key]}" for key in keys]) for name, stat in stats.items()}
|
||||
return [" ".join([f"{key}={stat[key]}" for key in keys]) for stat in stats]
|
||||
|
||||
|
||||
def pretty_print_dict(data: dict[str, str], tabs: int = 0) -> None:
|
||||
def pretty_print_dict(data: dict[str, Any], tabs: int = 0) -> None:
|
||||
max_key_length = max([len(key) for key in data.keys()])
|
||||
for key, value in data.items():
|
||||
tabs_str = " " * tabs
|
||||
@ -60,26 +59,19 @@ class BenchmarkMetadata:
|
||||
|
||||
model_id: str
|
||||
timestamp: str
|
||||
branch_name: str
|
||||
commit_id: str
|
||||
commit_message: str
|
||||
hardware_info: HardwareInfo
|
||||
|
||||
def __init__(self, model_id: str, commit_id: str, branch_name: str = "main", commit_message: str = "") -> None:
|
||||
def __init__(self, model_id: str, commit_id: str):
|
||||
self.model_id = model_id
|
||||
self.timestamp = datetime.now(timezone.utc).isoformat()
|
||||
self.branch_name = branch_name
|
||||
self.timestamp = datetime.utcnow().isoformat()
|
||||
self.commit_id = commit_id
|
||||
self.commit_message = commit_message
|
||||
self.hardware_info = HardwareInfo()
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"model_id": self.model_id,
|
||||
"timestamp": self.timestamp,
|
||||
"branch_name": self.branch_name,
|
||||
"commit_id": self.commit_id,
|
||||
"commit_message": self.commit_message,
|
||||
"hardware_info": self.hardware_info.to_dict(),
|
||||
}
|
||||
|
||||
@ -142,19 +134,27 @@ class BenchmarkResult:
|
||||
def get_measured_itl(self) -> list[float]:
|
||||
return [(dt[-1] - dt[0]) / (len(dt) - 1) for dt in self.token_generation_times if len(dt) > 1]
|
||||
|
||||
def get_throughput(self, total_generated_tokens: int) -> list[float]:
|
||||
return [total_generated_tokens / e2e_latency for e2e_latency in self.e2e_latency]
|
||||
def get_throughput(self, batch_size: int) -> float:
|
||||
return [
|
||||
batch_size * len(dt) / e2e_latency
|
||||
for e2e_latency, dt in zip(self.e2e_latency, self.token_generation_times)
|
||||
]
|
||||
|
||||
def pprint(self, batch_size: int = 0, num_generated_tokens: int = 0, tabs: int = 0) -> None:
|
||||
measurements = {
|
||||
"E2E Latency": add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
|
||||
"Time to First Token": add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
|
||||
}
|
||||
itl_values = self.get_measured_itl()
|
||||
if len(itl_values) > 0:
|
||||
measurements["Inter-Token Latency"] = add_unit_to_duration(compute_basic_statistics(itl_values))
|
||||
def pprint(self, batch_size: int = 0, tabs: int = 0) -> None:
|
||||
stats_to_collate = [
|
||||
add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
|
||||
add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
|
||||
add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
|
||||
]
|
||||
if batch_size > 0:
|
||||
throughput_stats = compute_basic_statistics(self.get_throughput(batch_size * num_generated_tokens))
|
||||
measurements["Throughput"] = {key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()}
|
||||
dict_to_pprint = equalize_lengths_and_collate(measurements)
|
||||
throughput_stats = compute_basic_statistics(self.get_throughput(batch_size))
|
||||
stats_to_collate.append({key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()})
|
||||
collated_stats = equalize_lengths_and_collate(stats_to_collate)
|
||||
dict_to_pprint = {
|
||||
"E2E Latency": collated_stats[0],
|
||||
"Time to First Token": collated_stats[1],
|
||||
"Inter-Token Latency": collated_stats[2],
|
||||
}
|
||||
if batch_size > 0:
|
||||
dict_to_pprint["Throughput"] = collated_stats[3]
|
||||
pretty_print_dict(dict_to_pprint, tabs=tabs)
|
||||
|
||||
@ -2,5 +2,6 @@ numpy>=1.21.0
|
||||
psutil>=5.8.0
|
||||
gpustat>=1.0.0
|
||||
torch>=2.0.0
|
||||
transformers>=4.30.0
|
||||
datasets>=2.10.0
|
||||
huggingface_hub>=0.16.0
|
||||
huggingface_hub>=0.16.0
|
||||
@ -23,7 +23,7 @@ import logging
|
||||
import sys
|
||||
import uuid
|
||||
|
||||
from framework.benchmark_config import adapt_configs, get_config_by_level
|
||||
from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
|
||||
from framework.benchmark_runner import BenchmarkRunner
|
||||
|
||||
|
||||
@ -33,37 +33,18 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--output-dir", type=str, default=None, help="Output dir for benchmark results")
|
||||
parser.add_argument("--log-level", type=str, choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO")
|
||||
parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")
|
||||
parser.add_argument("--warmup", "-w", type=int, default=3, help="Number of warmup iterations")
|
||||
parser.add_argument("--iterations", "-i", type=int, default=10, help="Number of measurement iterations")
|
||||
|
||||
parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations")
|
||||
parser.add_argument("--iterations", type=int, default=10, help="Number of measurement iterations")
|
||||
|
||||
parser.add_argument("--batch-size", "-b", type=int, nargs="+", help="Batch size")
|
||||
parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
|
||||
parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")
|
||||
|
||||
parser.add_argument(
|
||||
"--level",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Level of coverage for the benchmark. 0: only the main config, 1: a few important configs, 2: a config for"
|
||||
" each attn implementation an option, 3: cross-generate all combinations of configs, 4: cross-generate all"
|
||||
" combinations of configs w/ all compile modes",
|
||||
)
|
||||
parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
|
||||
parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")
|
||||
|
||||
parser.add_argument("--branch-name", type=str, help="Git branch name")
|
||||
parser.add_argument("--commit-id", type=str, help="Git commit ID (if not provided, will auto-detect from git)")
|
||||
parser.add_argument("--commit-message", type=str, help="Git commit message")
|
||||
|
||||
parser.add_argument(
|
||||
"--no-gpu-monitoring", action="store_true", help="Disables GPU monitoring during benchmark runs"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--push-result-to-dataset",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Name of the dataset to push results to. If not provided, results are not pushed to the Hub.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Setup logging
|
||||
@ -80,42 +61,56 @@ if __name__ == "__main__":
|
||||
logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
|
||||
logger.info(f"Output directory: {args.output_dir}")
|
||||
|
||||
# We cannot compute ITL if we don't have at least two measurements
|
||||
if any(n <= 1 for n in args.num_tokens_to_generate):
|
||||
raise ValueError("--num_tokens_to_generate arguments should be larger than 1")
|
||||
|
||||
# Error out if one of the arguments is not provided
|
||||
if len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 0:
|
||||
raise ValueError(
|
||||
"At least one of the arguments --batch-size, --sequence-length, or --num-tokens-to-generate is required"
|
||||
)
|
||||
|
||||
# Get the configs for the given coverage level
|
||||
configs = get_config_by_level(args.level)
|
||||
# Adapt the configs to the given arguments
|
||||
configs = adapt_configs(
|
||||
configs,
|
||||
args.warmup,
|
||||
args.iterations,
|
||||
args.batch_size,
|
||||
args.sequence_length,
|
||||
args.num_tokens_to_generate,
|
||||
not args.no_gpu_monitoring,
|
||||
# If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
|
||||
elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
|
||||
if args.cross_generate:
|
||||
benchmark_configs = generate_all_configs(
|
||||
warmup_iterations=args.warmup,
|
||||
measurement_iterations=args.iterations,
|
||||
batch_size=args.batch_size[0],
|
||||
sequence_length=args.sequence_length[0],
|
||||
num_tokens_to_generate=args.num_tokens_to_generate[0],
|
||||
)
|
||||
else:
|
||||
benchmark_configs = generate_main_configs(
|
||||
warmup_iterations=args.warmup,
|
||||
measurement_iterations=args.iterations,
|
||||
batch_size=args.batch_size[0],
|
||||
sequence_length=args.sequence_length[0],
|
||||
num_tokens_to_generate=args.num_tokens_to_generate[0],
|
||||
)
|
||||
|
||||
# Otherwise, we benchmark across all combinations of dimensions
|
||||
else:
|
||||
main_config = generate_main_configs(
|
||||
warmup_iterations=args.warmup,
|
||||
measurement_iterations=args.iterations,
|
||||
batch_size=args.batch_size[0],
|
||||
sequence_length=args.sequence_length[0],
|
||||
num_tokens_to_generate=args.num_tokens_to_generate[0],
|
||||
)[0]
|
||||
benchmark_configs = []
|
||||
for num_tokens_to_generate in args.num_tokens_to_generate:
|
||||
for sequence_length in args.sequence_length:
|
||||
for batch_size in args.batch_size:
|
||||
cfg_dict = main_config.to_dict()
|
||||
cfg_dict["batch_size"] = batch_size
|
||||
cfg_dict["sequence_length"] = sequence_length
|
||||
cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
|
||||
cfg_dict.pop("name")
|
||||
benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))
|
||||
|
||||
runner = BenchmarkRunner(logger, args.output_dir, args.commit_id)
|
||||
results = runner.run_benchmarks(
|
||||
args.model_id,
|
||||
benchmark_configs,
|
||||
args.num_tokens_to_profile,
|
||||
pretty_print_summary=True,
|
||||
)
|
||||
|
||||
runner = BenchmarkRunner(logger, args.output_dir, args.branch_name, args.commit_id, args.commit_message)
|
||||
timestamp, results = runner.run_benchmarks(
|
||||
args.model_id, configs, args.num_tokens_to_profile, pretty_print_summary=True
|
||||
)
|
||||
|
||||
# Check if we have any successful benchmark results
|
||||
if len(results) == 0:
|
||||
logger.error("No benchmarks completed successfully. All benchmark configurations failed.")
|
||||
sys.exit(1)
|
||||
|
||||
dataset_id = args.push_result_to_dataset
|
||||
if dataset_id is not None and len(results) > 0:
|
||||
runner.push_results_to_hub(dataset_id, results, timestamp)
|
||||
|
||||
logger.info("Benchmark run completed successfully")
|
||||
sys.exit(0)
|
||||
# runner.save_results(args.model_id, results)
|
||||
|
||||
@ -58,6 +58,7 @@ NOT_DEVICE_TESTS = {
|
||||
"test_model_get_set_embeddings",
|
||||
"test_model_main_input_name",
|
||||
"test_correct_missing_keys",
|
||||
"test_tie_model_weights",
|
||||
"test_can_use_safetensors",
|
||||
"test_load_save_without_tied_weights",
|
||||
"test_tied_weights_keys",
|
||||
@ -87,8 +88,6 @@ def pytest_configure(config):
|
||||
config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
|
||||
config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
|
||||
config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
|
||||
config.addinivalue_line("markers", "flash_attn_test: mark test which tests flash attention functionality")
|
||||
config.addinivalue_line("markers", "flash_attn_3_test: mark test which tests flash attention 3 functionality")
|
||||
|
||||
os.environ["DISABLE_SAFETENSORS_CONVERSION"] = "true"
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@ ARG REF=main
|
||||
RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip install uv && uv pip install --no-cache-dir -U pip setuptools GitPython
|
||||
RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir --upgrade 'torch<2.9' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir pypi-kenlm
|
||||
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[quality,testing,torch-speech,vision]"
|
||||
RUN git lfs install
|
||||
|
||||
@ -17,7 +17,7 @@ RUN make install -j 10
|
||||
|
||||
WORKDIR /
|
||||
|
||||
RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache --upgrade 'torch<2.9' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,spacy,ftfy,rjieba]" unidic unidic-lite
|
||||
# spacy is not used so not tested. Causes to failures. TODO fix later
|
||||
|
||||
@ -5,7 +5,7 @@ USER root
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@ USER root
|
||||
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1 g++ tesseract-ocr git-lfs curl
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir --no-deps timm accelerate
|
||||
RUN uv pip install -U --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
|
||||
# RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
|
||||
|
||||
@ -5,7 +5,7 @@ USER root
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg curl
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@ USER root
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
|
||||
|
||||
|
||||
@ -9,15 +9,10 @@ SHELL ["sh", "-lc"]
|
||||
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
|
||||
# to be used as arguments for docker build (so far).
|
||||
|
||||
ARG PYTORCH='2.9.0'
|
||||
ARG PYTORCH='2.8.0'
|
||||
# Example: `cu102`, `cu113`, etc.
|
||||
ARG CUDA='cu126'
|
||||
|
||||
# This needs to be compatible with the above `PYTORCH`.
|
||||
ARG TORCHCODEC='0.8.0'
|
||||
|
||||
ARG FLASH_ATTN='false'
|
||||
|
||||
RUN apt update
|
||||
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
|
||||
RUN git lfs install
|
||||
@ -26,48 +21,14 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
|
||||
ARG REF=main
|
||||
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev]
|
||||
|
||||
# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
|
||||
# 2. For `torchcodec`, use `cpu` as we don't have `libnvcuvid.so` on the host runner. See https://github.com/meta-pytorch/torchcodec/issues/912
|
||||
# **Important**: We need to specify `torchcodec` version if the torch version is not the latest stable one.
|
||||
# 3. `set -e` means "exit immediately if any command fails".
|
||||
RUN set -e; \
|
||||
# Determine torch version
|
||||
if [ ${#PYTORCH} -gt 0 ] && [ "$PYTORCH" != "pre" ]; then \
|
||||
VERSION="torch==${PYTORCH}.*"; \
|
||||
TORCHCODEC_VERSION="torchcodec==${TORCHCODEC}.*"; \
|
||||
else \
|
||||
VERSION="torch"; \
|
||||
TORCHCODEC_VERSION="torchcodec"; \
|
||||
fi; \
|
||||
\
|
||||
# Log the version being installed
|
||||
echo "Installing torch version: $VERSION"; \
|
||||
\
|
||||
# Install PyTorch packages
|
||||
if [ "$PYTORCH" != "pre" ]; then \
|
||||
python3 -m pip install --no-cache-dir -U \
|
||||
$VERSION \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
--extra-index-url https://download.pytorch.org/whl/$CUDA; \
|
||||
# We need to specify the version if the torch version is not the latest stable one.
|
||||
python3 -m pip install --no-cache-dir -U \
|
||||
$TORCHCODEC_VERSION --extra-index-url https://download.pytorch.org/whl/cpu; \
|
||||
else \
|
||||
python3 -m pip install --no-cache-dir -U --pre \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly/$CUDA; \
|
||||
python3 -m pip install --no-cache-dir -U --pre \
|
||||
torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/cpu; \
|
||||
fi
|
||||
# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
|
||||
# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
|
||||
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir -U timm
|
||||
|
||||
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir --no-build-isolation git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"
|
||||
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir pytesseract
|
||||
|
||||
@ -92,7 +53,7 @@ RUN python3 -m pip install --no-cache-dir bitsandbytes
|
||||
RUN python3 -m pip install --no-cache-dir quanto
|
||||
|
||||
# After using A10 as CI runner, let's run FA2 tests
|
||||
RUN [ "$FLASH_ATTN" != "false" ] && python3 -m pip uninstall -y ninja && python3 -m pip install --no-cache-dir ninja && python3 -m pip install flash-attn --no-cache-dir --no-build-isolation || echo "Don't install FA2 with nightly torch"
|
||||
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip uninstall -y ninja && python3 -m pip install --no-cache-dir ninja && python3 -m pip install flash-attn --no-cache-dir --no-build-isolation || echo "Don't install FA2 with nightly torch"
|
||||
|
||||
# TODO (ydshieh): check this again
|
||||
# `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests
|
||||
|
||||
@ -10,7 +10,7 @@ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y te
|
||||
# Torch needs to be installed before deepspeed
|
||||
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed]
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir --no-build-isolation torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
|
||||
RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
|
||||
RUN python3 -m pip install -U "itsdangerous<2.1.0"
|
||||
|
||||
# Test if the image could successfully build the doc. before publishing the image
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
FROM rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.7.1
|
||||
FROM rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
|
||||
LABEL maintainer="Hugging Face"
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
@ -10,8 +10,8 @@ RUN apt update && \
|
||||
|
||||
RUN git lfs install
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip numpy importlib-metadata setuptools wheel ninja pytesseract "itsdangerous<2.1.0"
|
||||
RUN python3 -m pip install --no-cache-dir --no-build-isolation git+https://github.com/facebookresearch/detectron2.git
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip numpy
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"
|
||||
|
||||
ARG REF=main
|
||||
WORKDIR /
|
||||
@ -39,7 +39,6 @@ RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
|
||||
# Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
|
||||
RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \
|
||||
cd flash-attention && \
|
||||
GPU_ARCHS="gfx942" python setup.py install
|
||||
# GPU_ARCHS builds for MI300, MI325 but not MI355: we would need to add `;gfx950` but it takes too long to build.
|
||||
GPU_ARCHS="gfx942" python setup.py install
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir einops
|
||||
|
||||
@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p
|
||||
# Install latest release PyTorch
|
||||
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
|
||||
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
|
||||
RUN python3 -m pip uninstall -y torch torchvision torchaudio torchcodec && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
|
||||
@ -43,7 +43,7 @@ RUN python3 -m pip uninstall -y deepspeed
|
||||
# This has to be run (again) inside the GPU VMs running the tests.
|
||||
# The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
|
||||
# TODO: Find out why test fail.
|
||||
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check 2>&1
|
||||
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
|
||||
|
||||
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
|
||||
RUN python3 -m pip uninstall -y kernels
|
||||
|
||||
@ -3,10 +3,11 @@ LABEL maintainer="Hugging Face"
|
||||
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
ARG PYTHON_VER=3.12
|
||||
ARG PYTHON_VER=3.11
|
||||
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get remove -y python3.10 && apt-get autoremove -y
|
||||
RUN apt-get update && \
|
||||
apt-get install -y software-properties-common && \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && \
|
||||
@ -22,6 +23,7 @@ RUN apt-get update && \
|
||||
apt-utils \
|
||||
build-essential \
|
||||
ca-certificates \
|
||||
clinfo \
|
||||
curl \
|
||||
git \
|
||||
git-lfs \
|
||||
@ -33,6 +35,7 @@ RUN apt-get update && \
|
||||
rsync \
|
||||
sudo \
|
||||
libnl-genl-3-200 \
|
||||
xpu-smi \
|
||||
unzip \
|
||||
ffmpeg \
|
||||
tesseract-ocr \
|
||||
@ -42,47 +45,34 @@ RUN apt-get update && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
linux-headers-$(uname -r) linux-modules-extra-$(uname -r) \
|
||||
linux-headers-$(uname -r) \
|
||||
linux-modules-extra-$(uname -r) \
|
||||
flex bison \
|
||||
intel-fw-gpu intel-i915-dkms xpu-smi intel-ocloc clinfo\
|
||||
intel-fw-gpu intel-i915-dkms xpu-smi \
|
||||
intel-opencl-icd libze-intel-gpu1 libze1 \
|
||||
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
|
||||
libegl-mesa0 libegl1 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
|
||||
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
|
||||
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
|
||||
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo \
|
||||
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc \
|
||||
libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Use virtual env because Ubuntu-24 does not allowed pip on original python
|
||||
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
ENV PATH="/root/.local/bin:$PATH"
|
||||
ENV VIRTUAL_ENV="/opt/venv"
|
||||
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
|
||||
RUN uv venv --python ${PYTHON_VER} --seed ${VIRTUAL_ENV}
|
||||
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
||||
RUN pip install --upgrade pip
|
||||
RUN pip install triton==3.3.0
|
||||
|
||||
RUN pip install --upgrade pip wheel
|
||||
RUN pip install triton==3.4.0
|
||||
RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
|
||||
|
||||
RUN pip install torch==2.8.0+xpu torchvision==0.23.0+xpu torchaudio==2.8.0+xpu --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
|
||||
RUN pip install evaluate torchdata pyctcdecode pytesseract decord galore-torch fire scipy scikit-learn sentencepiece sacremoses nltk rouge_score librosa soundfile g2p_en mpi4py requests_mock
|
||||
RUN pip install pretty_midi essentia resampy Levenshtein av sacrebleu phonemizer invisible_watermark schedulefree
|
||||
RUN pip install gguf hqq compressed_tensors gptqmodel mergekit autoawq deepspeed torchao onnx
|
||||
RUN pip install hf_transfer huggingface-hub hf-doc-builder datasets optimum-quanto timm transformers accelerate optimum peft
|
||||
|
||||
RUN pip install torchcodec torchdata --no-cache-dir
|
||||
|
||||
RUN pip install evaluate pyctcdecode pytesseract decord galore-torch fire scipy scikit-learn sentencepiece sacremoses nltk rouge_score librosa soundfile g2p_en mpi4py requests_mock
|
||||
RUN pip install pretty_midi essentia resampy Levenshtein av sacrebleu phonemizer invisible_watermark schedulefree setuptools
|
||||
RUN pip install gptqmodel --no-build-isolation
|
||||
RUN pip install gguf hqq compressed_tensors autoawq deepspeed torchao onnx auto_round
|
||||
RUN pip install hf_transfer huggingface-hub hf-doc-builder datasets optimum-quanto timm transformers accelerate optimum peft diffusers trl kernels
|
||||
|
||||
# install liger-kernel
|
||||
RUN pip install git+https://github.com/linkedin/Liger-Kernel.git --extra-index-url https://download.pytorch.org/whl/test/xpu
|
||||
|
||||
# install mergekit
|
||||
RUN pip install --break-system-packages git+https://github.com/arcee-ai/mergekit.git@v0.1.3
|
||||
|
||||
# install bitsandbytes
|
||||
RUN pip install git+https://github.com/bitsandbytes-foundation/bitsandbytes.git
|
||||
|
||||
|
||||
@ -24,7 +24,7 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch';
|
||||
RUN echo torch=$VERSION
|
||||
# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
|
||||
# Currently, let's just use their latest releases (when `torch` is installed with a release version)
|
||||
RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
|
||||
@ -50,7 +50,7 @@ RUN python3 -m pip install --no-cache-dir hqq
|
||||
RUN python3 -m pip install --no-cache-dir gguf
|
||||
|
||||
# Add autoawq for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir --no-build-isolation autoawq[kernels]
|
||||
RUN python3 -m pip install --no-cache-dir autoawq[kernels]
|
||||
|
||||
# Add quanto for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir optimum-quanto
|
||||
|
||||
@ -24,7 +24,7 @@ pip install -e ".[dev]"
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> This command might fail for some OS that are missing dependencies. Check step 4 in [Create a Pull Request](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#create-a-pull-request) to work around it.
|
||||
> This command might fail for some OS that are missing dependencies. Check step 4 in [Create a Pull Request](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#create-a-pull-request) to workaround it.
|
||||
|
||||
Then you need to install our special tool that builds the documentation:
|
||||
|
||||
@ -38,7 +38,7 @@ pip install git+https://github.com/huggingface/doc-builder
|
||||
|
||||
## Building the documentation
|
||||
|
||||
Once you have set up the `doc-builder` and additional packages, you can generate the documentation by
|
||||
Once you have setup the `doc-builder` and additional packages, you can generate the documentation by
|
||||
typing the following command:
|
||||
|
||||
```bash
|
||||
@ -295,11 +295,12 @@ Here's an example of a tuple return, comprising several objects:
|
||||
Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos, and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
|
||||
the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
|
||||
them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
|
||||
If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate them to this dataset.
|
||||
If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
|
||||
to this dataset.
|
||||
|
||||
## Styling the docstring
|
||||
|
||||
We have an automatic script running with the `make style` command that will make sure that:
|
||||
We have an automatic script running with the `make style` comment that will make sure that:
|
||||
- the docstrings fully take advantage of the line width
|
||||
- all code examples are formatted using black, like the code of the Transformers library
|
||||
|
||||
|
||||
@ -123,6 +123,8 @@
|
||||
title: تشغيل التدريب على Amazon SageMaker
|
||||
- local: serialization
|
||||
title: التصدير إلى ONNX
|
||||
- local: torchscript
|
||||
title: التصدير إلى TorchScript
|
||||
- local: notebooks
|
||||
title: دفاتر الملاحظات مع الأمثلة
|
||||
- local: community
|
||||
@ -258,6 +260,8 @@
|
||||
# title: النماذج
|
||||
# - local: main_classes/text_generation
|
||||
# title: توليد النصوص
|
||||
# - local: main_classes/onnx
|
||||
# title: ONNX
|
||||
# - local: main_classes/optimizer_schedules
|
||||
# title: التحسين
|
||||
# - local: main_classes/output
|
||||
|
||||
@ -32,7 +32,7 @@
|
||||
لتصدير نموذج 🤗 Transformers إلى ONNX، قم أولاً بتثبيت اعتماد إضافي:
|
||||
|
||||
```bash
|
||||
pip install optimum-onnx
|
||||
pip install optimum[exporters]
|
||||
```
|
||||
|
||||
للاطلاع على جميع المعامﻻت المتاحة، يرجى الرجوع إلى [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli)، أو عرض المساعدة في سطر الأوامر:
|
||||
@ -111,3 +111,60 @@ optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_s
|
||||
### تصدير نموذج لهندسة غير مدعومة
|
||||
|
||||
إذا كنت ترغب في المساهمة من خلال إضافة دعم لنموذج لا يُمكن تصديره حاليًا، فيجب عليك أولاً التحقق مما إذا كان مدعومًا في [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview)، وإذا لم يكن مدعومًا، [فيمكنك المساهمة في 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute) مُباشرةً.
|
||||
|
||||
### تصدير نموذج باستخدام `transformers.onnx`
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
لم يعد يتم دعم `transformers.onnx` يُرجى تصدير النماذج باستخدام 🤗 Optimum كما هو موضح أعلاه. سيتم إزالة هذا القسم في الإصدارات القادمة.
|
||||
|
||||
</Tip>
|
||||
|
||||
لتصدير نموذج 🤗 Transformers إلى ONNX باستخدام `transformers.onnx`، ثبّت التبعيات الإضافية:
|
||||
|
||||
```bash
|
||||
pip install transformers[onnx]
|
||||
```
|
||||
|
||||
استخدم حزمة `transformers.onnx` كنموذج Python لتصدير نقطة حفظ باستخدام تكوين جاهز:
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
|
||||
```
|
||||
|
||||
يُصدّر هذا رسمًا بيانيًا ONNX لنقطة الحفظ المُحددة بواسطة وسيطة `--model`. مرر أي نقطة حفظ على 🤗 Hub أو نقطة حفظ مُخزنة محليًا.
|
||||
يُمكن بعد ذلك تشغيل ملف `model.onnx` الناتج على أحد المُسرعات العديدة التي تدعم معيار ONNX. على سبيل المثال، قم بتحميل وتشغيل النموذج باستخدام ONNX Runtime كما يلي:
|
||||
|
||||
```python
|
||||
>>> from transformers import AutoTokenizer
|
||||
>>> from onnxruntime import InferenceSession
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
|
||||
>>> session = InferenceSession("onnx/model.onnx")
|
||||
>>> # يتوقع ONNX Runtime مصفوفات NumPy كمدخلات
|
||||
>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
|
||||
>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
|
||||
```
|
||||
|
||||
يُمكن الحصول على أسماء المخرجات المطلوبة (مثل `["last_hidden_state"]`) من خلال إلقاء نظرة على تكوين ONNX لكل نموذج. على سبيل المثال، بالنسبة لـ DistilBERT، لدينا:
|
||||
|
||||
```python
|
||||
>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
|
||||
|
||||
>>> config = DistilBertConfig()
|
||||
>>> onnx_config = DistilBertOnnxConfig(config)
|
||||
>>> print(list(onnx_config.outputs.keys()))
|
||||
["last_hidden_state"]
|
||||
```
|
||||
|
||||
العمليات مُتطابقة لنقاط الحفظ TensorFlow على Hub. على سبيل المثال، صدّر نقطة حفظ TensorFlow خالصة كما يلي:
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=keras-io/transformers-qa onnx/
|
||||
```
|
||||
|
||||
لتصدير نموذج مُخزن محليًا، احفظ أوزان النموذج ومجزىء اللغوى في نفس الدليل (على سبيل المثال `local-pt-checkpoint`)، ثم قم بتصديره إلى ONNX عن طريق توجيه وسيط `--model` لحزمة `transformers.onnx` إلى الدليل المطلوب:
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=local-pt-checkpoint onnx/
|
||||
```
|
||||
154
docs/source/ar/torchscript.md
Normal file
154
docs/source/ar/torchscript.md
Normal file
@ -0,0 +1,154 @@
|
||||
# التصدير إلى TorchScript
|
||||
|
||||
<Tip>
|
||||
|
||||
هذه هي بداية تجاربنا مع TorchScript ولا زلنا نستكشف قدراته مع نماذج المدخلات المتغيرة الحجم. إنه مجال اهتمامنا وسنعمق تحليلنا في الإصدارات القادمة، مع المزيد من الأمثلة البرمجية، وتنفيذ أكثر مرونة، ومقاييس مقارنة بين الأكواد القائمة على Python مع أكواد TorchScript المُجمّعة.
|
||||
|
||||
</Tip>
|
||||
|
||||
وفقًا لـ [وثائق TorchScript](https://pytorch.org/docs/stable/jit.html):
|
||||
|
||||
> TorchScript هي طريقة لإنشاء نماذج قابلة للتسلسل والتحسين من تعليمات PyTorch البرمجية.
|
||||
|
||||
هناك وحدتان من PyTorch، [JIT and TRACE](https://pytorch.org/docs/stable/jit.html)، تتيحان للمطورين تصدير نماذجهم لإعادة استخدامها في برامج أخرى مثل برامج C++ المُحسّنة للأداء.
|
||||
|
||||
نقدم واجهة تتيح لك تصدير نماذج 🤗 Transformers إلى TorchScript بحيث يمكن إعادة استخدامها في بيئة مختلفة عن برامج Python القائمة إلى PyTorch. هنا نشرح كيفية تصدير نماذجنا واستخدامها باستخدام TorchScript.
|
||||
|
||||
يتطلب تصدير نموذج أمرين:
|
||||
|
||||
- تهيئة مثيل للنموذج باستخدام علامة `torchscript`
|
||||
- تمرير مُدخلات وهمية (dummy inputs) خلال النموذج
|
||||
|
||||
تنطوي هذه الضرورات على عدة أمور يجب على المطورين توخي الحذر بشأنها كما هو مفصل أدناه.
|
||||
|
||||
## علامة TorchScript والأوزان المرتبطة
|
||||
|
||||
علامة `torchscript` ضرورية لأن معظم نماذج اللغة 🤗 Transformers لها أوزان مرتبطة بين طبقة `Embedding` وطبقة `Decoding`. لا يسمح لك TorchScript بتصدير النماذج ذات الأوزان المرتبطة، لذلك من الضروري فصل الأوزان ونسخها مسبقًا.
|
||||
|
||||
النماذج المُهيأة باستخدام علامة `torchscript` لها طبقة `Embedding` وطبقة`Decoding` منفصلتين، مما يعني أنه لا ينبغي تدريبها لاحقًا. سيؤدي التدريب إلى عدم تزامن الطبقتين، مما يؤدي إلى نتائج غير متوقعة.
|
||||
|
||||
هذا لا ينطبق على النماذج التي لا تحتوي على رأس نموذج اللغة، حيث لا تملك أوزانًا مرتبطة. يمكن تصدير هذه النماذج بأمان دون علامة `torchscript`.
|
||||
|
||||
## المدخلات الوهمية والأطوال القياسية
|
||||
|
||||
تُستخدم المُدخلات الوهمية لتمرير أمامي خلال النموذج. أثناء انتشار قيم المُدخلات عبر الطبقات، يتتبع PyTorch العمليات المختلفة التي يتم تنفيذها على كل مصفوفة(tensor). ثم يتم استخدام هذه العمليات المُسجلة بعد ذلك لإنشاء *أثر* النموذج.
|
||||
|
||||
يتم إنشاء التتبع بالنسبة لأبعاد المُدخلات. وبالتالي، فهو مُقيّد بأبعاد المُدخلات الوهمية، ولن يعمل لأي طول تسلسل أو حجم دفعة مختلف. عند المحاولة بحجم مختلف، يتم رفع الخطأ التالي:
|
||||
|
||||
```
|
||||
`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
|
||||
```
|
||||
|
||||
نوصي بتتبع النموذج باستخدام حجم مُدخلات وهمية لا يقل عن أكبر مُدخل سيتم تقديمه للنموذج أثناء الاستدلال. يمكن أن تساعد الحشوة(padding) في ملء القيم المفقودة. ومع ذلك، نظرًا لتتبع النموذج بحجم مُدخل أكبر، ستكون أبعاد المصفوفة ستكون كبيرة أيضًا، مما يؤدي عنه المزيد من الحسابات.
|
||||
|
||||
انتبه إلى إجمالي عدد العمليات المُنفذة على كل مُدخل وتابع الأداء عن كثب عند تصدير نماذج متغيرة طول التسلسل.
|
||||
|
||||
## استخدام TorchScript في Python
|
||||
|
||||
يوضح هذا القسم كيفية حفظ النماذج وتحميلها، بالإضافة إلى كيفية استخدام التتبع للاستدلال.
|
||||
|
||||
### حفظ نموذج
|
||||
|
||||
لتصدير `BertModel` باستخدام TorchScript، قم بتهيئة ـ `BertModel` من فئة `BertConfig` ثم احفظه على القرص تحت اسم الملف `traced_bert.pt`:
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
|
||||
enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
|
||||
# Tokenizing input text
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = enc.tokenize(text)
|
||||
|
||||
# Masking one of the input tokens
|
||||
masked_index = 8
|
||||
tokenized_text[masked_index] = "[MASK]"
|
||||
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
|
||||
|
||||
# Creating a dummy input
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
dummy_input = [tokens_tensor, segments_tensors]
|
||||
|
||||
# Initializing the model with the torchscript flag
|
||||
# Flag set to True even though it is not necessary as this model does not have an LM Head.
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=32000,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
torchscript=True,
|
||||
)
|
||||
|
||||
# Instantiating the model
|
||||
model = BertModel(config)
|
||||
|
||||
# The model needs to be in evaluation mode
|
||||
model.eval()
|
||||
|
||||
# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
|
||||
model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
|
||||
|
||||
# Creating the trace
|
||||
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
torch.jit.save(traced_model, "traced_bert.pt")
|
||||
```
|
||||
|
||||
### تحميل نموذج
|
||||
|
||||
يمكنك الآن تحميل `BertModel` المُحفظ سابقًا، `traced_bert.pt`، من القرص واستخدامه على `dummy_input` المُهيأ سابقًا:
|
||||
|
||||
```python
|
||||
loaded_model = torch.jit.load("traced_bert.pt")
|
||||
loaded_model.eval()
|
||||
|
||||
all_encoder_layers, pooled_output = loaded_model(*dummy_input)
|
||||
```
|
||||
|
||||
### استخدام نموذج مُتتبع للاستدلال
|
||||
|
||||
استخدم النموذج المُتتبع للاستدلال باستخدام أسلوب `__call__` الخاص به:
|
||||
|
||||
```python
|
||||
traced_model(tokens_tensor, segments_tensors)
|
||||
```
|
||||
|
||||
## نشر نماذج Hugging Face TorchScript على AWS باستخدام Neuron SDK
|
||||
|
||||
قدمت AWS عائلة [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) من اﻷجهزة لخفض التكلفة وأداء التعلم الآلي عالي الأداء في البيئة السحابية. تعمل أجهزة Inf1 بواسطة شريحة Inferentia من AWS، وهي مُسرّع أجهزة مُخصص، متخصص في أعباء عمل الاستدلال للتعلم العميق. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) هي SDK لـ Inferentia التي تدعم تتبع نماذج المحولات وتحسينها للنشر على Inf1. توفر Neuron SDK ما يلي:
|
||||
|
||||
1. واجهة برمجة تطبيقات سهلة الاستخدام مع تغيير سطر واحد من التعليمات البرمجية لتتبع نموذج TorchScript وتحسينه للاستدلال في البيئة السحابية.
|
||||
2. تحسينات الأداء الجاهزة للاستخدام [تحسين التكلفة والأداء](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>).
|
||||
3. دعم نماذج Hugging Face المحولات المبنية باستخدام إما [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) أو [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html).
|
||||
|
||||
### الآثار المترتبة
|
||||
|
||||
تعمل نماذج المحولات المستندة إلى بنية [BERT (تمثيلات الترميز ثنائية الاتجاه من المحولات)](https://huggingface.co/docs/transformers/main/model_doc/bert) أو متغيراتها مثل [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) و [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) بشكل أفضل على Inf1 للمهام غير التوليدية مثل الإجابة على الأسئلة الاستخراجية، وتصنيف التسلسلات، وتصنيف الرموز (tokens). ومع ذلك، يمكن تكييف مهام توليد النصوص للعمل على Inf1 وفقًا لهذا [برنامج تعليمي AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). يمكن العثور على مزيد من المعلومات حول النماذج التي يمكن تحويلها جاهزة على Inferentia في قسم [ملاءمة بنية النموذج](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) من وثائق Neuron.
|
||||
|
||||
### التبعيات (Dependencies)
|
||||
|
||||
يتطلب استخدام AWS Neuron لتحويل النماذج [بيئة SDK Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) والتي تأتي مسبقًا على [AMI للتعلم العميق من AWS](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html).
|
||||
|
||||
### تحويل نموذج لـ AWS Neuron
|
||||
|
||||
قم بتحويل نموذج لـ AWS NEURON باستخدام نفس التعليمات البرمجية من [استخدام TorchScript في Python](torchscript#using-torchscript-in-python) لتتبع `BertModel`. قم باستيراد امتداد إطار عمل `torch.neuron` للوصول إلى مكونات Neuron SDK من خلال واجهة برمجة تطبيقات Python:
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
import torch.neuron
|
||||
```
|
||||
|
||||
كل ما عليك فعله هو تعديل السطر التالي:
|
||||
|
||||
```diff
|
||||
- torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
+ torch.neuron.trace(model, [token_tensor, segments_tensors])
|
||||
```
|
||||
|
||||
يتيح ذلك لـ Neuron SDK تتبع النموذج وتحسينه لمثيلات Inf1.
|
||||
|
||||
لمعرفة المزيد حول ميزات AWS Neuron SDK والأدوات ودروس البرامج التعليمية والتحديثات الأخيرة، يرجى الاطلاع على [وثائق AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html).
|
||||
@ -88,8 +88,6 @@
|
||||
title: Tool use
|
||||
- local: chat_templating_writing
|
||||
title: Writing a chat template
|
||||
- local: chat_response_parsing
|
||||
title: Response parsing
|
||||
title: Chat with models
|
||||
- sections:
|
||||
- local: serving
|
||||
@ -119,8 +117,6 @@
|
||||
title: Tools
|
||||
- local: transformers_as_backend
|
||||
title: Inference server backends
|
||||
- local: continuous_batching
|
||||
title: Continuous Batching
|
||||
title: Inference
|
||||
- isExpanded: false
|
||||
sections:
|
||||
@ -231,6 +227,8 @@
|
||||
title: ONNX
|
||||
- local: executorch
|
||||
title: ExecuTorch
|
||||
- local: torchscript
|
||||
title: TorchScript
|
||||
title: Export to production
|
||||
- isExpanded: false
|
||||
sections:
|
||||
@ -1257,8 +1255,6 @@
|
||||
title: Importing Utilities
|
||||
- local: internal/time_series_utils
|
||||
title: Utilities for Time Series
|
||||
- local: internal/rope_utils
|
||||
title: Rotary Embeddings Utilities
|
||||
title: Internal helpers
|
||||
- sections:
|
||||
- local: reference/environment_variables
|
||||
|
||||
@ -95,12 +95,9 @@ print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
|
||||
|
||||
The chat model called the `get_current_temperature` tool with the correct parameters from the docstring. It inferred France as the location based on Paris, and that it should use Celsius for the units of temperature.
|
||||
|
||||
A model **cannot actually call the tool itself**. It requests a tool call, and it's your job to handle the call and append it and the result to the chat history. For
|
||||
models that support [response parsing](./chat_response_parsing), the response parsing will be handled automatically, and you can just use
|
||||
[`~PreTrainedTokenizer.parse_response] to extract the tool call. For other models, you'll need to manually translate the output
|
||||
string into a tool call dict.
|
||||
A model **cannot actually call the tool itself**. It requests a tool call, and it's your job to handle the call and append it and the result to the chat history.
|
||||
|
||||
Regardless of the approach you use, the tool call should go in the `tool_calls` key of an `assistant` message. This is the recommended API, and should be supported by the chat template of most tool-using models.
|
||||
Hold the call in the `tool_calls` key of an `assistant` message. This is the recommended API, and should be supported by the chat template of most tool-using models.
|
||||
|
||||
> [!WARNING]
|
||||
> Although `tool_calls` is similar to the OpenAI API, the OpenAI API uses a JSON string as its `tool_calls` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict.
|
||||
|
||||
@ -1,233 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Response Parsing
|
||||
|
||||
It is increasingly common for chat models to generate structured outputs, rather than just a single reply string.
|
||||
The most common uses for structured outputs are [tool calling](./chat_extras) and [reasoning models](https://huggingface.co/reasoning-course).
|
||||
Tool calling models can output tool calls, containing the name of the tool to call and any arguments to be passed to it,
|
||||
while reasoning models often output reasoning steps as a "chain of thought". Some recent models even use both of these,
|
||||
and may output reasoning and/or one or more tool calls before their final answer.
|
||||
|
||||
Models with structured outputs pose a challenge for chat templating, because the output needs to be parsed before it
|
||||
can be appended to the chat. For a concrete example, let's say we ask [GPT-OSS](https://huggingface.co/openai/gpt-oss-120b)
|
||||
what the weather is like, and it thinks and decides to call a tool. Here's what the raw model output might look like:
|
||||
|
||||
```txt
|
||||
<|start|>analysis<|message|>The user asks: "What is the weather like in SF?" We need to get the location of the user? The user explicitly asks about SF (San Francisco).
|
||||
So we need to get the current weather in San Francisco, CA. We need to call get_current_weather function. But we need to call function to get weather data.
|
||||
So we should call get_current_weather with location "San Francisco, CA". Let's do that.
|
||||
We will call function get_current_weather.<|end|><|start|>commentary to=functions.get_current_weather<|channel|>commentary <|constrain|>json<|message|>{"location":"San Francisco, CA"}<|call|>
|
||||
}
|
||||
```
|
||||
|
||||
But if you want to append this to a chat, you'll need to format it as a chat message dict, like this:
|
||||
|
||||
```json
|
||||
{
|
||||
"role": "assistant",
|
||||
"thinking": "The user asks: \"What is the weather like in SF?\" We need to get the location of the user? The user explicitly asks about SF (San Francisco). So we need to get the current weather in San Francisco, CA. We need to call get_current_weather function. But we need to call function to get weather data. So we should call get_current_weather with location \"San Francisco, CA\". Let's do that.",
|
||||
"tool_calls": [
|
||||
{
|
||||
"name": "get_current_weather",
|
||||
"arguments": {
|
||||
"location": "San Francisco, CA"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Chat **templates** give us a way to turn messages into formatted input for a model, but we need something else to
|
||||
parse model output back into a standard message dict. This is what chat **parsing** is for.
|
||||
|
||||
## The [parse_response](~PreTrainedTokenizerBase.parse_response) method
|
||||
|
||||
Parsing a chat response on a model that supports it is straightforward. Simply take the raw, decoded output from
|
||||
[generate](`~generation.GenerationMixin.generate`), and pass it to the tokenizer's [parse_response](~PreTrainedTokenizerBase.parse_response) method:
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
checkpoint = "HuggingFaceTB/SmolLM3-3B"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
||||
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype="auto", device_map="auto")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hey! Can you summarize the end of the Cold War as briefly as possible? Like, comically briefly. It should really leave out almost most of the relevant information."
|
||||
}
|
||||
]
|
||||
|
||||
input_ids = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_tensors="pt"
|
||||
).to(model.device)
|
||||
|
||||
outputs = model.generate(input_ids, max_new_tokens=1024)[0, input_ids.shape[1]:]
|
||||
out_text = tokenizer.decode(outputs)
|
||||
parsed = tokenizer.parse_response(out_text)
|
||||
print(parsed.keys())
|
||||
```
|
||||
|
||||
And you should get:
|
||||
|
||||
```text
|
||||
dict_keys(['thinking', 'content'])
|
||||
```
|
||||
|
||||
And that's all you need to start using response parsing! `parse_response` should return a complete message dict that is ready to be appended to the chat history.
|
||||
When the tokenizer does not support response parsing, `parse_response` will throw an error. We hope to add support
|
||||
to more tokenizers over time.
|
||||
|
||||
## Developers: Understanding a simple response schema
|
||||
|
||||
Under the hood, `parse_response` uses a **JSON schema** to parse the model output. A JSON schema represents
|
||||
the structure of the output message dict. The schema is augmented with additional fields that indicate how the
|
||||
output message string should be parsed into the expected format. Let's take a look at the schema for a SmolLM response,
|
||||
excluding tool calls for now:
|
||||
|
||||
```python
|
||||
{
|
||||
"x-regex": "(?:<think>\n?(?P<thinking>.+?)\n?</think>)?\s*(?P<content>.+?)?\s*(?:<\|im_end\|>|$)",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"role": {"const": "assistant"},
|
||||
"content": {"type": "string"},
|
||||
"thinking": {"type": "string"}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
We can see that the schema describes a JSON "object" (a `dict`, in other words) with three keys: `role`, `content`, and `thinking`.
|
||||
Because all assistant responses have the role "assistant", the `role` key is a `const`(ant). The other two keys are strings, extracted
|
||||
from the named groups in the regex in the `x-regex` field.
|
||||
|
||||
Like chat templates, response schemas are set as a property of the tokenizer. To enable response parsing, all you need
|
||||
to do is set `tokenizer.response_schema` to a valid schema dict, and `tokenizer.parse_response()` will work! Again, like
|
||||
chat templates, this schema will be saved with the processor, so once you set it, you can use `save_pretrained()` or `push_to_hub()` to
|
||||
save and share the schema.
|
||||
|
||||
## Developers: Complex schemas
|
||||
|
||||
Now, let's look at a more complex schema, which includes tool calls, to gain more of an understanding of the parser
|
||||
internals. For this, we'll use the `GPT-OSS` schema. GPT-OSS emits both tool calls and thinking blocks, and it uses
|
||||
an unusual format where model responses are tagged with one of three "channels": `commentary` for things like
|
||||
tool calls, `analysis` for chain of thought blocks, and `final` for messages intended to be sent to the user.
|
||||
A full message where the model calls a tool named `get_current_weather` might look like this, with some extra linebreaks added for clarity:
|
||||
|
||||
```text
|
||||
<|channel|>analysis<|message|>
|
||||
The user asks: "What is the weather like in SF?" So we need to get the current weather in San Francisco, CA.
|
||||
We need to call get_current_weather function. So we should call get_current_weather with location "San Francisco, CA".
|
||||
<|end|>
|
||||
<|start|>assistant<|channel|>commentary
|
||||
to=functions.get_current_weather <|constrain|>json<|message|>
|
||||
{
|
||||
"location": "San Francisco, CA"
|
||||
}
|
||||
<|call|>
|
||||
```
|
||||
|
||||
Parsing proceeds recursively; the output of a regex (or other parser) at one level becomes the input to the nodes below it.
|
||||
In other words, don't feel like you have to parse the entire output in one enormous regex! Instead, start with the schema,
|
||||
and then add regexes to extract the relevant chunks as you go. Here's a schema that will parse it, with some
|
||||
explanatory comments:
|
||||
|
||||
```python
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"role": {"const": "assistant"},
|
||||
# "content" and "thinking" are both similar to the previous example, and just extract a single string
|
||||
# However, rather than using a single regex with named groups to extract both, we use a regex in each subkey.
|
||||
# When an object node has no parser/regex, the entire input string is passed to all of its children, so
|
||||
# parsing can either be done with named groups at the object level, or with separate regexes at the property level.
|
||||
"content": {"type": "string", "x-regex": r"<\|channel\|>final<\|message\|>(.*?)(?:<\|end\|>|$)"},
|
||||
"thinking": {"type": "string", "x-regex": r"<\|channel\|>analysis<\|message\|>(.*?)<\|end\|>"},
|
||||
"tool_calls": {
|
||||
# "x-regex-iterator" uses re.findall to find multiple possible manages, and returns them as an
|
||||
# array/list. You don't need to worry about array handling, though - each item in the array will be
|
||||
# parsed by the `items` schema, so just write the schema for a single item.
|
||||
"x-regex-iterator": r"<\|channel\|>commentary (to=functions\..*?<\|message\|>.*?)(?:<\|call\|>|$)",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
# A const property is a fixed value, and the input has no effect on it.
|
||||
"type": {"const": "function"},
|
||||
# Here, we wrap the entire tool call dict in a `{"function": ...}` block. The input string is passed through to it unchanged.
|
||||
"function": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string", "x-regex": r"^to=functions\.(\w+)"},
|
||||
"arguments": {
|
||||
"type": "object",
|
||||
"x-regex": "<\|message\|>(.*)",
|
||||
# The "x-parser" field indicates that the extracted string should be parsed as JSON.
|
||||
# The output is then passed to the schema nodes below and recursive parsing continues.
|
||||
"x-parser": "json",
|
||||
"additionalProperties": {"type": "any"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
## Developers: Understanding the parser logic
|
||||
|
||||
The parser follows a few simple rules:
|
||||
|
||||
1. Each level of the schema receives input from the level above, applies any regex or parser it has, and then passes the output to its children.
|
||||
2. The root level receives the entire decoded model output string as input.
|
||||
3. If a node has structured content after parsing (for example, if the regex has named groups and returns a dict, or if the parser returns a dict or list),
|
||||
then that structured content is mapped to the node's children, and each child node receives its corresponding value as input.
|
||||
4. If an `object` (dict) node has unstructured (string) output, then the entire string is passed to all of its children. This allows child nodes
|
||||
to handle parsing individually rather than requiring a single parent regex to extract all keys at once.
|
||||
5. If an `array` (list) node has unstructured (string) output, then this throws an error.
|
||||
|
||||
There is a small set of allowable `x-` keys that indicate how parsing should be done at each node:
|
||||
- `x-regex`: A regex string to apply to the input. If the regex has named groups, the output is a dict of group names to values. Named groups should only be used in `object` nodes.
|
||||
Otherwise, the regex must have exactly one unnamed capturing group, and the output is the value of that group as a string.
|
||||
- `x-regex-iterator`: A regex string to apply to the input using `re.findall()`. The output is a list of all matches.
|
||||
This should only be used in `array` nodes, and the regex must have exactly one unnamed capturing group. The output is distributed to
|
||||
the node's `items` schema.
|
||||
- `x-parser`: Calls a built-in parser to apply to the input. Currently, the only supported parser is `json`, which parses the input string as JSON.
|
||||
The output is passed to the child nodes for further parsing. Note that the `json` parser can return deeply nested output - in this case, the output
|
||||
will be progressively unwrapped as it is passed through child nodes. The child nodes do not need additional `x-parser` or `x-regex` fields in this case,
|
||||
but their structure must match the structure of the parsed JSON.
|
||||
- `x-parser-args`: Only allowed in conjunction with `x-parser`. This is a dict of additional arguments that control parsing. Right now, the only supported
|
||||
argument is `transform`, which specifies a `jmespath` transformation to apply to the output. This is useful when the JSON parser returns a structure
|
||||
that needs to be modified to match the schema.
|
||||
- `x-regex-key-value`: This is rarely necessary, but it can be useful when parsing key-value pairs in non-JSON format where the names of the keys are not known
|
||||
in advance, such as when a model emits XML tool calls with arbitrary argument names. The regex must have exactly two named capturing groups,
|
||||
`key` and `value`, and the output is a dict mapping keys to values. This should only be used in `object` nodes.
|
||||
|
||||
In general, multiple regexes/parsers cannot be combined at the same level. The exception is that `x-regex`, returning a single string, can be combined with the other parsers. In this case,
|
||||
`x-regex` is applied first, and then the output is passed to the other parser, either `x-regex-iterator`, `x-parser`, or `x-regex-key-value`.
|
||||
|
||||
Putting these ideas together, you can see that the input flows through the schema, being parsed at each level and then distributed to child nodes. Each level
|
||||
only needs to extract the input content that is relevant for that part of the schema, and can then let its child nodes handle the rest. Internally, this is handled
|
||||
with a parser function that receives input, applies any regexes/parsers at the current level, then maps the result to its child nodes before recursively calling itself on each of them.
|
||||
Recursion terminates when it reaches leaf nodes, usually primitive types like `string` or `number`, which simply return the input they receive.
|
||||
@ -1,194 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Continuous Batching
|
||||
|
||||
Continuous Batching (CB) is an advanced technique to optimize the inference of transformer models by dynamically grouping multiple requests into batches. This approach maximizes GPU utilization and throughput, specifically for workloads with many variable-length inputs.
|
||||
|
||||
We are particularly interested in having Continuous Batching in transformers for the following use cases:
|
||||
- Evaluation of models on large datasets with variable-length inputs
|
||||
- Generating outputs for multiple sequences for GRPO policies
|
||||
|
||||
CB is what makes inference engines like vLLM or SGLang efficient. That being said, transformers does not aim to be a production-ready inference engine, but a complete framework for model development. For this reason, CB is available in `transformers serve`.
|
||||
|
||||
If you are not familiar with some of the core concepts CB is built upon, we invite you to read the associated blog post: [Continuous Batching: Efficient Inference for Large Language Models](https://huggingface.co/blog/continuous-batching). _broken link for now_
|
||||
|
||||
## API Reference
|
||||
|
||||
## Usage Examples
|
||||
|
||||
The main way to use CB in transformers is via the `generate_batch` method.
|
||||
|
||||
Unlike `generate`, CB takes already tokenized inputs, known as input IDs. Each sequence of input IDs is represented as a list of integers, in python: `list[int]`. Since
|
||||
|
||||
For a more detailed example, please refer to: [examples/continuous_batching](./path/to/example)
|
||||
|
||||
### `generate_batch` example
|
||||
|
||||
We have created a `ContinuousMixin` that is inherited by the `GenerationMixin` so that all auto regressive text models support CB.
|
||||
|
||||
This adds the `generate_batch` method to all models that inherit from `GenerationMixin`.
|
||||
|
||||
You can use it as follows:
|
||||
|
||||
```py
|
||||
import datasets
|
||||
import torch
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from transformers.generation import GenerationConfig
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"Qwen/Qwen3-4B-Instruct-2507",
|
||||
attn_implementation="spda_paged",
|
||||
device_map="cuda", # if you need cuda
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
|
||||
|
||||
# prepare a batch of inputs
|
||||
dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
|
||||
dataset = dataset.select(range(args.samples))
|
||||
tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
|
||||
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
|
||||
|
||||
generation_config = GenerationConfig(
|
||||
max_new_tokens=32,
|
||||
use_cuda_graph=False, # Not supported for simple version
|
||||
eos_token_id=tokenizer.eos_token_id,
|
||||
pad_token_id=tokenizer.pad_token_id,
|
||||
do_sample=False,
|
||||
max_batch_tokens=512, # max number of tokens in a batch, this is just a default value you should tune based on your hardware
|
||||
)
|
||||
|
||||
batch_outputs = model.generate_batch(
|
||||
inputs=simple_batch_inputs,
|
||||
generation_config=generation_config,
|
||||
)
|
||||
|
||||
for request_id, output in batch_outputs.items():
|
||||
generated_text = tokenizer.decode(output.generated_tokens, skip_special_tokens=True)
|
||||
print(f"Request {request_id} output: {generated_text}")
|
||||
```
|
||||
|
||||
### `ContinuousBatchingManager` example
|
||||
|
||||
If you want more control w.r.t. how you want to schedule requests using CB, you can use the `ContinuousBatchingManager` class directly.
|
||||
|
||||
This is what we use in `transformers serve` because requests arrive asynchronously and we can leverage the asynchronous nature of the CB process to make things more efficient.
|
||||
|
||||
Under the hood, the `ContinuousBatchingManager` creates a background thread that receives inputs from a python `queue.Queue` which it uses to get requests to batch in each forward pass.
|
||||
|
||||
Note that the manager is thread safe!
|
||||
|
||||
```py
|
||||
import datasets
|
||||
import torch
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from transformers.generation import GenerationConfig
|
||||
from transformers.generation.continuous_batching import RequestStatus
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"Qwen/Qwen3-4B-Instruct-2507",
|
||||
attn_implementation="spda_paged",
|
||||
device_map="cuda", # if you need cuda
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
|
||||
|
||||
# prepare a batch of inputs
|
||||
dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
|
||||
dataset = dataset.select(range(args.samples))
|
||||
tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
|
||||
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
|
||||
|
||||
# initialize the manager, available method thanks to the `ContinuousMixin`
|
||||
manager = model.init_continuous_batching(generation_config=generation_config)
|
||||
|
||||
# start the background thread
|
||||
manager.start()
|
||||
|
||||
# this is for demonstration purposes only, in practice this is most useful to do concurrently
|
||||
for i, input in enumerate(simple_batch_inputs):
|
||||
request_id = manager.add_request(input_ids=input, request_id=f"request_{i}") # if you do not specify a request_id, one will be generated for you
|
||||
|
||||
# Can be done in an other thread
|
||||
for id, request in manager.get_result():
|
||||
generated_text = tokenizer.decode(request.generated_tokens, skip_special_tokens=True)
|
||||
print(f"Request {id} output: {generated_text}")
|
||||
|
||||
# you can also get results for a specific request id
|
||||
result = manager.get_result(request_id="request_5") # this is blocking and will wait for the result to be ready
|
||||
|
||||
# or get results for a request that is streaming
|
||||
manager.add_request(
|
||||
input_ids=input,
|
||||
request_id="streaming_request",
|
||||
stream=True,
|
||||
)
|
||||
for chunk in manager.request_id_iter(request_id="streaming_request"):
|
||||
generated_text = tokenizer.decode(chunk.generated_tokens, skip_special_tokens=True)
|
||||
print(generated_text)
|
||||
# FIXME: stop iteration in `request_id_iter` when finished instead of doing it externally
|
||||
if chunk.status == RequestStatus.FINISHED:
|
||||
break
|
||||
|
||||
# stop the background thread before exiting the process
|
||||
manager.stop()
|
||||
```
|
||||
|
||||
## Supported & Unsupported Features
|
||||
|
||||
### Supported Features
|
||||
|
||||
- Dynamic scheduling of variable-length requests
|
||||
- Chunked prefill
|
||||
- Paged Attention Cache
|
||||
- Sliding window attention
|
||||
- Chat templates
|
||||
|
||||
### Unsupported Features
|
||||
|
||||
At the moment, the following features are not supported with CB. We plan to add support to the following:
|
||||
|
||||
- Prefix caching
|
||||
- Beam search
|
||||
- tool calling
|
||||
|
||||
The others are unplanned, but depending on community requests we might consider adding them:
|
||||
|
||||
- MTP (multi token prediction)
|
||||
- Medusa
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
|
||||
## Integration with Serving
|
||||
|
||||
You can use CB in `transformers serve` by passing the `--continuous-batching` flag when starting the server.
|
||||
|
||||
## Monitoring
|
||||
|
||||
We have added `opentelemetry` support to Continuous Batching to help you monitor its performance in production. To enable it, you need to install the `opentelemetry` extra when installing `transformers`:
|
||||
|
||||
```sh
|
||||
# this installs `opentelemetry-api`, `opentelemetry-sdk` and `opentelemetry-exporter-otlp`
|
||||
pip install transformers[open-telemetry]
|
||||
```
|
||||
|
||||
This will enable traces and metrics collection in CB. You will then have to setup the backend to collect and visualize the traces and metrics.
|
||||
|
||||
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
[ExecuTorch](https://pytorch.org/executorch/stable/index.html) runs PyTorch models on mobile and edge devices. Export your Transformers models to the ExecuTorch format with [Optimum ExecuTorch](https://github.com/huggingface/optimum-executorch) with the command below.
|
||||
|
||||
```bash
|
||||
```
|
||||
optimum-cli export executorch \
|
||||
--model "HuggingFaceTB/SmolLM2-135M-Instruct" \
|
||||
--task "text-generation" \
|
||||
@ -29,5 +29,4 @@ optimum-cli export executorch \
|
||||
--qembedding 8w \
|
||||
--output_dir="hf_smollm2"
|
||||
```
|
||||
|
||||
Run `optimum-cli export executorch --help` to see all export options. For detailed export instructions, check the [README](optimum/exporters/executorch/README.md).
|
||||
|
||||
@ -320,7 +320,7 @@ df.sort_values(by=['skipped_proportion'], ascending=False)
|
||||
You can focus on a specific test method using `--test_method_name`:
|
||||
|
||||
```bash
|
||||
python utils/scan_skipped_tests.py --test_method_name test_inputs_embeds --output_dir path/to/output
|
||||
$ python utils/scan_skipped_tests.py --test_method_name test_inputs_embeds --output_dir path/to/output
|
||||
```
|
||||
|
||||
- `--test_method_name`: Name of the test method to scan (e.g., `test_inputs_embeds`).
|
||||
|
||||
@ -1,83 +0,0 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Utilities for Rotary Embedding
|
||||
|
||||
This page explains how the Rotary Embedding is computed and applied in Transformers and what types of RoPE are supported.
|
||||
|
||||
## Overview
|
||||
|
||||
Rotary Position Embeddings are a technique used to inject positional information into attention mechanisms without relying on explicit position encodings.
|
||||
Instead of adding position vectors to token embeddings, RoPE rotates query and key vectors in the complex plane according to their positions enabling relative positional awareness and better extrapolation to unseen sequence lengths.
|
||||
|
||||
The Transformers library provides a flexible and extensible implementation of various RoPE types defined in `[`~modeling_rope_utils.ROPE_VALIDATION_FUNCTIONS`]`, including both the default and scaled variants:
|
||||
|
||||
| Rope Type | Description |
|
||||
|------------|-------------|
|
||||
| `"default"` | Standard rotary embedding as in LLaMA. |
|
||||
| `"linear"` | Linear-scaled RoPE which allows longer context windows. |
|
||||
| `"dynamic"` | NTK-aware scaling computed by rescaling frequency base (`θ`) for longer context. |
|
||||
| `"yarn"` | YaRN scaling variant providing smoother extrapolation and stability. |
|
||||
| `"longrope"` | [LongRoPE](https://github.com/microsoft/LongRoPE) scaling as in Phi-2 model series. |
|
||||
| `"llama3"` | RoPE scaling as in Llama3.1. |
|
||||
|
||||
## Configuration in Model Configs
|
||||
|
||||
To enable and customize rotary embeddings, add a `rope_parameters` field to your model’s configuration file (`config.json`). This field controls the RoPE behavior across model layers. Note that each RoPE variant defines its own set of expected keys and missing keys will raise an error. See the example below which creates a llama config with default RoPE parameters:
|
||||
|
||||
```python
|
||||
from transformers import LlamaConfig
|
||||
|
||||
config = LlamaConfig()
|
||||
config.rope_parameters = {
|
||||
"rope_type": "default", # type of RoPE to use
|
||||
"rope_theta": 10000.0 # base frequency parameter
|
||||
}
|
||||
|
||||
# If we want to apply a scaled RoPE type, we need to pass extra parameters
|
||||
config.rope_parameters = {
|
||||
"rope_type": "linear",
|
||||
"rope_theta": 10000.0,
|
||||
"factor": 8.0 # scale factor for context extension
|
||||
}
|
||||
```
|
||||
|
||||
## Per-Layer-Type RoPE Configuration
|
||||
|
||||
Some models such as Gemma-3 use different layer types with different attention mechanisms, i.e. "full attention" in some blocks and "sliding-window attention" in others. Transformers supports specifying distinct RoPE parameters per layer type for these models. In this case, `rope_parameters` should be a nested dictionary, where top-level keys correspond to `config.layer_types` and values are per-type RoPE parameters. During model initialization, each decoder layer will automatically look up the matching RoPE configuration based on its declared layer type.
|
||||
|
||||
```python
|
||||
from transformers import Gemma3Config
|
||||
|
||||
config = Gemma3Config()
|
||||
config.rope_parameters = {
|
||||
"full_attention": {
|
||||
"rope_type": "dynamic",
|
||||
"rope_theta": 1000000.0,
|
||||
"factor": 8.0,
|
||||
"original_max_position_embeddings": 8096,
|
||||
},
|
||||
"sliding_attention": {
|
||||
"rope_type": "default",
|
||||
"rope_theta": 10000.0,
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Utilities
|
||||
|
||||
[[autodoc]] RopeParameters
|
||||
- __call__
|
||||
@ -1,3 +1,3 @@
|
||||
# Overview
|
||||
|
||||
Kernels in transformers are used to optimize the performance of models with custom layers from the hub and very low effort.
|
||||
Kernels in transformers are used to optimize the performance of models with custom layers from the hub and very low effort.
|
||||
@ -393,9 +393,3 @@ model = AutoModelForCausalLM.from_pretrained(
|
||||
"mistralai/Mistral-7B-v0.1", quantization_config=quant_config, device_map="auto"
|
||||
)
|
||||
```
|
||||
|
||||
## Continuous Batching
|
||||
|
||||
When serving LLMs for inference, you may have multiple requests arriving at different times. Continuous Batching (CB) is a technique that groups incoming requests into batches to maximize GPU utilization and throughput.
|
||||
|
||||
See the [Continuous Batching](./continuous_batching) guide for more details on how to use CB in transformers.
|
||||
|
||||
@ -67,6 +67,6 @@ Examples of use can be found in the [example scripts](../examples) or [example n
|
||||
|
||||
[[autodoc]] data.data_collator.DataCollatorWithFlattening
|
||||
|
||||
## DataCollatorForMultipleChoice
|
||||
# DataCollatorForMultipleChoice
|
||||
|
||||
[[autodoc]] data.data_collator.DataCollatorForMultipleChoice
|
||||
|
||||
@ -267,7 +267,6 @@ about how many forward passes you inputs are actually going to trigger, you can
|
||||
independently of the inputs. The caveats from the previous section still apply.
|
||||
|
||||
## Pipeline FP16 inference
|
||||
|
||||
Models can be run in FP16 which can be significantly faster on GPU while saving memory. Most models will not suffer noticeable performance loss from this. The larger the model, the less likely that it will.
|
||||
|
||||
To enable FP16 inference, you can simply pass `dtype=torch.float16` or `dtype='float16'` to the pipeline constructor. Note that this only works for models with a PyTorch backend. Your inputs will be converted to FP16 internally.
|
||||
@ -335,7 +334,6 @@ Pipelines available for audio tasks include the following.
|
||||
Pipelines available for computer vision tasks include the following.
|
||||
|
||||
### DepthEstimationPipeline
|
||||
|
||||
[[autodoc]] DepthEstimationPipeline
|
||||
- __call__
|
||||
- all
|
||||
|
||||
@ -43,7 +43,6 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
|
||||
[[autodoc]] AwqConfig
|
||||
|
||||
## EetqConfig
|
||||
|
||||
[[autodoc]] EetqConfig
|
||||
|
||||
## GPTQConfig
|
||||
|
||||
@ -50,14 +50,14 @@ several advanced alignment methods which can be used to map between the original
|
||||
token space (e.g., getting the index of the token comprising a given character or the span of characters corresponding
|
||||
to a given token).
|
||||
|
||||
## Multimodal Tokenizer
|
||||
# Multimodal Tokenizer
|
||||
|
||||
Apart from that each tokenizer can be a "multimodal" tokenizer which means that the tokenizer will hold all relevant special tokens
|
||||
as part of tokenizer attributes for easier access. For example, if the tokenizer is loaded from a vision-language model like LLaVA, you will
|
||||
be able to access `tokenizer.image_token_id` to obtain the special image token used as a placeholder.
|
||||
|
||||
To enable extra special tokens for any type of tokenizer, you have to add the following lines and save the tokenizer. Extra special tokens do not
|
||||
have to be modality related and can be anything that the model often needs access to. In the below code, tokenizer at `output_dir` will have direct access
|
||||
have to be modality related and can ne anything that the model often needs access to. In the below code, tokenizer at `output_dir` will have direct access
|
||||
to three more special tokens.
|
||||
|
||||
```python
|
||||
|
||||
@ -23,7 +23,6 @@ The video processor extends the functionality of image processors by allowing Vi
|
||||
When adding a new VLM or updating an existing one to enable distinct video preprocessing, saving and reloading the processor configuration will store the video related arguments in a dedicated file named `video_preprocessing_config.json`. Don't worry if you haven't updated your VLM, the processor will try to load video related configurations from a file named `preprocessing_config.json`.
|
||||
|
||||
### Usage Example
|
||||
|
||||
Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:
|
||||
|
||||
```python
|
||||
|
||||
@ -158,24 +158,6 @@ print("Retrieval scores (query x image):")
|
||||
print(scores)
|
||||
```
|
||||
|
||||
You can also use checkpoints for `ColQwen2.5` that are **compatible with the ColQwen2 architecture**. This version of the model uses [Qwen2_5_VL](./qwen2_5_vl) as the backbone.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import ColQwen2ForRetrieval, ColQwen2Processor
|
||||
from transformers.utils.import_utils import is_flash_attn_2_available
|
||||
|
||||
model_name = "Sahil-Kabir/colqwen2.5-v0.2-hf" # An existing compatible checkpoint
|
||||
|
||||
model = ColQwen2ForRetrieval.from_pretrained(
|
||||
model_name,
|
||||
dtype=torch.bfloat16,
|
||||
device_map="auto",
|
||||
attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa"
|
||||
)
|
||||
processor = ColQwen2Processor.from_pretrained(model_name)
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- [`~ColQwen2Processor.score_retrieval`] returns a 2D tensor where the first dimension is the number of queries and the second dimension is the number of images. A higher score indicates more similarity between the query and image.
|
||||
|
||||
@ -31,7 +31,7 @@ This model was contributed by [Connor Henderson](https://huggingface.co/connor-h
|
||||
|
||||
FastSpeech2's general structure with a Mel-spectrogram decoder was implemented, and the traditional transformer blocks were replaced with conformer blocks as done in the ESPnet library.
|
||||
|
||||
### FastSpeech2 Model Architecture
|
||||
#### FastSpeech2 Model Architecture
|
||||
|
||||

|
||||
|
||||
|
||||
@ -70,8 +70,8 @@ from transformers import AutoProcessor, Florence2ForConditionalGeneration
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
|
||||
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
|
||||
|
||||
model = Florence2ForConditionalGeneration.from_pretrained("florence-community/Florence-2-base", dtype=torch.bfloat16, device_map="auto")
|
||||
processor = AutoProcessor.from_pretrained("florence-community/Florence-2-base")
|
||||
model = Florence2ForConditionalGeneration.from_pretrained("microsoft/Florence-2-base", dtype=torch.bfloat16, device_map="auto")
|
||||
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base")
|
||||
|
||||
task_prompt = "<OD>"
|
||||
inputs = processor(text=task_prompt, images=image, return_tensors="pt").to(model.device)
|
||||
@ -105,12 +105,12 @@ from transformers import AutoProcessor, Florence2ForConditionalGeneration, BitsA
|
||||
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
|
||||
|
||||
model = Florence2ForConditionalGeneration.from_pretrained(
|
||||
"florence-community/Florence-2-base",
|
||||
"microsoft/Florence-2-large",
|
||||
dtype=torch.bfloat16,
|
||||
device_map="auto",
|
||||
quantization_config=quantization_config
|
||||
)
|
||||
processor = AutoProcessor.from_pretrained("florence-community/Florence-2-base")
|
||||
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large")
|
||||
|
||||
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
|
||||
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
|
||||
|
||||
@ -75,11 +75,11 @@ A processor requires an image_processor and a tokenizer. Hence, inputs can be lo
|
||||
from PIL import Image
|
||||
from transformers import AutoTokenizer
|
||||
from transformers.models.fuyu.processing_fuyu import FuyuProcessor
|
||||
from transformers.models.fuyu.image_processing_fuyu_fast import FuyuImageProcessorFast
|
||||
from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
|
||||
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained('adept-hf-collab/fuyu-8b')
|
||||
image_processor = FuyuImageProcessorFast()
|
||||
image_processor = FuyuImageProcessor()
|
||||
|
||||
|
||||
processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
|
||||
@ -118,11 +118,6 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.
|
||||
[[autodoc]] FuyuImageProcessor
|
||||
- __call__
|
||||
|
||||
## FuyuImageProcessor
|
||||
|
||||
[[autodoc]] FuyuImageProcessorFast
|
||||
- __call__
|
||||
|
||||
## FuyuProcessor
|
||||
|
||||
[[autodoc]] FuyuProcessor
|
||||
|
||||
@ -33,7 +33,7 @@ this model, including [Alternating Updates][altup] (AltUp), [Learned Augmented R
|
||||
[MatFormer][matformer], Per-Layer Embeddings (PLE), [Activation Sparsity with Statistical Top-k][spark-transformer], and KV cache sharing. The language model uses
|
||||
a similar attention pattern to [Gemma 3](./gemma3) with alternating 4 local sliding window self-attention layers for
|
||||
every global self-attention layer with a maximum context length of 32k tokens. Gemma 3n introduces
|
||||
MobileNet v5 as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
|
||||
[MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
|
||||
trained audio encoder based on the [Universal Speech Model][usm] (USM) architecture.
|
||||
|
||||
The instruction-tuned variant was post-trained with knowledge distillation and reinforcement learning.
|
||||
|
||||
@ -61,11 +61,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
|
||||
[[autodoc]] GLPNImageProcessor
|
||||
- preprocess
|
||||
|
||||
## GLPNImageProcessorFast
|
||||
|
||||
[[autodoc]] GLPNImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## GLPNModel
|
||||
|
||||
[[autodoc]] GLPNModel
|
||||
|
||||
@ -63,6 +63,11 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
|
||||
[[autodoc]] InstructBlipVideoVideoProcessor
|
||||
- preprocess
|
||||
|
||||
## InstructBlipVideoImageProcessor
|
||||
|
||||
[[autodoc]] InstructBlipVideoImageProcessor
|
||||
- preprocess
|
||||
|
||||
## InstructBlipVideoVisionModel
|
||||
|
||||
[[autodoc]] InstructBlipVideoVisionModel
|
||||
|
||||
@ -88,16 +88,16 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
|
||||
import torch
|
||||
from PIL import Image
|
||||
import requests
|
||||
|
||||
|
||||
processor = AutoImageProcessor.from_pretrained("ETH-CVG/lightglue_superpoint")
|
||||
model = AutoModel.from_pretrained("ETH-CVG/lightglue_superpoint")
|
||||
|
||||
|
||||
# LightGlue requires pairs of images
|
||||
images = [image1, image2]
|
||||
inputs = processor(images, return_tensors="pt")
|
||||
with torch.inference_mode():
|
||||
outputs = model(**inputs)
|
||||
|
||||
|
||||
# Extract matching information
|
||||
keypoints0 = outputs.keypoints0 # Keypoints in first image
|
||||
keypoints1 = outputs.keypoints1 # Keypoints in second image
|
||||
@ -112,7 +112,7 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
|
||||
# Process outputs for visualization
|
||||
image_sizes = [[(image.height, image.width) for image in images]]
|
||||
processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
|
||||
|
||||
|
||||
for i, output in enumerate(processed_outputs):
|
||||
print(f"For the image pair {i}")
|
||||
for keypoint0, keypoint1, matching_score in zip(
|
||||
@ -147,13 +147,6 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
|
||||
- post_process_keypoint_matching
|
||||
- visualize_keypoint_matching
|
||||
|
||||
## LightGlueImageProcessorFast
|
||||
|
||||
[[autodoc]] LightGlueImageProcessorFast
|
||||
- preprocess
|
||||
- post_process_keypoint_matching
|
||||
- visualize_keypoint_matching
|
||||
|
||||
## LightGlueForKeypointMatching
|
||||
|
||||
[[autodoc]] LightGlueForKeypointMatching
|
||||
|
||||
@ -247,6 +247,10 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained(
|
||||
|
||||
[[autodoc]] LlavaNextVideoProcessor
|
||||
|
||||
## LlavaNextVideoImageProcessor
|
||||
|
||||
[[autodoc]] LlavaNextVideoImageProcessor
|
||||
|
||||
## LlavaNextVideoVideoProcessor
|
||||
|
||||
[[autodoc]] LlavaNextVideoVideoProcessor
|
||||
|
||||
@ -54,7 +54,7 @@ model.set_output_embeddings(resized_embeddings)
|
||||
|
||||
## Usage Example
|
||||
|
||||
### Instruct model
|
||||
#### Instruct model
|
||||
|
||||
```python
|
||||
import torch
|
||||
@ -80,7 +80,7 @@ output = model.generate(**inputs, max_new_tokens=25)
|
||||
print(processor.decode(output[0]))
|
||||
```
|
||||
|
||||
### Base model
|
||||
#### Base model
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
@ -88,16 +88,16 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
|
||||
import torch
|
||||
from PIL import Image
|
||||
import requests
|
||||
|
||||
|
||||
processor = AutoImageProcessor.from_pretrained("magic-leap-community/superglue_outdoor")
|
||||
model = AutoModel.from_pretrained("magic-leap-community/superglue_outdoor")
|
||||
|
||||
|
||||
# SuperGlue requires pairs of images
|
||||
images = [image1, image2]
|
||||
inputs = processor(images, return_tensors="pt")
|
||||
with torch.inference_mode():
|
||||
outputs = model(**inputs)
|
||||
|
||||
|
||||
# Extract matching information
|
||||
keypoints0 = outputs.keypoints0 # Keypoints in first image
|
||||
keypoints1 = outputs.keypoints1 # Keypoints in second image
|
||||
@ -112,7 +112,7 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
|
||||
# Process outputs for visualization
|
||||
image_sizes = [[(image.height, image.width) for image in images]]
|
||||
processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
|
||||
|
||||
|
||||
for i, output in enumerate(processed_outputs):
|
||||
print(f"For the image pair {i}")
|
||||
for keypoint0, keypoint1, matching_score in zip(
|
||||
@ -147,13 +147,6 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
|
||||
- post_process_keypoint_matching
|
||||
- visualize_keypoint_matching
|
||||
|
||||
## SuperGlueImageProcessorFast
|
||||
|
||||
[[autodoc]] SuperGlueImageProcessorFast
|
||||
- preprocess
|
||||
- post_process_keypoint_matching
|
||||
- visualize_keypoint_matching
|
||||
|
||||
## SuperGlueForKeypointMatching
|
||||
|
||||
[[autodoc]] SuperGlueForKeypointMatching
|
||||
|
||||
@ -288,7 +288,7 @@ class Olmo2DecoderLayer(OlmoDecoderLayer):
|
||||
output_attentions: Optional[bool] = False,
|
||||
use_cache: Optional[bool] = False,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
|
||||
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
||||
**kwargs,
|
||||
) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
||||
residual = hidden_states
|
||||
|
||||
@ -154,7 +154,7 @@ pip install schedulefree
|
||||
|
||||
[Schedule Free optimizer (SFO)](https://hf.co/papers/2405.15682) replaces the base optimizers momentum with a combination of averaging and interpolation. Unlike a traditional scheduler, SFO completely removes the need to anneal the learning rate.
|
||||
|
||||
SFO supports the RAdam (`schedule_free_radam`), AdamW (`schedule_free_adamw`) and SGD (`schedule_free_sgd`) optimizers. The RAdam scheduler doesn't require `warmup_steps`.
|
||||
SFO supports the RAdam (`schedule_free_radam`), AdamW (`schedule_free_adamw`) and SGD (`schedule_free_sgd`) optimizers. The RAdam scheduler doesn't require `warmup_steps` or `warmup_ratio`.
|
||||
|
||||
By default, it is recommended to set `lr_scheduler_type="constant"`. Other `lr_scheduler_type` values may also work, but combining SFO optimizers with other learning rate schedules could affect SFOs intended behavior and performance.
|
||||
|
||||
|
||||
@ -45,13 +45,7 @@ This guide shows how to enable tensor parallelism with Transformers and differen
|
||||
|
||||
## Partitioning a model
|
||||
|
||||
Transformers supports tensor parallelism if a model has a `tp_plan`. There are two ways to partition a model.
|
||||
|
||||
- Set `tp_plan="auto"` to automatically use a tensor parallelism plan based on a model's predefined configuration.
|
||||
- Define and pass a manual `tp_plan`.
|
||||
|
||||
<hfoptions id="tp_plan">
|
||||
<hfoption id="auto plan">
|
||||
Transformers supports tensor parallelism if a model has a `tp_plan`. Set `tp_plan="auto"` to automatically use a tensor parallelism plan based on a model's predefined configuration.
|
||||
|
||||
```py
|
||||
import os
|
||||
@ -59,7 +53,9 @@ import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" # better to visualize all the possible strategies
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct" , dtype=torch.bfloat16, tp_plan="auto")
|
||||
model_id = "meta-llama/Meta-Llama-3-8B-Instruct" # better for smaller number of GPUs
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, tp_plan="auto")
|
||||
print(model._tp_plan)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
|
||||
@ -76,31 +72,6 @@ Launch the inference script above on [torchrun](https://pytorch.org/docs/stable/
|
||||
torchrun --nproc-per-node 4 demo.py
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="manual plan">
|
||||
|
||||
Define a tensor parallel plan for each layer in `tp_plan` and pass it to [`~PreTrainedModel.from_pretrained`]. The example below uses column and row partitioning. See the [Partitioning strategies](#partitioning-strategies) section for other supported strategies.
|
||||
|
||||
Manual partitioning requires deep understanding of model architecture and strategy interactions. Poor partitioning choices create slow models that fail or produce incorrect results. The [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) explains partitioning strategies in detail.
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
tp_plan = {
|
||||
"model.layers.*.self_attn.q_proj": "colwise",
|
||||
"model.layers.*.self_attn.k_proj": "colwise",
|
||||
"model.layers.*.self_attn.v_proj": "colwise",
|
||||
"model.layers.*.self_attn.o_proj": "rowwise",
|
||||
...
|
||||
}
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", dtype="auto", tp_plan=tp_plan)
|
||||
print(model.tp_plan)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Partitioning strategies
|
||||
|
||||
All partitioning strategies are defined in the [`ParallelInterface`] class which maps a string to the strategy implementation. You don't need to interact with this class directly since all the strategies are set with `tp_plan` in [`~PreTrainedModel.from_pretrained`], but it is useful for checking what strategies are available.
|
||||
|
||||
@ -38,7 +38,7 @@ pip install transformers[dev]
|
||||
or for an editable install:
|
||||
|
||||
```bash
|
||||
pip install -e ".[dev]"
|
||||
pip install -e .[dev]
|
||||
```
|
||||
|
||||
inside the Transformers repo. Since the number of optional dependencies of Transformers has grown a lot, it's possible you don't manage to get all of them. If the dev install fails, make sure to install PyTorch then do
|
||||
@ -50,7 +50,7 @@ pip install transformers[quality]
|
||||
or for an editable install:
|
||||
|
||||
```bash
|
||||
pip install -e ".[quality]"
|
||||
pip install -e .[quality]
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
@ -40,7 +40,7 @@ You can choose between MXFP4 and NVFP4 with `FPQuantConfig(forward_dtype="mxfp4"
|
||||
|
||||
A **Blackwell-generation GPU is required** to run the kernels. Runtime support for FP-Quant is implemented through the [QuTLASS](https://github.com/IST-DASLab/qutlass) library and a lightweight PyTorch interface lib [`fp_quant`](https://github.com/IST-DASLab/FP-Quant/tree/master/inference_lib). We recommend installing the former **from source** and the latter with `pip install fp_quant`.
|
||||
|
||||
Users **without a Blackwell-generation GPU** , can use the method with `quantization_config=FPQuantConfig(pseudoquantization=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
|
||||
Users **without a Blackwell-generation GPU** , can use the method with `quantization_config=FPQuantConfig(pseudoquant=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
|
||||
|
||||
> [!TIP]
|
||||
> Find models pre-quantized with FP-Quant in the official ISTA-DASLab [collection](https://huggingface.co/collections/ISTA-DASLab/fp-quant-6877c186103a21d3a02568ee).
|
||||
|
||||
@ -33,7 +33,7 @@ Export a Transformers model to ONNX with the Optimum CLI or the `optimum.onnxrun
|
||||
Run the command below to install Optimum and the [exporters](https://huggingface.co/docs/optimum/exporters/overview) module.
|
||||
|
||||
```bash
|
||||
pip install optimum-onnx
|
||||
pip install optimum[exporters]
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
|
||||
@ -383,30 +383,6 @@ transformers serve \
|
||||
--attn_implementation "sdpa"
|
||||
```
|
||||
|
||||
### Quantization
|
||||
|
||||
transformers serve is compatible with all [quantization methods](https://huggingface.co/docs/transformers/main/quantization/overview) supported in transformers. Quantization can significantly reduce memory usage and improve inference speed, with two main workflows: pre-quantized models and on-the-fly quantization.
|
||||
|
||||
#### Pre-quantized Models
|
||||
|
||||
For models that are already quantized (e.g., GPTQ, AWQ, bitsandbytes), simply choose a quantized model name for serving.
|
||||
Make sure to install the required libraries listed in the quantization documentation.
|
||||
|
||||
> [!TIP]
|
||||
> Pre-quantized models generally provide the best balance of performance and accuracy.
|
||||
|
||||
#### On the fly quantization
|
||||
|
||||
If you want to quantize a model at runtime, you can specify the --quantization flag in the CLI. Note that not all quantization methods support on-the-fly conversion. The full list of supported methods is available in the quantization [overview](https://huggingface.co/docs/transformers/main/quantization/overview).
|
||||
|
||||
Currently, with transformers serve, we only supports some methods: ["bnb-4bit", "bnb-8bit"]
|
||||
|
||||
For example, to enable 4-bit quantization with bitsandbytes, you need to pass add `--quantization bnb-4bit`:
|
||||
|
||||
```sh
|
||||
transformers serve --quantization bnb-4bit
|
||||
```
|
||||
|
||||
### Performance tips
|
||||
|
||||
- Use an efficient attention backend when available:
|
||||
@ -421,4 +397,6 @@ transformers serve \
|
||||
|
||||
- `--dtype {bfloat16|float16}` typically improve throughput and memory use vs. `float32`
|
||||
|
||||
- `--load_in_4bit`/`--load_in_8bit` can reduce memory footprint for LoRA setups
|
||||
|
||||
- `--force-model <repo_id>` avoids per-request model hints and helps produce stable, repeatable runs
|
||||
|
||||
@ -220,7 +220,7 @@ At this point, only three steps remain:
|
||||
... gradient_accumulation_steps=4,
|
||||
... per_device_eval_batch_size=32,
|
||||
... num_train_epochs=10,
|
||||
... warmup_steps=0.1,
|
||||
... warmup_ratio=0.1,
|
||||
... logging_steps=10,
|
||||
... load_best_model_at_end=True,
|
||||
... metric_for_best_model="accuracy",
|
||||
|
||||
@ -169,7 +169,7 @@ def compute_metrics(eval_pred):
|
||||
return {"wer_score": wer_score}
|
||||
```
|
||||
|
||||
## Train
|
||||
## Train!
|
||||
|
||||
Now, you are ready to start fine-tuning the model. You will use the 🤗 [`Trainer`] for this.
|
||||
|
||||
|
||||
@ -211,7 +211,7 @@ At this point, only three steps remain:
|
||||
... gradient_accumulation_steps=4,
|
||||
... per_device_eval_batch_size=16,
|
||||
... num_train_epochs=3,
|
||||
... warmup_steps=0.1,
|
||||
... warmup_ratio=0.1,
|
||||
... logging_steps=10,
|
||||
... load_best_model_at_end=True,
|
||||
... metric_for_best_model="accuracy",
|
||||
|
||||
@ -126,6 +126,7 @@ def rebuild_objects(bboxes, labels):
|
||||
train_dataset = train_dataset.with_transform(train_transform)
|
||||
```
|
||||
|
||||
|
||||
Build COCO-style annotations for the image processor.
|
||||
|
||||
```py
|
||||
@ -246,4 +247,4 @@ image = Image.open(requests.get("https://huggingface.co/datasets/merve/vlm_test_
|
||||
plot_results(image, results, threshold=0.05)
|
||||
```
|
||||
|
||||

|
||||

|
||||
@ -378,7 +378,7 @@ Most of the training arguments are self-explanatory, but one that is quite impor
|
||||
... learning_rate=5e-5,
|
||||
... per_device_train_batch_size=batch_size,
|
||||
... per_device_eval_batch_size=batch_size,
|
||||
... warmup_steps=0.1,
|
||||
... warmup_ratio=0.1,
|
||||
... logging_steps=10,
|
||||
... load_best_model_at_end=True,
|
||||
... metric_for_best_model="accuracy",
|
||||
|
||||
138
docs/source/en/torchscript.md
Normal file
138
docs/source/en/torchscript.md
Normal file
@ -0,0 +1,138 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# TorchScript
|
||||
|
||||
[TorchScript](https://pytorch.org/docs/stable/jit.html) serializes PyTorch models into programs that can be executed in non-Python processes. This is especially advantageous in production environments where Python may not be the most performant choice.
|
||||
|
||||
Transformers can export a model to TorchScript by:
|
||||
|
||||
1. creating dummy inputs to create a *trace* of the model to serialize to TorchScript
|
||||
2. enabling the `torchscript` parameter in either [`~PreTrainedConfig.torchscript`] for a randomly initialized model or [`~PreTrainedModel.from_pretrained`] for a pretrained model
|
||||
|
||||
## Dummy inputs
|
||||
|
||||
The dummy inputs are used in the forward pass, and as the input values are propagated through each layer, PyTorch tracks the different operations executed on each tensor. The recorded operations are used to create the model trace. Once it is recorded, it is serialized into a TorchScript program.
|
||||
|
||||
```py
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = tokenizer.tokenize(text)
|
||||
|
||||
masked_index = 8
|
||||
tokenized_text[masked_index] = "[MASK]"
|
||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
|
||||
|
||||
# creating a dummy input
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
dummy_input = [tokens_tensor, segments_tensors]
|
||||
```
|
||||
|
||||
The trace is created based on the provided inputs dimensions and it can only handle inputs with the same shape as the provided input during tracing. An input with a different size raises the error message shown below.
|
||||
|
||||
```bash
|
||||
`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`.
|
||||
```
|
||||
|
||||
Try to create a trace with a dummy input size at least as large as the largest expected input during inference. Padding can help fill missing values for larger inputs. It may be slower though since a larger input size requires more calculations. Be mindful of the total number of operations performed on each input and track the model performance when exporting models with variable sequence lengths.
|
||||
|
||||
## Tied weights
|
||||
|
||||
Weights between the `Embedding` and `Decoding` layers are tied in Transformers and TorchScript can't export models with tied weights. Instantiating a model with `torchscript=True`, separates the `Embedding` and `Decoding` layers and they aren't trained any further because it would throw the two layers out of sync which can lead to unexpected results.
|
||||
|
||||
Models *without* a language model head don't have tied weights and can be safely exported without the `torchscript` parameter.
|
||||
|
||||
<hfoptions id="torchscript">
|
||||
<hfoption id="randomly initialized model">
|
||||
|
||||
```py
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=32000,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
torchscript=True,
|
||||
)
|
||||
|
||||
model = BertModel(config)
|
||||
model.eval()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="pretrained model">
|
||||
|
||||
```py
|
||||
model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
|
||||
model.eval()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Export to TorchScript
|
||||
|
||||
Create the Torchscript program with [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html), and save with [torch.jit.save](https://pytorch.org/docs/stable/generated/torch.jit.save.html).
|
||||
|
||||
```py
|
||||
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
torch.jit.save(traced_model, "traced_bert.pt")
|
||||
```
|
||||
|
||||
Use [torch.jit.load](https://pytorch.org/docs/stable/generated/torch.jit.load.html) to load the traced model.
|
||||
|
||||
```py
|
||||
loaded_model = torch.jit.load("traced_bert.pt")
|
||||
loaded_model.eval()
|
||||
|
||||
all_encoder_layers, pooled_output = loaded_model(*dummy_input)
|
||||
```
|
||||
|
||||
To use the traced model for inference, use the `__call__` dunder method.
|
||||
|
||||
```py
|
||||
traced_model(tokens_tensor, segments_tensors)
|
||||
```
|
||||
|
||||
## Deploy to AWS
|
||||
|
||||
TorchScript programs serialized from Transformers can be deployed on [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) instances. The instance is powered by AWS Inferentia chips, a custom hardware accelerator designed for deep learning inference workloads. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) supports tracing Transformers models for deployment on Inf1 instances.
|
||||
|
||||
> [!TIP]
|
||||
> AWS Neuron requires a [Neuron SDK environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/inference-torch-neuron.html#inference-torch-neuron) which is preconfigured on [AWS DLAMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html).
|
||||
|
||||
Instead of [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html), use [torch.neuron.trace](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuron/api-compilation-python-api.html) to trace a model and optimize it for Inf1 instances.
|
||||
|
||||
```py
|
||||
import torch.neuron
|
||||
|
||||
torch.neuron.trace(model, [tokens_tensor, segments_tensors])
|
||||
```
|
||||
|
||||
Refer to the [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html) documentation for more information.
|
||||
|
||||
### Model architectures
|
||||
|
||||
BERT-based models - like [DistilBERT](./model_doc/distilbert) or [RoBERTa](./model_doc/roberta) - run best on Inf1 instances for non-generative tasks such as extractive question answering, and sequence or token classification.
|
||||
|
||||
Text generation can be adapted to run on an Inf1 instance as shown in the [Transformers MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html) tutorial.
|
||||
|
||||
Refer to the [Inference Samples/Tutorials (Inf1)](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/models/inference-inf1-samples.html#model-samples-inference-inf1) guide for more information about which models can be converted out of the box to run on Inf1 instances.
|
||||
@ -187,7 +187,7 @@ from torch import nn
|
||||
from transformers import Trainer
|
||||
|
||||
class CustomTrainer(Trainer):
|
||||
def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False, num_items_in_batch: Optional[torch.Tensor] = None):
|
||||
def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False num_items_in_batch: Optional[torch.Tensor] = None):
|
||||
labels = inputs.pop("labels")
|
||||
# forward pass
|
||||
outputs = model(**inputs)
|
||||
|
||||
@ -64,6 +64,8 @@
|
||||
title: Entrenador
|
||||
- local: sagemaker
|
||||
title: Ejecutar el entrenamiento en Amazon SageMaker
|
||||
- local: torchscript
|
||||
title: Exportar a TorchScript
|
||||
- local: community
|
||||
title: Los recursos de la comunidad
|
||||
title: Guías para desarrolladores
|
||||
|
||||
@ -37,7 +37,7 @@ pip install transformers[dev]
|
||||
o una instalación editable:
|
||||
|
||||
```bash
|
||||
pip install -e ".[dev]"
|
||||
pip install -e .[dev]
|
||||
```
|
||||
|
||||
del repositorio de Transformers.
|
||||
|
||||
@ -220,7 +220,7 @@ Al llegar a este punto, solo quedan tres pasos:
|
||||
... gradient_accumulation_steps=4,
|
||||
... per_device_eval_batch_size=32,
|
||||
... num_train_epochs=10,
|
||||
... warmup_steps=0.1,
|
||||
... warmup_ratio=0.1,
|
||||
... logging_steps=10,
|
||||
... load_best_model_at_end=True,
|
||||
... metric_for_best_model="accuracy",
|
||||
|
||||
167
docs/source/es/torchscript.md
Normal file
167
docs/source/es/torchscript.md
Normal file
@ -0,0 +1,167 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Exportar a TorchScript
|
||||
|
||||
<Tip>
|
||||
Este es el comienzo de nuestros experimentos con TorchScript y todavía estamos explorando sus capacidades con modelos de variables de entrada. Es un tema de interés para nosotros y profundizaremos en nuestro análisis en las próximas versiones, con más ejemplos de código, una implementación más flexible y comparativas de rendimiento comparando códigos basados en Python con TorchScript compilado.
|
||||
|
||||
</Tip>
|
||||
|
||||
De acuerdo con la documentación de TorchScript:
|
||||
|
||||
> "TorchScript es una manera de crear modelos serializables y optimizables a partir del código PyTorch."
|
||||
|
||||
Hay dos módulos de PyTorch, [JIT y TRACE](https://pytorch.org/docs/stable/jit.html), que permiten a los desarrolladores exportar sus modelos para ser reusados en otros programas, como los programas de C++ orientados a la eficiencia.
|
||||
|
||||
Nosotros proveemos una interface que te permite exportar los modelos 🤗Transformers a TorchScript para que puedan ser reusados en un entorno diferente al de los programas Python basados en PyTorch. Aquí explicamos como exportar y usar nuestros modelos utilizando TorchScript.
|
||||
|
||||
Exportar un modelo requiere de dos cosas:
|
||||
|
||||
- La instanciación del modelo con la bandera TorchScript.
|
||||
- Un paso hacia adelante con entradas ficticias.
|
||||
|
||||
Estas necesidades implican varias cosas de las que los desarrolladores deben tener cuidado, como se detalla a continuación.
|
||||
|
||||
## Bandera TorchScript y pesos atados.
|
||||
|
||||
La bandera `torchscript` es necesaria porque la mayoría de los modelos de lenguaje de 🤗Transformers tienen pesos atados entre su `capa de incrustación` (`Embedding`) y su `capa de decodificación` (`Decoding`). TorchScript no te permite exportar modelos que tienen pesos atados, por lo que es necesario desatar y clonar los pesos de antemano.
|
||||
|
||||
Los modelos instanciados con la bandera `torchscript` tienen su `capa de incrustación` (`Embedding`) y su `capa de decodificación` (`Decoding`) separadas, lo que significa que no deben ser entrenados más adelante. Entrenar desincronizaría las dos capas, lo que llevaría a resultados inesperados.
|
||||
|
||||
Esto no es así para los modelos que no tienen una cabeza de modelo de lenguaje, ya que esos modelos no tienen pesos atados. Estos modelos pueden ser exportados de manera segura sin la bandera `torchscript`.
|
||||
|
||||
## Entradas ficticias y longitudes estándar
|
||||
|
||||
Las entradas ficticias se utilizan para un paso del modelo hacia adelante. Mientras los valores de las entradas se propagan a través de las capas, PyTorch realiza un seguimiento de las diferentes operaciones ejecutadas en cada tensor. Estas operaciones registradas se utilizan luego para crear *la traza* del modelo.
|
||||
La traza se crea en relación con las dimensiones de las entradas. Por lo tanto, está limitada por las dimensiones de la entrada ficticia y no funcionará para ninguna otra longitud de secuencia o tamaño de lote. Cuando se intenta con un tamaño diferente, se genera el siguiente error:
|
||||
|
||||
```
|
||||
`El tamaño expandido del tensor (3) debe coincidir con el tamaño existente (7) en la dimensión no singleton 2`.
|
||||
```
|
||||
|
||||
Recomendamos trazar el modelo con un tamaño de entrada ficticio al menos tan grande como la entrada más grande con la que se alimentará al modelo durante la inferencia. El relleno puede ayudar a completar los valores faltantes. Sin embargo, dado que el modelo se traza con un tamaño de entrada más grande, las dimensiones de la matriz también serán grandes, lo que resultará en más cálculos.
|
||||
|
||||
Ten cuidado con el número total de operaciones realizadas en cada entrada y sigue de cerca el rendimiento al exportar modelos con longitudes de secuencia variables.
|
||||
|
||||
## Usando TorchScript en Python
|
||||
|
||||
Esta sección demuestra cómo guardar y cargar modelos, así como cómo usar la traza para la inferencia.
|
||||
|
||||
### Guardando un modelo
|
||||
|
||||
Para exportar un `BertModel` con TorchScript, instancia `BertModel` a partir de la clase `BertConfig` y luego guárdalo en disco bajo el nombre de archivo `traced_bert.pt`:
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
|
||||
enc = BertTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
# Tokenizing input text
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = enc.tokenize(text)
|
||||
|
||||
# Masking one of the input tokens
|
||||
masked_index = 8
|
||||
tokenized_text[masked_index] = "[MASK]"
|
||||
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
|
||||
|
||||
# Creating a dummy input
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
dummy_input = [tokens_tensor, segments_tensors]
|
||||
|
||||
# Initializing the model with the torchscript flag
|
||||
# Flag set to True even though it is not necessary as this model does not have an LM Head.
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=32000,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
torchscript=True,
|
||||
)
|
||||
|
||||
# Instantiating the model
|
||||
model = BertModel(config)
|
||||
|
||||
# The model needs to be in evaluation mode
|
||||
model.eval()
|
||||
|
||||
# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
|
||||
model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
|
||||
|
||||
# Creating the trace
|
||||
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
torch.jit.save(traced_model, "traced_bert.pt")
|
||||
```
|
||||
### Cargando un modelo
|
||||
|
||||
Ahora puedes cargar el `BertModel` guardado anteriormente, `traced_bert.pt`, desde el disco y usarlo en la entrada ficticia (`dummy_input`) previamente inicializada:
|
||||
|
||||
```python
|
||||
loaded_model = torch.jit.load("traced_bert.pt")
|
||||
loaded_model.eval()
|
||||
|
||||
all_encoder_layers, pooled_output = loaded_model(*dummy_input)
|
||||
```
|
||||
|
||||
## Usando un modelo trazado para inferencia
|
||||
|
||||
Utiliza el modelo trazado para inferencia utilizando su método `_call_` dunder:
|
||||
|
||||
```python
|
||||
traced_model(tokens_tensor, segments_tensors)
|
||||
```
|
||||
## Despliega modelos TorchScript de Hugging Face en AWS con el Neuron SDK
|
||||
|
||||
AWS introdujo la familia de instancias [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) para inferencia de aprendizaje automático de alto rendimiento y bajo costo en la nube. Las instancias Inf1 están alimentadas por el chip AWS Inferentia, un acelerador de hardware personalizado que se especializa en cargas de trabajo de inferencia de aprendizaje profundo. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) es el SDK para Inferentia que admite el trazado y la optimización de modelos de transformers para implementación en Inf1. El SDK Neuron proporciona:
|
||||
|
||||
1. Una API fácil de usar con un solo cambio de línea de código para trazar y optimizar un modelo TorchScript para inferencia en la nube.
|
||||
|
||||
2. Optimizaciones de rendimiento listas para usar [para mejorar el rendimiento y el costo](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>).
|
||||
|
||||
3. Soporte para modelos de transformers de Hugging Face construidos tanto con [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) como con [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html).
|
||||
|
||||
### Implicaciones
|
||||
|
||||
Los modelos transformers basados en la arquitectura [BERT (Bidirectional Encoder Representations from Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert), o sus variantes como [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) y [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta), funcionan mejor en Inf1 para tareas no generativas como la respuesta a preguntas extractivas, la clasificación de secuencias y la clasificación de tokens. Sin embargo, las tareas de generación de texto aún pueden adaptarse para ejecutarse en Inf1 según este [tutorial de AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). Se puede encontrar más información sobre los modelos que se pueden convertir fácilmente para usar en Inferentia en la sección de [Model Architecture Fit](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) de la documentación de Neuron.
|
||||
|
||||
### Dependencias
|
||||
|
||||
El uso de AWS Neuron para convertir modelos requiere un [entorno de Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) que viene preconfigurado en [la AMI de AWS Deep Learning](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html).
|
||||
|
||||
### Convertir un modelo para AWS Neuron
|
||||
|
||||
Convierte un modelo para AWS NEURON utilizando el mismo código de [Uso de TorchScript en Python](torchscript#using-torchscript-in-python) para trazar un `BertModel`. Importa la extensión del framework `torch.neuron` para acceder a los componentes del Neuron SDK a través de una API de Python:
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
import torch.neuron
|
||||
```
|
||||
Solo necesitas la linea sigueda:
|
||||
|
||||
```diff
|
||||
- torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
+ torch.neuron.trace(model, [token_tensor, segments_tensors])
|
||||
```
|
||||
|
||||
Esto permite que el Neuron SDK trace el modelo y lo optimice para las instancias Inf1.
|
||||
|
||||
Para obtener más información sobre las características, herramientas, tutoriales de ejemplo y últimas actualizaciones del AWS Neuron SDK, consulta [la documentación de AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html).
|
||||
@ -37,7 +37,7 @@ pip install transformers[dev]
|
||||
o un'installazione modificabile:
|
||||
|
||||
```bash
|
||||
pip install -e ".[dev]"
|
||||
pip install -e .[dev]
|
||||
```
|
||||
|
||||
all'interno del repo Transformers.
|
||||
|
||||
@ -109,6 +109,8 @@
|
||||
title: チャットモデルのテンプレート
|
||||
- local: serialization
|
||||
title: ONNX へのエクスポート
|
||||
- local: torchscript
|
||||
title: トーチスクリプトへのエクスポート
|
||||
- local: community
|
||||
title: コミュニティリソース
|
||||
- local: troubleshooting
|
||||
@ -200,6 +202,8 @@
|
||||
title: モデル
|
||||
- local: main_classes/text_generation
|
||||
title: テキストの生成
|
||||
- local: main_classes/onnx
|
||||
title: ONNX
|
||||
- local: main_classes/optimizer_schedules
|
||||
title: 最適化
|
||||
- local: main_classes/output
|
||||
|
||||
@ -1292,7 +1292,7 @@ DeepSpeed は、`LRRangeTest`、`OneCycle`、`WarmupLR`、および`WarmupDecayL
|
||||
したがって、スケジューラを設定しない場合、これがデフォルトで設定されるスケジューラになります。
|
||||
|
||||
設定ファイルで `scheduler` エントリを設定しない場合、[`Trainer`] は
|
||||
`--lr_scheduler_type`、`--learning_rate`、および `--warmup_steps` の値を設定します。
|
||||
`--lr_scheduler_type`、`--learning_rate`、および `--warmup_steps` または `--warmup_ratio` の値を設定します。
|
||||
🤗 それのトランスフォーマーバージョン。
|
||||
|
||||
以下は、`WarmupLR`の自動構成された`scheduler`エントリの例です。
|
||||
@ -1316,7 +1316,8 @@ DeepSpeed は、`LRRangeTest`、`OneCycle`、`WarmupLR`、および`WarmupDecayL
|
||||
|
||||
- `warmup_min_lr` の値は `0` です。
|
||||
- `warmup_max_lr` と `--learning_rate` の値。
|
||||
- `warmup_num_steps` と `--warmup_steps` の値 (指定されている場合)
|
||||
- `warmup_num_steps` と `--warmup_steps` の値 (指定されている場合)。それ以外の場合は `--warmup_ratio` を使用します
|
||||
トレーニング ステップの数を乗算し、切り上げます。
|
||||
- `total_num_steps` には `--max_steps` の値を指定するか、指定されていない場合は実行時に自動的に導出されます。
|
||||
環境、データセットのサイズ、およびその他のコマンド ライン引数 (
|
||||
`WarmupDecayLR`)。
|
||||
|
||||
50
docs/source/ja/main_classes/onnx.md
Normal file
50
docs/source/ja/main_classes/onnx.md
Normal file
@ -0,0 +1,50 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Exporting 🤗 Transformers models to ONNX
|
||||
|
||||
🤗 Transformers は `transformers.onnx` パッケージを提供します。
|
||||
設定オブジェクトを利用することで、モデルのチェックポイントをONNXグラフに変換することができます。
|
||||
|
||||
詳細は[ガイド](../serialization) を参照してください。
|
||||
を参照してください。
|
||||
|
||||
## ONNX Configurations
|
||||
|
||||
以下の3つの抽象クラスを提供しています。
|
||||
エクスポートしたいモデルアーキテクチャのタイプに応じて、継承すべき3つの抽象クラスを提供します:
|
||||
|
||||
* エンコーダーベースのモデルは [`~onnx.config.OnnxConfig`] を継承します。
|
||||
* デコーダーベースのモデルは [`~onnx.config.OnnxConfigWithPast`] を継承します。
|
||||
* エンコーダー・デコーダーモデルは [`~onnx.config.OnnxSeq2SeqConfigWithPast`] を継承しています。
|
||||
|
||||
|
||||
### OnnxConfig
|
||||
|
||||
[[autodoc]] onnx.config.OnnxConfig
|
||||
|
||||
### OnnxConfigWithPast
|
||||
|
||||
[[autodoc]] onnx.config.OnnxConfigWithPast
|
||||
|
||||
### OnnxSeq2SeqConfigWithPast
|
||||
|
||||
[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
|
||||
|
||||
## ONNX Features
|
||||
|
||||
各 ONNX 構成は、次のことを可能にする一連の _機能_ に関連付けられています。
|
||||
さまざまなタイプのトポロジまたはタスクのモデルをエクスポートします。
|
||||
@ -472,6 +472,8 @@ FlexFlowは、サンプル-オペレータ-属性-パラメータの4D並列化
|
||||
|
||||
したがって、このフレームワークの約束は非常に魅力的です。選択したクラスタで30分間のシミュレーションを実行し、この特定の環境を最適に利用するための最良の戦略を提供します。部分を追加/削除/置換すると、それに対して実行して再最適化プランを作成します。その後、トレーニングできます。異なるセットアップには独自の最適化があります。
|
||||
|
||||
🤗 Transformersの現在の状況: まだ統合されていません。すでに[transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py)を使用してモデルがFXトレース可能であるため、FlexFlowを動作させるために必要な手順を誰かが見つける必要があります。
|
||||
|
||||
## Which Strategy To Use When
|
||||
|
||||
ここでは、どの並列化戦略をいつ使用するかの非常におおまかなアウトラインを示します。各リストの最初が通常よりも速いことが一般的です。
|
||||
|
||||
@ -40,7 +40,7 @@ pip install transformers[dev]
|
||||
|
||||
|
||||
```bash
|
||||
pip install -e ".[dev]"
|
||||
pip install -e .[dev]
|
||||
```
|
||||
|
||||
トランスフォーマーズのリポジトリ内で作業しています。トランスフォーマーズのオプションの依存関係の数が増えたため、すべてを取得できない可能性があります。開発用インストールが失敗した場合、作業しているディープラーニングフレームワーク(PyTorch、TensorFlow、および/またはFlax)をインストールし、次の手順を実行してください。
|
||||
@ -53,7 +53,7 @@ pip install transformers[quality]
|
||||
または編集可能なインストールの場合:
|
||||
|
||||
```bash
|
||||
pip install -e ".[quality]"
|
||||
pip install -e .[quality]
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
@ -47,7 +47,7 @@ ONNX形式にエクスポートされたモデルは、以下のように使用
|
||||
🤗 TransformersモデルをONNXにエクスポートするには、まず追加の依存関係をインストールしてください:
|
||||
|
||||
```bash
|
||||
pip install optimum-onnx
|
||||
pip install optimum[exporters]
|
||||
```
|
||||
|
||||
すべての利用可能な引数を確認するには、[🤗 Optimumドキュメント](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli)を参照してください。または、コマンドラインでヘルプを表示することもできます:
|
||||
@ -128,3 +128,64 @@ CLIの代わりに、🤗 TransformersモデルをONNXにプログラム的に
|
||||
### Exporting a model for an unsupported architecture
|
||||
|
||||
現在エクスポートできないモデルをサポートするために貢献したい場合、まず[`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview)でサポートされているかどうかを確認し、サポートされていない場合は[🤗 Optimumに貢献](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute)してください。
|
||||
|
||||
### Exporting a model with `transformers.onnx`
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
`transformers.onnx`はもはやメンテナンスされていないため、モデルを上記で説明したように🤗 Optimumでエクスポートしてください。このセクションは将来のバージョンで削除されます。
|
||||
|
||||
</Tip>
|
||||
|
||||
🤗 TransformersモデルをONNXにエクスポートするには、追加の依存関係をインストールしてください:
|
||||
|
||||
|
||||
```bash
|
||||
pip install transformers[onnx]
|
||||
```
|
||||
|
||||
`transformers.onnx`パッケージをPythonモジュールとして使用して、事前に用意された設定を使用してチェックポイントをエクスポートする方法は以下の通りです:
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
|
||||
```
|
||||
|
||||
この方法は、`--model`引数で定義されたチェックポイントのONNXグラフをエクスポートします。🤗 Hubのいずれかのチェックポイントまたはローカルに保存されたチェックポイントを渡すことができます。エクスポートされた`model.onnx`ファイルは、ONNX標準をサポートする多くのアクセラレータで実行できます。例えば、ONNX Runtimeを使用してモデルを読み込んで実行する方法は以下の通りです:
|
||||
|
||||
|
||||
```python
|
||||
>>> from transformers import AutoTokenizer
|
||||
>>> from onnxruntime import InferenceSession
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
|
||||
>>> session = InferenceSession("onnx/model.onnx")
|
||||
>>> # ONNX Runtime expects NumPy arrays as input
|
||||
>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
|
||||
>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
|
||||
```
|
||||
|
||||
必要な出力名(例: `["last_hidden_state"]`)は、各モデルのONNX構成を確認することで取得できます。例えば、DistilBERTの場合、次のようになります:
|
||||
|
||||
|
||||
```python
|
||||
>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
|
||||
|
||||
>>> config = DistilBertConfig()
|
||||
>>> onnx_config = DistilBertOnnxConfig(config)
|
||||
>>> print(list(onnx_config.outputs.keys()))
|
||||
["last_hidden_state"]
|
||||
```
|
||||
|
||||
ハブから純粋なTensorFlowのチェックポイントをプログラム的にエクスポートするプロセスは、以下のように同様です:
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=keras-io/transformers-qa onnx/
|
||||
```
|
||||
|
||||
ローカルに保存されたモデルをエクスポートする場合、モデルの重みとトークナイザのファイルを同じディレクトリに保存してください(例: `local-pt-checkpoint`)。その後、`transformers.onnx`パッケージの `--model`引数を希望するディレクトリに向けて設定して、ONNXにエクスポートします:
|
||||
|
||||
|
||||
```bash
|
||||
python -m transformers.onnx --model=local-pt-checkpoint onnx/
|
||||
```
|
||||
|
||||
|
||||
@ -219,7 +219,7 @@ MInDS-14 データセットのサンプリング レートは 8khz です (こ
|
||||
... gradient_accumulation_steps=4,
|
||||
... per_device_eval_batch_size=32,
|
||||
... num_train_epochs=10,
|
||||
... warmup_steps=0.1,
|
||||
... warmup_ratio=0.1,
|
||||
... logging_steps=10,
|
||||
... load_best_model_at_end=True,
|
||||
... metric_for_best_model="accuracy",
|
||||
|
||||
@ -216,7 +216,7 @@ Datasets、🤗 データセット ライブラリから Food-101 データセ
|
||||
... gradient_accumulation_steps=4,
|
||||
... per_device_eval_batch_size=16,
|
||||
... num_train_epochs=3,
|
||||
... warmup_steps=0.1,
|
||||
... warmup_ratio=0.1,
|
||||
... logging_steps=10,
|
||||
... load_best_model_at_end=True,
|
||||
... metric_for_best_model="accuracy",
|
||||
|
||||
@ -360,7 +360,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it
|
||||
... learning_rate=5e-5,
|
||||
... per_device_train_batch_size=batch_size,
|
||||
... per_device_eval_batch_size=batch_size,
|
||||
... warmup_steps=0.1,
|
||||
... warmup_ratio=0.1,
|
||||
... logging_steps=10,
|
||||
... load_best_model_at_end=True,
|
||||
... metric_for_best_model="accuracy",
|
||||
|
||||
177
docs/source/ja/torchscript.md
Normal file
177
docs/source/ja/torchscript.md
Normal file
@ -0,0 +1,177 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Export to TorchScript
|
||||
|
||||
<Tip>
|
||||
|
||||
これはTorchScriptを使用した実験の最初であり、可変入力サイズのモデルに対するその能力をまだ探求中です。これは私たちの関心の焦点であり、今後のリリースでは、より柔軟な実装や、PythonベースのコードとコンパイルされたTorchScriptを比較するベンチマークを含む、より多くのコード例で詳細な分析を行います。
|
||||
|
||||
</Tip>
|
||||
|
||||
[TorchScriptのドキュメント](https://pytorch.org/docs/stable/jit.html)によれば:
|
||||
|
||||
> TorchScriptは、PyTorchコードから直列化および最適化可能なモデルを作成する方法です。
|
||||
|
||||
TorchScriptを使用すると、効率志向のC++プログラムなど、他のプログラムでモデルを再利用できるようになります。PyTorchベースのPythonプログラム以外の環境で🤗 Transformersモデルをエクスポートして使用するためのインターフェースを提供しています。ここでは、TorchScriptを使用してモデルをエクスポートし、使用する方法を説明します。
|
||||
|
||||
モデルをエクスポートするには、次の2つの要件があります:
|
||||
|
||||
- `torchscript`フラグを使用したモデルのインスタンス化
|
||||
- ダミーの入力を使用したフォワードパス
|
||||
|
||||
これらの必要条件は、以下で詳細に説明されているように、開発者が注意する必要があるいくつかのことを意味します。
|
||||
|
||||
## TorchScript flag and tied weights
|
||||
|
||||
`torchscript`フラグは、ほとんどの🤗 Transformers言語モデルにおいて、`Embedding`レイヤーと`Decoding`レイヤー間で重みが連結されているため必要です。
|
||||
TorchScriptでは、重みが連結されているモデルをエクスポートすることはできませんので、事前に重みを切り離して複製する必要があります。
|
||||
|
||||
`torchscript`フラグを使用してインスタンス化されたモデルは、`Embedding`レイヤーと`Decoding`レイヤーが分離されており、そのため後でトレーニングしてはいけません。
|
||||
トレーニングは、これらの2つのレイヤーを非同期にする可能性があり、予期しない結果をもたらす可能性があります。
|
||||
|
||||
言語モデルヘッドを持たないモデルには言及しませんが、これらのモデルには連結された重みが存在しないため、`torchscript`フラグなしで安全にエクスポートできます。
|
||||
|
||||
## Dummy inputs and standard lengths
|
||||
|
||||
ダミー入力はモデルのフォワードパスに使用されます。入力の値はレイヤーを通じて伝播される間、PyTorchは各テンソルに実行された異なる操作を追跡します。これらの記録された操作は、モデルの*トレース*を作成するために使用されます。
|
||||
|
||||
トレースは入力の寸法に対して作成されます。そのため、ダミー入力の寸法に制約され、他のシーケンス長やバッチサイズでは動作しません。異なるサイズで試すと、以下のエラーが発生します:
|
||||
|
||||
```
|
||||
`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
|
||||
```
|
||||
|
||||
お勧めしますのは、モデルの推論中に供給される最大の入力と同じ大きさのダミー入力サイズでモデルをトレースすることです。パディングを使用して不足値を補完することもできます。ただし、モデルがより大きな入力サイズでトレースされるため、行列の寸法も大きくなり、より多くの計算が発生します。
|
||||
|
||||
異なるシーケンス長のモデルをエクスポートする際に、各入力に対して実行される演算の総数に注意して、パフォーマンスを密接にフォローすることをお勧めします。
|
||||
|
||||
## Using TorchScript in Python
|
||||
|
||||
このセクションでは、モデルの保存と読み込み、および推論にトレースを使用する方法を示します。
|
||||
|
||||
### Saving a model
|
||||
|
||||
TorchScriptで`BertModel`をエクスポートするには、`BertConfig`クラスから`BertModel`をインスタンス化し、それをファイル名`traced_bert.pt`でディスクに保存します:
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
|
||||
enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
|
||||
# Tokenizing input text
|
||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
||||
tokenized_text = enc.tokenize(text)
|
||||
|
||||
# Masking one of the input tokens
|
||||
masked_index = 8
|
||||
tokenized_text[masked_index] = "[MASK]"
|
||||
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
|
||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
|
||||
|
||||
# Creating a dummy input
|
||||
tokens_tensor = torch.tensor([indexed_tokens])
|
||||
segments_tensors = torch.tensor([segments_ids])
|
||||
dummy_input = [tokens_tensor, segments_tensors]
|
||||
|
||||
# Initializing the model with the torchscript flag
|
||||
# Flag set to True even though it is not necessary as this model does not have an LM Head.
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=32000,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
torchscript=True,
|
||||
)
|
||||
|
||||
# Instantiating the model
|
||||
model = BertModel(config)
|
||||
|
||||
# The model needs to be in evaluation mode
|
||||
model.eval()
|
||||
|
||||
# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
|
||||
model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
|
||||
|
||||
# Creating the trace
|
||||
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
torch.jit.save(traced_model, "traced_bert.pt")
|
||||
```
|
||||
|
||||
### Loading a model
|
||||
|
||||
以前に保存した `BertModel`、`traced_bert.pt` をディスクから読み込んで、以前に初期化した `dummy_input` で使用できます。
|
||||
|
||||
```python
|
||||
loaded_model = torch.jit.load("traced_bert.pt")
|
||||
loaded_model.eval()
|
||||
|
||||
all_encoder_layers, pooled_output = loaded_model(*dummy_input)
|
||||
```
|
||||
|
||||
|
||||
### Using a traced model for inference
|
||||
|
||||
トレースモデルを使用して推論を行うには、その `__call__` ダンダーメソッドを使用します。
|
||||
|
||||
```python
|
||||
traced_model(tokens_tensor, segments_tensors)
|
||||
```
|
||||
|
||||
|
||||
## Deploy Hugging Face TorchScript models to AWS with the Neuron SDK
|
||||
|
||||
AWSはクラウドでの低コストで高性能な機械学習推論向けに [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) インスタンスファミリーを導入しました。Inf1インスタンスはAWS Inferentiaチップによって駆動され、ディープラーニング推論ワークロードに特化したカスタムビルドのハードウェアアクセラレータです。[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) はInferentia用のSDKで、トランスフォーマーモデルをトレースして最適化し、Inf1に展開するためのサポートを提供します。
|
||||
|
||||
Neuron SDK が提供するもの:
|
||||
|
||||
1. クラウドでの推論のためにTorchScriptモデルをトレースして最適化するための、1行のコード変更で使用できる簡単なAPI。
|
||||
2. [改善されたコストパフォーマンス](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/) のためのボックス外のパフォーマンス最適化。
|
||||
3. [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) または [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html) で構築されたHugging Faceトランスフォーマーモデルへのサポート。
|
||||
|
||||
### Implications
|
||||
|
||||
BERT(Bidirectional Encoder Representations from Transformers)アーキテクチャやその変種([distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) や [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) など)に基づくトランスフォーマーモデルは、非生成タスク(抽出型質問応答、シーケンス分類、トークン分類など)において、Inf1上で最適に動作します。ただし、テキスト生成タスクも [AWS Neuron MarianMT チュートリアル](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html) に従ってInf1上で実行できます。Inferentiaでボックス外で変換できるモデルに関する詳細情報は、Neuronドキュメンテーションの [Model Architecture Fit](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) セクションにあります。
|
||||
|
||||
### Dependencies
|
||||
|
||||
モデルをAWS Neuronに変換するには、[Neuron SDK 環境](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) が必要で、[AWS Deep Learning AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html) に事前に構成されています。
|
||||
|
||||
### Converting a model for AWS Neuron
|
||||
|
||||
モデルをAWS NEURON用に変換するには、[PythonでTorchScriptを使用する](torchscript#using-torchscript-in-python) と同じコードを使用して `BertModel` をトレースします。Python APIを介してNeuron SDKのコンポーネントにアクセスするために、`torch.neuron` フレームワーク拡張をインポートします。
|
||||
|
||||
```python
|
||||
from transformers import BertModel, BertTokenizer, BertConfig
|
||||
import torch
|
||||
import torch.neuron
|
||||
```
|
||||
|
||||
次の行を変更するだけで済みます。
|
||||
|
||||
```diff
|
||||
- torch.jit.trace(model, [tokens_tensor, segments_tensors])
|
||||
+ torch.neuron.trace(model, [token_tensor, segments_tensors])
|
||||
```
|
||||
|
||||
これにより、Neuron SDKはモデルをトレースし、Inf1インスタンス向けに最適化します。
|
||||
|
||||
AWS Neuron SDKの機能、ツール、サンプルチュートリアル、最新のアップデートについて詳しく知りたい場合は、[AWS NeuronSDK ドキュメンテーション](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html) をご覧ください。
|
||||
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user