Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-23 19:04:35 +08:00)

Compare commits: branches remove-tf-… and trigger-re… (11 commits)

Commits: 6d38d27ef3, 20c0f8bc77, 9b2afaf02d, d188134b95, e2ed15c465, 005459827e, 69419a4935, 1fdb9f3908, 3dfebf2fc0, e6093deb18, b7ec09c2f4
@@ -109,9 +109,7 @@ class CircleCIJob:
             self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
         print(f"Using {self.docker_image} docker image")
         if self.install_steps is None:
-            self.install_steps = ["uv pip install ."]
-            # Use a custom patched pytest to force exit the process at the end, to avoid `Too long with no output (exceeded 10m0s): context deadline exceeded`
-            self.install_steps.append("uv pip install git+https://github.com/ydshieh/pytest.git@8.4.1-ydshieh")
+            self.install_steps = ["uv venv && uv pip install ."]
         if self.pytest_options is None:
             self.pytest_options = {}
         if isinstance(self.tests_to_run, str):
@@ -177,29 +175,11 @@ class CircleCIJob:
                 "command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt"
                 }
             },
-            # During the CircleCI docker images build time, we might already (or not) download the data.
-            # If it's done already, the files are inside the directory `/test_data/`.
-            {"run": {"name": "fetch hub objects before pytest", "command": "cp -r /test_data/* . 2>/dev/null || true; python3 utils/fetch_hub_objects_for_ci.py"}},
+            {"run": {"name": "fetch hub objects before pytest", "command": "python3 utils/fetch_hub_objects_for_ci.py"}},
             {"run": {
                 "name": "Run tests",
                 "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}
             },
-            {"run":
-                {
-                    "name": "Check for test crashes",
-                    "when": "always",
-                    "command": """if [ ! -f tests_output.txt ]; then
-                        echo "ERROR: tests_output.txt does not exist - tests may not have run properly"
-                        exit 1
-                    elif grep -q "crashed and worker restarting disabled" tests_output.txt; then
-                        echo "ERROR: Worker crash detected in test output"
-                        echo "Found: crashed and worker restarting disabled"
-                        exit 1
-                    else
-                        echo "Tests output file exists and no worker crashes detected"
-                    fi"""
-                },
-            },
             {"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
             {"run": {"name": "Failed tests: show reasons", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
             {"run": {"name": "Errors", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}},
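The intent of the step removed above (reuse Hub objects already downloaded while the CI docker image was built, then fetch whatever is still missing) can be pictured with a small sketch. Only the `/test_data/` path and the `utils/fetch_hub_objects_for_ci.py` script come from the diff; everything else here is illustrative, not the actual CI code.

import shutil
import subprocess
from pathlib import Path

CACHE_DIR = Path("/test_data")  # may or may not have been populated at image build time

def prepare_hub_objects() -> None:
    # Reuse anything baked into the image, tolerating a missing cache directory,
    # mirroring `cp -r /test_data/* . 2>/dev/null || true` from the removed step.
    if CACHE_DIR.is_dir():
        for item in CACHE_DIR.iterdir():
            target = Path.cwd() / item.name
            if item.is_dir():
                shutil.copytree(item, target, dirs_exist_ok=True)
            else:
                shutil.copy2(item, target)
    # Then fetch whatever the tests still need from the Hub.
    subprocess.run(["python3", "utils/fetch_hub_objects_for_ci.py"], check=True)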
@@ -233,7 +213,7 @@ generate_job = CircleCIJob(
     docker_image=[{"image": "huggingface/transformers-torch-light"}],
     # networkx==3.3 (after #36957) cause some issues
     # TODO: remove this once it works directly
-    install_steps=["uv pip install ."],
+    install_steps=["uv venv && uv pip install ."],
     marker="generate",
     parallelism=6,
 )
@@ -264,12 +244,13 @@ custom_tokenizers_job = CircleCIJob(
     docker_image=[{"image": "huggingface/transformers-custom-tokenizers"}],
 )

+
 examples_torch_job = CircleCIJob(
     "examples_torch",
     additional_env={"OMP_NUM_THREADS": 8},
     docker_image=[{"image":"huggingface/transformers-examples-torch"}],
     # TODO @ArthurZucker remove this once docker is easier to build
-    install_steps=["uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"],
+    install_steps=["uv venv && uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"],
     pytest_num_workers=4,
 )

@@ -278,7 +259,7 @@ hub_job = CircleCIJob(
     additional_env={"HUGGINGFACE_CO_STAGING": True},
     docker_image=[{"image":"huggingface/transformers-torch-light"}],
     install_steps=[
-        'uv pip install .',
+        'uv venv && uv pip install .',
         'git config --global user.email "ci@dummy.com"',
         'git config --global user.name "ci"',
     ],
@@ -287,6 +268,20 @@ hub_job = CircleCIJob(
     resource_class="medium",
 )

+
+onnx_job = CircleCIJob(
+    "onnx",
+    docker_image=[{"image":"huggingface/transformers-torch-tf-light"}],
+    install_steps=[
+        "uv venv",
+        "uv pip install .[testing,sentencepiece,onnxruntime,vision,rjieba]",
+    ],
+    pytest_options={"k onnx": None},
+    pytest_num_workers=1,
+    resource_class="small",
+)
+
+
 exotic_models_job = CircleCIJob(
     "exotic_models",
     docker_image=[{"image":"huggingface/transformers-exotic-models"}],
@@ -294,6 +289,7 @@ exotic_models_job = CircleCIJob(
     pytest_options={"durations": 100},
 )

+
 repo_utils_job = CircleCIJob(
     "repo_utils",
     docker_image=[{"image":"huggingface/transformers-consistency"}],
@@ -301,12 +297,13 @@ repo_utils_job = CircleCIJob(
     resource_class="large",
 )

+
 non_model_job = CircleCIJob(
     "non_model",
     docker_image=[{"image": "huggingface/transformers-torch-light"}],
     # networkx==3.3 (after #36957) cause some issues
     # TODO: remove this once it works directly
-    install_steps=["uv pip install .[serving]"],
+    install_steps=["uv venv && uv pip install ."],
     marker="not generate",
     parallelism=6,
 )
@@ -324,7 +321,7 @@ doc_test_job = CircleCIJob(
     additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"},
     install_steps=[
         # Add an empty file to keep the test step running correctly even no file is selected to be tested.
-        "uv pip install .",
+        "uv venv && pip install .",
         "touch dummy.py",
         command,
         "cat pr_documentation_tests_temp.txt",
@@ -336,7 +333,7 @@ doc_test_job = CircleCIJob(
     pytest_num_workers=1,
 )

-REGULAR_TESTS = [torch_job, hub_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip
+REGULAR_TESTS = [torch_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip
 EXAMPLES_TESTS = [examples_torch_job]
 PIPELINE_TESTS = [pipelines_torch_job]
 REPO_UTIL_TESTS = [repo_utils_job]
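A side note on how the job definitions above are consumed: judging from the "Run tests" command in an earlier hunk, entries such as pytest_options={"k onnx": None} and marker="generate" presumably end up as pytest command-line flags. The helper below is only a guess at that rendering, not the actual CircleCIJob code.

def render_pytest_flags(pytest_options: dict, marker: str | None = None) -> list[str]:
    """Hypothetical rendering of CircleCIJob options into pytest flags."""
    flags = []
    if marker:
        flags += ["-m", marker]                  # e.g. marker="generate" -> -m generate
    for key, value in (pytest_options or {}).items():
        if value is None:
            flags.append(f"-{key}")              # {"k onnx": None} -> "-k onnx"
        else:
            flags.append(f"--{key}={value}")     # {"durations": 100} -> "--durations=100"
    return flags

print(render_pytest_flags({"k onnx": None}))                          # ['-k onnx']
print(render_pytest_flags({"durations": 100}, marker="not generate"))  # ['-m', 'not generate', '--durations=100']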
.github/copilot-instructions.md (vendored, 39 lines removed)
@@ -1,39 +0,0 @@
-# copilot-instructions.md Guide for Hugging Face Transformers
-
-This copilot-instructions.md file provides guidance for code agents working with this codebase.
-
-## Core Project Structure
-
-- `/src/transformers`: This contains the core source code for the library
-  - `/models`: Code for individual models. Models inherit from base classes in the root `/src/transformers` directory.
-- `/tests`: This contains the core test classes for the library. These are usually inherited rather than directly run.
-  - `/models`: Tests for individual models. Model tests inherit from common tests in the root `/tests` directory.
-- `/docs`: This contains the documentation for the library, including guides, tutorials, and API references.
-
-## Coding Conventions for Hugging Face Transformers
-
-- PRs should be as brief as possible. Bugfix PRs in particular can often be only one or two lines long, and do not need large comments, docstrings or new functions in this case. Aim to minimize the size of the diff.
-- When writing tests, they should be added to an existing file. The only exception is for PRs to add a new model, when a new test directory should be created for that model.
-- Code style is enforced in the CI. You can install the style tools with `pip install -e .[quality]`. You can then run `make fixup` to apply style and consistency fixes to your code.
-
-## Copying and inheritance
-
-Many models in the codebase have similar code, but it is not shared by inheritance because we want each model file to be self-contained.
-We use two mechanisms to keep this code in sync:
-
-- "Copied from" syntax. Functions or entire classes can have a comment at the top like this: `# Copied from transformers.models.llama.modeling_llama.rotate_half` or `# Copied from transformers.models.t5.modeling_t5.T5LayerNorm with T5->MT5`
-  These comments are actively checked by the style tools, and copies will automatically be updated when the base code is updated. If you need to update a copied function, you should
-  either update the base function and use `make fixup` to propagate the change to all copies, or simply remove the `# Copied from` comment if that is inappropriate.
-- "Modular" files. These files briefly define models by composing them using inheritance from other models. They are not meant to be used directly. Instead, the style tools
-  automatically generate a complete modeling file, like `modeling_bert.py`, from the modular file like `modular_bert.py`. If a model has a modular file, the modeling file
-  should never be edited directly! Instead, changes should be made in the modular file, and then you should run `make fixup` to update the modeling file automatically.
-
-When adding new models, you should prefer `modular` style and inherit as many classes as possible from existing models.
-
-## Testing
-
-After making changes, you should usually run `make fixup` to ensure any copies and modular files are updated, and then test all affected models. This includes both
-the model you made the changes in and any other models that were updated by `make fixup`. Tests can be run with `pytest tests/models/[name]/test_modeling_[name].py`
-If your changes affect code in other classes like tokenizers or processors, you should run those tests instead, like `test_processing_[name].py` or `test_tokenization_[name].py`.
-
-In order to run tests, you may need to install dependencies. You can do this with `pip install -e .[testing]`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.
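For readers who have not seen the convention the deleted guide describes, a minimal illustration of the "Copied from" marker is sketched below. The function body is the standard rotate-half helper referenced in the comment and is shown purely for illustration; the marker itself is what the style tools check.

import torch

# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

Per the guide, `make fixup` verifies that such copies stay in sync with their source and rewrites them automatically when the base function changes.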
.github/workflows/benchmark.yml (vendored, 2 changed lines)
@@ -48,7 +48,7 @@ jobs:

       - name: Run database init script
         run: |
-          psql -f benchmark/utils/init_db.sql
+          psql -f benchmark/init_db.sql
         env:
           PGDATABASE: metrics
           PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
.github/workflows/build-ci-docker-images.yml (vendored, 2 changed lines)
@@ -26,7 +26,7 @@ jobs:

     strategy:
       matrix:
-        file: ["quality", "consistency", "custom-tokenizers", "torch-light", "exotic-models", "examples-torch"]
+        file: ["quality", "consistency", "custom-tokenizers", "torch-light", "tf-light", "exotic-models", "torch-tf-light", "jax-light", "examples-torch", "examples-tf"]
     continue-on-error: true

     steps:
@@ -2,10 +2,6 @@ name: Build docker images (Nightly CI)

 on:
   workflow_call:
-    inputs:
-      job:
-        required: true
-        type: string
   push:
     branches:
       - build_nightly_ci_docker_image*
@@ -16,8 +12,7 @@ concurrency:

 jobs:
   latest-with-torch-nightly-docker:
-    name: "Nightly PyTorch"
-    if: inputs.job == 'latest-with-torch-nightly-docker' || inputs.job == ''
+    name: "Nightly PyTorch + Stable TensorFlow"
     runs-on:
       group: aws-general-8-plus
     steps:
@@ -46,7 +41,6 @@ jobs:

   nightly-torch-deepspeed-docker:
     name: "Nightly PyTorch + DeepSpeed"
-    if: inputs.job == 'nightly-torch-deepspeed-docker' || inputs.job == ''
     runs-on:
       group: aws-g4dn-2xlarge-cache
     steps:
.github/workflows/check_failed_tests.yml (vendored, 7 changed lines)
@@ -21,9 +21,6 @@ on:
       report_repo_id:
         required: true
         type: string
-      commit_sha:
-        required: false
-        type: string


 env:
@@ -44,7 +41,7 @@ jobs:
   check_new_failures:
     name: " "
     runs-on:
-      group: aws-g5-4xlarge-cache
+      group: aws-g4dn-4xlarge-cache
     container:
       image: ${{ inputs.docker }}
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -90,7 +87,7 @@ jobs:
     - name: Update clone
       working-directory: /transformers
       if: ${{ env.process == 'true' }}
-      run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+      run: git fetch && git checkout ${{ github.sha }}

     - name: Get target commit
       working-directory: /transformers/utils
.github/workflows/collated-reports.yml (vendored, 43 lines removed)
@@ -1,43 +0,0 @@
-name: CI collated reports
-
-on:
-  workflow_call:
-    inputs:
-      job:
-        required: true
-        type: string
-      report_repo_id:
-        required: true
-        type: string
-      machine_type:
-        required: true
-        type: string
-      gpu_name:
-        description: Name of the GPU used for the job. Its enough that the value contains the name of the GPU, e.g. "noise-h100-more-noise". Case insensitive.
-        required: true
-        type: string
-
-jobs:
-  collated_reports:
-    name: Collated reports
-    runs-on: ubuntu-22.04
-    if: always()
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/download-artifact@v4
-
-      - name: Collated reports
-        shell: bash
-        env:
-          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          CI_SHA: ${{ github.sha }}
-          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
-        run: |
-          pip install huggingface_hub
-          python3 utils/collated_reports.py \
-            --path . \
-            --machine-type ${{ inputs.machine_type }} \
-            --commit-hash ${{ env.CI_SHA }} \
-            --job ${{ inputs.job }} \
-            --report-repo-id ${{ inputs.report_repo_id }} \
-            --gpu-name ${{ inputs.gpu_name }}
.github/workflows/doctest_job.yml (vendored, 4 changed lines)
@@ -28,10 +28,10 @@ jobs:
       matrix:
         split_keys: ${{ fromJson(inputs.split_keys) }}
     runs-on:
-      group: aws-g5-4xlarge-cache
+      group: aws-g4dn-4xlarge-cache
     container:
       image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
     - name: Update clone
       working-directory: /transformers
.github/workflows/doctests.yml (vendored, 4 changed lines)
@@ -15,10 +15,10 @@ jobs:
   setup:
     name: Setup
     runs-on:
-      group: aws-g5-4xlarge-cache
+      group: aws-g4dn-4xlarge-cache
     container:
       image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
       job_splits: ${{ steps.set-matrix.outputs.job_splits }}
       split_keys: ${{ steps.set-matrix.outputs.split_keys }}
.github/workflows/get-pr-info.yml (vendored, 157 lines removed)
@@ -1,157 +0,0 @@
-name: Get PR commit SHA
-on:
-  workflow_call:
-    inputs:
-      pr_number:
-        required: true
-        type: string
-    outputs:
-      PR_HEAD_REPO_FULL_NAME:
-        description: "The full name of the repository from which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_FULL_NAME }}
-      PR_BASE_REPO_FULL_NAME:
-        description: "The full name of the repository to which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_FULL_NAME }}
-      PR_HEAD_REPO_OWNER:
-        description: "The owner of the repository from which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}
-      PR_BASE_REPO_OWNER:
-        description: "The owner of the repository to which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_OWNER }}
-      PR_HEAD_REPO_NAME:
-        description: "The name of the repository from which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}
-      PR_BASE_REPO_NAME:
-        description: "The name of the repository to which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_NAME }}
-      PR_HEAD_REF:
-        description: "The branch name of the pull request in the head repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REF }}
-      PR_BASE_REF:
-        description: "The branch name in the base repository (to merge into)"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REF }}
-      PR_HEAD_SHA:
-        description: "The head sha of the pull request branch in the head repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_SHA }}
-      PR_BASE_SHA:
-        description: "The head sha of the target branch in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_SHA }}
-      PR_MERGE_COMMIT_SHA:
-        description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
-      PR_HEAD_COMMIT_DATE:
-        description: "The date of the head sha of the pull request branch in the head repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }}
-      PR_MERGE_COMMIT_DATE:
-        description: "The date of the merge commit for the pull request (created by GitHub) in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
-      PR_HEAD_COMMIT_TIMESTAMP:
-        description: "The timestamp of the head sha of the pull request branch in the head repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_TIMESTAMP }}
-      PR_MERGE_COMMIT_TIMESTAMP:
-        description: "The timestamp of the merge commit for the pull request (created by GitHub) in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
-      PR:
-        description: "The PR"
-        value: ${{ jobs.get-pr-info.outputs.PR }}
-      PR_FILES:
-        description: "The files touched in the PR"
-        value: ${{ jobs.get-pr-info.outputs.PR_FILES }}
-
-
-jobs:
-  get-pr-info:
-    runs-on: ubuntu-22.04
-    name: Get PR commit SHA better
-    outputs:
-      PR_HEAD_REPO_FULL_NAME: ${{ steps.pr_info.outputs.head_repo_full_name }}
-      PR_BASE_REPO_FULL_NAME: ${{ steps.pr_info.outputs.base_repo_full_name }}
-      PR_HEAD_REPO_OWNER: ${{ steps.pr_info.outputs.head_repo_owner }}
-      PR_BASE_REPO_OWNER: ${{ steps.pr_info.outputs.base_repo_owner }}
-      PR_HEAD_REPO_NAME: ${{ steps.pr_info.outputs.head_repo_name }}
-      PR_BASE_REPO_NAME: ${{ steps.pr_info.outputs.base_repo_name }}
-      PR_HEAD_REF: ${{ steps.pr_info.outputs.head_ref }}
-      PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }}
-      PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }}
-      PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }}
-      PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }}
-      PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
-      PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}
-      PR_HEAD_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.head_commit_timestamp }}
-      PR_MERGE_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.merge_commit_timestamp }}
-      PR: ${{ steps.pr_info.outputs.pr }}
-      PR_FILES: ${{ steps.pr_info.outputs.files }}
-    if: ${{ inputs.pr_number != '' }}
-    steps:
-      - name: Extract PR details
-        id: pr_info
-        uses: actions/github-script@v6
-        with:
-          script: |
-            const { data: pr } = await github.rest.pulls.get({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              pull_number: ${{ inputs.pr_number }}
-            });
-
-            const { data: head_commit } = await github.rest.repos.getCommit({
-              owner: pr.head.repo.owner.login,
-              repo: pr.head.repo.name,
-              ref: pr.head.ref
-            });
-
-            const { data: merge_commit } = await github.rest.repos.getCommit({
-              owner: pr.base.repo.owner.login,
-              repo: pr.base.repo.name,
-              ref: pr.merge_commit_sha,
-            });
-
-            const { data: files } = await github.rest.pulls.listFiles({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              pull_number: ${{ inputs.pr_number }}
-            });
-
-            core.setOutput('head_repo_full_name', pr.head.repo.full_name);
-            core.setOutput('base_repo_full_name', pr.base.repo.full_name);
-            core.setOutput('head_repo_owner', pr.head.repo.owner.login);
-            core.setOutput('base_repo_owner', pr.base.repo.owner.login);
-            core.setOutput('head_repo_name', pr.head.repo.name);
-            core.setOutput('base_repo_name', pr.base.repo.name);
-            core.setOutput('head_ref', pr.head.ref);
-            core.setOutput('base_ref', pr.base.ref);
-            core.setOutput('head_sha', pr.head.sha);
-            core.setOutput('base_sha', pr.base.sha);
-            core.setOutput('merge_commit_sha', pr.merge_commit_sha);
-            core.setOutput('pr', pr);
-
-            core.setOutput('head_commit_date', head_commit.commit.committer.date);
-            core.setOutput('merge_commit_date', merge_commit.commit.committer.date);
-
-            core.setOutput('files', files);
-
-            console.log('PR head commit:', {
-              head_commit: head_commit,
-              commit: head_commit.commit,
-              date: head_commit.commit.committer.date
-            });
-
-            console.log('PR merge commit:', {
-              merge_commit: merge_commit,
-              commit: merge_commit.commit,
-              date: merge_commit.commit.committer.date
-            });
-
-      - name: Convert dates to timestamps
-        id: get_timestamps
-        run: |
-          head_commit_date=${{ steps.pr_info.outputs.head_commit_date }}
-          merge_commit_date=${{ steps.pr_info.outputs.merge_commit_date }}
-          echo $head_commit_date
-          echo $merge_commit_date
-          head_commit_timestamp=$(date -d "$head_commit_date" +%s)
-          merge_commit_timestamp=$(date -d "$merge_commit_date" +%s)
-          echo $head_commit_timestamp
-          echo $merge_commit_timestamp
-          echo "head_commit_timestamp=$head_commit_timestamp" >> $GITHUB_OUTPUT
-          echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
.github/workflows/get-pr-number.yml (vendored, 36 lines removed)
@@ -1,36 +0,0 @@
-name: Get PR number
-on:
-  workflow_call:
-    outputs:
-      PR_NUMBER:
-        description: "The extracted PR number"
-        value: ${{ jobs.get-pr-number.outputs.PR_NUMBER }}
-
-jobs:
-  get-pr-number:
-    runs-on: ubuntu-22.04
-    name: Get PR number
-    outputs:
-      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
-    steps:
-      - name: Get PR number
-        shell: bash
-        run: |
-          if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
-          elif [[ "${{ github.event.pull_request.number }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV
-          elif [[ "${{ github.event.pull_request }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.number }}" >> $GITHUB_ENV
-          else
-            echo "PR_NUMBER=" >> $GITHUB_ENV
-          fi
-
-      - name: Check PR number
-        shell: bash
-        run: |
-          echo "${{ env.PR_NUMBER }}"
-
-      - name: Set PR number
-        id: set_pr_number
-        run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
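The removed get-pr-number.yml reusable workflow resolved the PR number from whichever event payload was available. Re-expressed in Python purely for illustration (the real logic is the bash above, and the event dictionary shape is only sketched), the precedence is:

def resolve_pr_number(event: dict) -> str:
    """Mirror of the removed bash step: prefer a comment on a PR, then an
    explicit pull_request number, then the top-level event number."""
    issue = event.get("issue", {})
    if issue.get("number") and issue.get("pull_request"):
        return str(issue["number"])          # issue_comment event on a pull request
    pr = event.get("pull_request", {})
    if pr.get("number"):
        return str(pr["number"])             # pull_request / pull_request_target event
    if pr:
        return str(event.get("number", ""))  # fallback used by the workflow
    return ""                                # not a PR-related event

# Example: an issue_comment event on PR #123 (payload fields are hypothetical)
print(resolve_pr_number({"issue": {"number": 123, "pull_request": {"url": "https://example.invalid/pr/123"}}}))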
.github/workflows/model_jobs.yml (vendored, 33 changed lines)
@@ -12,25 +12,16 @@ on:
       slice_id:
         required: true
         type: number
-      runner_map:
-        required: false
+      runner:
+        required: true
         type: string
       docker:
         required: true
         type: string
-      commit_sha:
-        required: false
-        type: string
       report_name_prefix:
         required: false
         default: run_models_gpu
         type: string
-      runner_type:
-        required: false
-        type: string
-      report_repo_id:
-        required: false
-        type: string

 env:
   HF_HOME: /mnt/cache
@@ -54,7 +45,7 @@ jobs:
       matrix:
         folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
     runs-on:
-      group: ${{ fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type] }}
+      group: '${{ inputs.machine_type }}'
     container:
       image: ${{ inputs.docker }}
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -79,7 +70,7 @@ jobs:

       - name: Update clone
         working-directory: /transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+        run: git fetch && git checkout ${{ github.sha }}

       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
         working-directory: /transformers
@@ -116,9 +107,9 @@ jobs:
         run: |
           echo "${{ inputs.machine_type }}"

-          if [ "${{ inputs.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
             machine_type=single-gpu
-          elif [ "${{ inputs.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
             machine_type=multi-gpu
           else
             machine_type=${{ inputs.machine_type }}
@@ -149,15 +140,3 @@ jobs:
         with:
           name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
           path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
-
-  collated_reports:
-    name: Collated Reports
-    if: ${{ always() }}
-    needs: run_models_gpu
-    uses: huggingface/transformers/.github/workflows/collated-reports.yml@main
-    with:
-      job: run_models_gpu
-      report_repo_id: ${{ inputs.report_repo_id }}
-      gpu_name: ${{ inputs.runner_type }}
-      machine_type: ${{ inputs.machine_type }}
-    secrets: inherit
.github/workflows/model_jobs_amd.yml (vendored, new file, 128 lines)
@@ -0,0 +1,128 @@
+name: model jobs
+
+on:
+  workflow_call:
+    inputs:
+      folder_slices:
+        required: true
+        type: string
+      machine_type:
+        required: true
+        type: string
+      slice_id:
+        required: true
+        type: number
+      runner:
+        required: true
+        type: string
+      docker:
+        required: true
+        type: string
+
+env:
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  RUN_SLOW: yes
+  # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
+  # This token is created under the bot `hf-transformers-bot`.
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  TF_FORCE_GPU_ALLOW_GROWTH: true
+  CUDA_VISIBLE_DEVICES: 0,1
+
+jobs:
+  run_models_gpu:
+    name: " "
+    strategy:
+      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
+    runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
+    container:
+      image: ${{ inputs.docker }}
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Echo input and matrix info
+        shell: bash
+        run: |
+          echo "${{ inputs.folder_slices }}"
+          echo "${{ matrix.folders }}"
+          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
+
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: Update / Install some packages (for Past CI)
+        if: ${{ contains(inputs.docker, '-past-') }}
+        working-directory: /transformers
+        run: |
+          python3 -m pip install -U datasets
+
+      - name: Update / Install some packages (for Past CI)
+        if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
+        working-directory: /transformers
+        run: |
+          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+
+      - name: ROCM-INFO
+        run: |
+          rocminfo | grep "Agent" -A 14
+
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+
+      - name: Run test
+        shell: bash
+        run: |
+          mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+          echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
+          echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
+
+      - name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
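As the comment in the new AMD workflow's "Echo folder" step notes, a matrix entry such as `models/bert` is rewritten to `models_bert` because `/` is not allowed in artifact names. The bash substitution `${matrix_folders/'models/'/'models_'}` behaves roughly like this small sketch (illustrative only):

def artifact_safe(folder: str) -> str:
    # `models/bert` -> `models_bert`; only the first occurrence is replaced,
    # matching bash's ${var/pattern/replacement}.
    return folder.replace("models/", "models_", 1)

assert artifact_safe("models/bert") == "models_bert"
assert artifact_safe("generation") == "generation"  # non-model folders pass through unchanged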
.github/workflows/model_jobs_intel_gaudi.yml (vendored, 121 lines removed)
@@ -1,121 +0,0 @@
-name: model jobs
-
-on:
-  workflow_call:
-    inputs:
-      folder_slices:
-        required: true
-        type: string
-      slice_id:
-        required: true
-        type: number
-      runner:
-        required: true
-        type: string
-      machine_type:
-        required: true
-        type: string
-      report_name_prefix:
-        required: false
-        default: run_models_gpu
-        type: string
-
-env:
-  RUN_SLOW: yes
-  PT_HPU_LAZY_MODE: 0
-  TRANSFORMERS_IS_CI: yes
-  PT_ENABLE_INT64_SUPPORT: 1
-  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
-  HF_HOME: /mnt/cache/.cache/huggingface
-
-jobs:
-  run_models_gpu:
-    name: " "
-    strategy:
-      max-parallel: 8
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
-    runs-on:
-      group: ${{ inputs.runner }}
-    container:
-      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-      options: --runtime=habana
-        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
-        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
-        --env HABANA_VISIBLE_DEVICES
-        --env HABANA_VISIBLE_MODULES
-        --cap-add=sys_nice
-        --shm-size=64G
-    steps:
-      - name: Echo input and matrix info
-        shell: bash
-        run: |
-          echo "${{ inputs.folder_slices }}"
-          echo "${{ matrix.folders }}"
-          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
-
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Install dependencies
-        run: |
-          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn
-
-      - name: HL-SMI
-        run: |
-          hl-smi
-          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
-          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
-
-      - name: Environment
-        run: python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        run: pip freeze
-
-      - name: Set `machine_type` for report and artifact names
-        shell: bash
-        run: |
-          if [ "${{ inputs.machine_type }}" = "1gaudi" ]; then
-            machine_type=single-gpu
-          elif [ "${{ inputs.machine_type }}" = "2gaudi" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ inputs.machine_type }}
-          fi
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Run all tests on Gaudi
-        run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
-
-      - name: Run test
-        shell: bash
-        run: |
-          mkdir -p reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
-          echo "hello" > reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
-          echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
-          path: reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
.github/workflows/pr_build_doc_with_comment.yml (vendored, 134 lines removed)
@@ -1,134 +0,0 @@
-name: PR - build doc via comment
-on:
-  issue_comment:
-    types:
-      - created
-    branches-ignore:
-      - main
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.issue.number }}-${{ startsWith(github.event.comment.body, 'build-doc') }}
-  cancel-in-progress: true
-permissions: {}
-
-
-jobs:
-  get-pr-number:
-    name: Get PR number
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam"]'), github.actor) && (startsWith(github.event.comment.body, 'build-doc')) }}
-    uses: ./.github/workflows/get-pr-number.yml
-
-  get-pr-info:
-    name: Get PR commit SHA
-    needs: get-pr-number
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
-    uses: ./.github/workflows/get-pr-info.yml
-    with:
-      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-
-  verity_pr_commit:
-    name: Verity PR commit corresponds to a specific event by comparing timestamps
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
-    runs-on: ubuntu-22.04
-    needs: get-pr-info
-    env:
-      COMMENT_DATE: ${{ github.event.comment.created_at }}
-      PR_MERGE_COMMIT_DATE: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
-      PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
-    steps:
-      - run: |
-          COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
-          echo "COMMENT_DATE: $COMMENT_DATE"
-          echo "PR_MERGE_COMMIT_DATE: $PR_MERGE_COMMIT_DATE"
-          echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
-          echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
-          if [ $COMMENT_TIMESTAMP -le $PR_MERGE_COMMIT_TIMESTAMP ]; then
-            echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!";
-            exit -1;
-          fi
-
-  create_run:
-    name: Create run
-    needs: [get-pr-number, get-pr-info]
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != '' }}
-    permissions:
-      statuses: write
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Create Run
-        id: create_run
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          # Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`.
-          # See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
-          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-        run: |
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/statuses/${{ needs.get-pr-info.outputs.PR_HEAD_SHA }} \
-            -f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Custom doc building job" -f "context=custom-doc-build"
-
-  reply_to_comment:
-    name: Reply to the comment
-    if: ${{ needs.create_run.result == 'success' }}
-    needs: [get-pr-number, create_run]
-    permissions:
-      pull-requests: write
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Reply to the comment
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-        run: |
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
-            -f "body=[Building docs for all languages...](${{ env.GITHUB_RUN_URL }})"
-
-  build-doc:
-    name: Build doc
-    needs: [get-pr-number, get-pr-info]
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != '' }}
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
-    with:
-      commit_sha: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}
-      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-      package: transformers
-      languages: ar de en es fr hi it ko pt tr zh ja te
-
-  update_run_status:
-    name: Update Check Run Status
-    needs: [ get-pr-info, create_run, build-doc ]
-    permissions:
-      statuses: write
-    if: ${{ always() && needs.create_run.result == 'success' }}
-    runs-on: ubuntu-22.04
-    env:
-      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-      STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.create_run.result) }}
-    steps:
-      - name: Get `build-doc` job status
-        run: |
-          echo "${{ needs.build-doc.result }}"
-          echo $STATUS_OK
-          if [ "$STATUS_OK" = "true" ]; then
-            echo "STATUS=success" >> $GITHUB_ENV
-          else
-            echo "STATUS=failure" >> $GITHUB_ENV
-          fi
-
-      - name: Update PR commit statuses
-        run: |
-          echo "${{ needs.build-doc.result }}"
-          echo "${{ env.STATUS }}"
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/statuses/${{ needs.get-pr-info.outputs.PR_HEAD_SHA }} \
-            -f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Custom doc building job" -f "context=custom-doc-build"
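The deleted verity_pr_commit job guards against running a comment-triggered doc build on a commit pushed after the comment: it converts both dates to Unix timestamps and aborts when the comment is not strictly newer than the PR's merge commit. A Python paraphrase of that check, for illustration only (the workflow used `date -d ... +%s` in bash):

from datetime import datetime

def comment_is_fresh(comment_date: str, merge_commit_date: str) -> bool:
    """Return True only if the triggering comment is newer than the PR's merge commit."""
    comment_ts = datetime.fromisoformat(comment_date.replace("Z", "+00:00")).timestamp()
    commit_ts = datetime.fromisoformat(merge_commit_date.replace("Z", "+00:00")).timestamp()
    return comment_ts > commit_ts

# Same comparison the removed job made before proceeding (dates here are made up).
print(comment_is_fresh("2025-01-02T10:00:00Z", "2025-01-02T09:00:00Z"))  # True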
177
.github/workflows/pr_run_slow_ci.yml
vendored
177
.github/workflows/pr_run_slow_ci.yml
vendored
@ -1,177 +0,0 @@
|
|||||||
name: PR slow CI
|
|
||||||
on:
|
|
||||||
pull_request_target:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
get-pr-number:
|
|
||||||
name: Get PR number
|
|
||||||
uses: ./.github/workflows/get-pr-number.yml
|
|
||||||
|
|
||||||
get-pr-info:
|
|
||||||
name: Get PR commit SHA
|
|
||||||
needs: get-pr-number
|
|
||||||
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
|
|
||||||
uses: ./.github/workflows/get-pr-info.yml
|
|
||||||
with:
|
|
||||||
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
|
|
||||||
|
|
||||||
get-jobs:
|
|
||||||
name: Get test files to run
|
|
||||||
runs-on: ubuntu-22.04
|
|
||||||
needs: [get-pr-number, get-pr-info]
|
|
||||||
outputs:
|
|
||||||
jobs: ${{ steps.get_jobs.outputs.jobs_to_run }}
|
|
||||||
steps:
|
|
||||||
- name: Get repository content
|
|
||||||
id: repo_content
|
|
||||||
uses: actions/github-script@v6
|
|
||||||
with:
|
|
||||||
script: |
|
|
||||||
const { data: tests_dir } = await github.rest.repos.getContent({
|
|
||||||
owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
|
|
||||||
repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
|
|
||||||
path: 'tests',
|
|
||||||
ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
|
|
||||||
});
|
|
||||||
|
|
||||||
const { data: tests_models_dir } = await github.rest.repos.getContent({
|
|
||||||
owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
|
|
||||||
repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
|
|
||||||
path: 'tests/models',
|
|
||||||
ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
|
|
||||||
});
|
|
||||||
|
|
||||||
const { data: tests_quantization_dir } = await github.rest.repos.getContent({
|
|
||||||
owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
|
|
||||||
repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
|
|
||||||
path: 'tests/quantization',
|
|
||||||
ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
|
|
||||||
});
|
|
||||||
|
|
||||||
core.setOutput('tests_dir', tests_dir);
|
|
||||||
core.setOutput('tests_models_dir', tests_models_dir);
|
|
||||||
core.setOutput('tests_quantization_dir', tests_quantization_dir);
|
|
||||||
|
|
||||||
# This checkout to the main branch
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
with:
|
|
||||||
fetch-depth: "0"
|
|
||||||
|
|
||||||
- name: Write pr_files file
|
|
||||||
run: |
|
|
||||||
cat > pr_files.txt << 'EOF'
|
|
||||||
${{ needs.get-pr-info.outputs.PR_FILES }}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
- name: Write tests_dir file
|
|
||||||
run: |
|
|
||||||
cat > tests_dir.txt << 'EOF'
|
|
||||||
${{ steps.repo_content.outputs.tests_dir }}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
- name: Write tests_models_dir file
|
|
||||||
run: |
|
|
||||||
cat > tests_models_dir.txt << 'EOF'
|
|
||||||
${{ steps.repo_content.outputs.tests_models_dir }}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
- name: Write tests_quantization_dir file
|
|
||||||
run: |
|
|
||||||
cat > tests_quantization_dir.txt << 'EOF'
|
|
||||||
${{ steps.repo_content.outputs.tests_quantization_dir }}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
- name: Run script to get jobs to run
|
|
||||||
id: get_jobs
|
|
||||||
run: |
|
|
||||||
python utils/get_pr_run_slow_jobs.py | tee output.txt
|
|
||||||
echo "jobs_to_run: $(tail -n 1 output.txt)"
|
|
||||||
echo "jobs_to_run=$(tail -n 1 output.txt)" >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
send_comment:
|
|
||||||
# Will delete the previous comment and send a new one if:
|
|
||||||
# - either the content is changed
|
|
||||||
# - or the previous comment is 30 minutes or more old
|
|
||||||
name: Send a comment to suggest jobs to run
|
|
||||||
if: ${{ needs.get-jobs.outputs.jobs != '' }}
|
|
||||||
needs: [get-pr-number, get-jobs]
|
|
||||||
permissions:
|
|
||||||
pull-requests: write
|
|
||||||
runs-on: ubuntu-22.04
|
|
||||||
steps:
|
|
||||||
- name: Check and update comment if needed
|
|
||||||
uses: actions/github-script@v7
|
|
||||||
env:
|
|
||||||
BODY: "\n\nrun-slow: ${{ needs.get-jobs.outputs.jobs }}"
|
|
||||||
with:
|
|
||||||
script: |
|
|
||||||
const prNumber = ${{ needs.get-pr-number.outputs.PR_NUMBER }};
|
|
||||||
const commentPrefix = "**[For maintainers]** Suggested jobs to run (before merge)";
|
|
||||||
const thirtyMinutesAgo = new Date(Date.now() - 30 * 60 * 1000); // 30 minutes ago
|
|
||||||
const newBody = `${commentPrefix}${process.env.BODY}`;
|
|
||||||
|
|
||||||
// Get all comments on the PR
|
|
||||||
const { data: comments } = await github.rest.issues.listComments({
|
|
||||||
owner: context.repo.owner,
|
|
||||||
repo: context.repo.repo,
|
|
||||||
issue_number: prNumber
|
|
||||||
});
|
|
||||||
|
|
||||||
// Find existing comments that start with our prefix
|
|
||||||
const existingComments = comments.filter(comment =>
|
|
||||||
comment.user.login === 'github-actions[bot]' &&
|
|
||||||
comment.body.startsWith(commentPrefix)
|
|
||||||
);
|
|
||||||
|
|
||||||
let shouldCreateNewComment = true;
|
|
||||||
let commentsToDelete = [];
|
|
||||||
|
|
||||||
if (existingComments.length > 0) {
|
|
||||||
// Get the most recent comment
|
|
||||||
const mostRecentComment = existingComments
|
|
||||||
.sort((a, b) => new Date(b.created_at) - new Date(a.created_at))[0];
|
|
||||||
|
|
||||||
const commentDate = new Date(mostRecentComment.created_at);
|
|
||||||
const isOld = commentDate < thirtyMinutesAgo;
|
|
||||||
const isDifferentContent = mostRecentComment.body !== newBody;
|
|
||||||
|
|
||||||
console.log(`Most recent comment created: ${mostRecentComment.created_at}`);
|
|
||||||
console.log(`Is older than 30 minutes: ${isOld}`);
|
|
||||||
console.log(`Has different content: ${isDifferentContent}`);
|
|
||||||
|
|
||||||
if (isOld || isDifferentContent) {
|
|
||||||
// Delete all existing comments and create new one
|
|
||||||
commentsToDelete = existingComments;
|
|
||||||
console.log(`Will delete ${commentsToDelete.length} existing comment(s) and create new one`);
|
|
||||||
} else {
|
|
||||||
// Content is same and comment is recent, skip
|
|
||||||
shouldCreateNewComment = false;
|
|
||||||
console.log('Comment is recent and content unchanged, skipping update');
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
console.log('No existing comments found, will create new one');
|
|
||||||
}
|
|
||||||
|
|
||||||
// Delete old comments if needed
|
|
||||||
for (const comment of commentsToDelete) {
|
|
||||||
console.log(`Deleting comment #${comment.id} (created: ${comment.created_at})`);
|
|
||||||
await github.rest.issues.deleteComment({
|
|
||||||
owner: context.repo.owner,
|
|
||||||
repo: context.repo.repo,
|
|
||||||
comment_id: comment.id
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create new comment if needed
|
|
||||||
if (shouldCreateNewComment) {
|
|
||||||
await github.rest.issues.createComment({
|
|
||||||
owner: context.repo.owner,
|
|
||||||
repo: context.repo.repo,
|
|
||||||
issue_number: prNumber,
|
|
||||||
body: newBody
|
|
||||||
});
|
|
||||||
console.log('✅ New comment created');
|
|
||||||
} else {
|
|
||||||
console.log('ℹ️ No comment update needed');
|
|
||||||
}
|
|
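For reference, the refresh rule implemented by the `send_comment` job above (delete the previous bot comment and post a new one only when the body changed or the newest comment is at least 30 minutes old) can be restated outside of `actions/github-script`. The sketch below is illustrative Python only, not part of the workflow; the comment objects are assumed to be dicts shaped like the GitHub REST payload.

```python
from datetime import datetime, timedelta, timezone

# Illustrative sketch of the refresh rule used by the `send_comment` job:
# a new comment is posted only if the body changed or the newest bot
# comment is 30 minutes old or more. `comments` mimics the REST payload.
def should_refresh(comments, new_body,
                   prefix="**[For maintainers]** Suggested jobs to run (before merge)"):
    bot_comments = [
        c for c in comments
        if c["user"]["login"] == "github-actions[bot]" and c["body"].startswith(prefix)
    ]
    if not bot_comments:
        return True, []  # nothing to delete, just create a new comment
    newest = max(bot_comments, key=lambda c: c["created_at"])
    created = datetime.fromisoformat(newest["created_at"].replace("Z", "+00:00"))
    is_old = datetime.now(timezone.utc) - created >= timedelta(minutes=30)
    if is_old or newest["body"] != new_body:
        return True, bot_comments  # delete all previous bot comments, post a new one
    return False, []
```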
.github/workflows/push-important-models.yml (vendored, 250 lines changed)
@ -4,6 +4,17 @@ on:
|
|||||||
push:
|
push:
|
||||||
branches: [ main ]
|
branches: [ main ]
|
||||||
|
|
||||||
|
env:
|
||||||
|
OUTPUT_SLACK_CHANNEL_ID: "C06L2SGMEEA"
|
||||||
|
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||||
|
HF_HOME: /mnt/cache
|
||||||
|
TRANSFORMERS_IS_CI: yes
|
||||||
|
OMP_NUM_THREADS: 8
|
||||||
|
MKL_NUM_THREADS: 8
|
||||||
|
RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`.
|
||||||
|
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||||
|
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
get_modified_models:
|
get_modified_models:
|
||||||
name: "Get all modified files"
|
name: "Get all modified files"
|
||||||
@ -14,144 +25,111 @@ jobs:
|
|||||||
- name: Check out code
|
- name: Check out code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Get changed files using `actions/github-script`
|
- name: Get changed files
|
||||||
id: get-changed-files
|
id: changed-files
|
||||||
uses: actions/github-script@v7
|
uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c
|
||||||
with:
|
with:
|
||||||
script: |
|
files: src/transformers/models/**
|
||||||
let files = [];
|
|
||||||
|
|
||||||
// Only handle push events
|
|
||||||
if (context.eventName === 'push') {
|
|
||||||
const afterSha = context.payload.after;
|
|
||||||
const branchName = context.payload.ref.replace('refs/heads/', '');
|
|
||||||
|
|
||||||
let baseSha;
|
|
||||||
|
|
||||||
if (branchName === 'main') {
|
|
||||||
console.log('Push to main branch, comparing to parent commit');
|
|
||||||
// Get the parent commit of the pushed commit
|
|
||||||
const { data: commit } = await github.rest.repos.getCommit({
|
|
||||||
owner: context.repo.owner,
|
|
||||||
repo: context.repo.repo,
|
|
||||||
ref: afterSha
|
|
||||||
});
|
|
||||||
baseSha = commit.parents[0]?.sha;
|
|
||||||
if (!baseSha) {
|
|
||||||
throw new Error('No parent commit found for the pushed commit');
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
console.log(`Push to branch ${branchName}, comparing to main`);
|
|
||||||
baseSha = 'main';
|
|
||||||
}
|
|
||||||
|
|
||||||
const { data: comparison } = await github.rest.repos.compareCommits({
|
|
||||||
owner: context.repo.owner,
|
|
||||||
repo: context.repo.repo,
|
|
||||||
base: baseSha,
|
|
||||||
head: afterSha
|
|
||||||
});
|
|
||||||
|
|
||||||
// Include added, modified, and renamed files
|
|
||||||
files = comparison.files
|
|
||||||
.filter(file => file.status === 'added' || file.status === 'modified' || file.status === 'renamed')
|
|
||||||
.map(file => file.filename);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Include all files under src/transformers/ (not just models subdirectory)
|
|
||||||
const filteredFiles = files.filter(file =>
|
|
||||||
file.startsWith('src/transformers/')
|
|
||||||
);
|
|
||||||
|
|
||||||
core.setOutput('changed_files', filteredFiles.join(' '));
|
|
||||||
core.setOutput('any_changed', filteredFiles.length > 0 ? 'true' : 'false');
|
|
||||||
|
|
||||||
- name: Parse changed files with Python
|
- name: Run step if only the files listed above change
|
||||||
if: steps.get-changed-files.outputs.any_changed == 'true'
|
if: steps.changed-files.outputs.any_changed == 'true'
|
||||||
env:
|
|
||||||
CHANGED_FILES: ${{ steps.get-changed-files.outputs.changed_files }}
|
|
||||||
id: set-matrix
|
id: set-matrix
|
||||||
|
env:
|
||||||
|
ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
|
||||||
run: |
|
run: |
|
||||||
python3 - << 'EOF'
|
model_arrays=()
|
||||||
import os
|
for file in $ALL_CHANGED_FILES; do
|
||||||
import sys
|
model_path="${file#*models/}"
|
||||||
import json
|
model_path="models/${model_path%%/*}"
|
||||||
|
if grep -qFx "$model_path" utils/important_models.txt; then
|
||||||
# Add the utils directory to Python path
|
# Append the file to the matrix string
|
||||||
sys.path.insert(0, 'utils')
|
model_arrays+=("$model_path")
|
||||||
|
fi
|
||||||
# Import the important models list
|
done
|
||||||
from important_files import IMPORTANT_MODELS
|
matrix_string=$(printf '"%s", ' "${model_arrays[@]}" | sed 's/, $//')
|
||||||
|
echo "matrix=[$matrix_string]" >> $GITHUB_OUTPUT
|
||||||
print(f"Important models: {IMPORTANT_MODELS}")
|
test_modified_files:
|
||||||
|
|
||||||
# Get the changed files from the previous step
|
|
||||||
changed_files_str = os.environ.get('CHANGED_FILES', '')
|
|
||||||
changed_files = changed_files_str.split() if changed_files_str else []
|
|
||||||
|
|
||||||
# Filter to only Python files
|
|
||||||
python_files = [f for f in changed_files if f.endswith('.py')]
|
|
||||||
print(f"Python files changed: {python_files}")
|
|
||||||
|
|
||||||
result_models = set()
|
|
||||||
|
|
||||||
# Specific files that trigger all models
|
|
||||||
transformers_utils_files = [
|
|
||||||
'modeling_utils.py',
|
|
||||||
'modeling_rope_utils.py',
|
|
||||||
'modeling_flash_attention_utils.py',
|
|
||||||
'modeling_attn_mask_utils.py',
|
|
||||||
'cache_utils.py',
|
|
||||||
'masking_utils.py',
|
|
||||||
'pytorch_utils.py'
|
|
||||||
]
|
|
||||||
|
|
||||||
# Single loop through all Python files
|
|
||||||
for file in python_files:
|
|
||||||
# Check for files under src/transformers/models/
|
|
||||||
if file.startswith('src/transformers/models/'):
|
|
||||||
remaining_path = file[len('src/transformers/models/'):]
|
|
||||||
if '/' in remaining_path:
|
|
||||||
model_dir = remaining_path.split('/')[0]
|
|
||||||
if model_dir in IMPORTANT_MODELS:
|
|
||||||
result_models.add(model_dir)
|
|
||||||
print(f"Added model directory: {model_dir}")
|
|
||||||
|
|
||||||
# Check for specific files under src/transformers/ or src/transformers/generation/ files
|
|
||||||
elif file.startswith('src/transformers/generation/') or \
|
|
||||||
(file.startswith('src/transformers/') and os.path.basename(file) in transformers_utils_files):
|
|
||||||
print(f"Found core file: {file} - including all important models")
|
|
||||||
result_models.update(IMPORTANT_MODELS)
|
|
||||||
break # No need to continue once we include all models
|
|
||||||
|
|
||||||
# Convert to sorted list and create matrix
|
|
||||||
result_list = sorted(list(result_models))
|
|
||||||
print(f"Final model list: {result_list}")
|
|
||||||
|
|
||||||
if result_list:
|
|
||||||
matrix_json = json.dumps(result_list)
|
|
||||||
print(f"matrix={matrix_json}")
|
|
||||||
|
|
||||||
# Write to GITHUB_OUTPUT
|
|
||||||
with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
|
|
||||||
f.write(f"matrix={matrix_json}\n")
|
|
||||||
else:
|
|
||||||
print("matrix=[]")
|
|
||||||
with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
|
|
||||||
f.write("matrix=[]\n")
|
|
||||||
EOF
|
|
||||||
|
|
||||||
model-ci:
|
|
||||||
name: Model CI
|
|
||||||
uses: ./.github/workflows/self-scheduled.yml
|
|
||||||
needs: get_modified_models
|
needs: get_modified_models
|
||||||
if: needs.get_modified_models.outputs.matrix != '' && needs.get_modified_models.outputs.matrix != '[]'
|
name: Slow & FA2 tests
|
||||||
with:
|
runs-on:
|
||||||
job: run_models_gpu
|
group: aws-g5-4xlarge-cache
|
||||||
slack_report_channel: "#transformers-ci-push"
|
container:
|
||||||
docker: huggingface/transformers-all-latest-gpu
|
image: huggingface/transformers-all-latest-gpu
|
||||||
ci_event: push
|
options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
report_repo_id: hf-internal-testing/transformers_ci_push
|
if: ${{ needs.get_modified_models.outputs.matrix != '[]' && needs.get_modified_models.outputs.matrix != '' && fromJson(needs.get_modified_models.outputs.matrix)[0] != null }}
|
||||||
commit_sha: ${{ github.sha }}
|
strategy:
|
||||||
models: ${{ needs.get_modified_models.outputs.matrix }}
|
fail-fast: false
|
||||||
secrets: inherit
|
matrix:
|
||||||
|
model-name: ${{ fromJson(needs.get_modified_models.outputs.matrix) }}
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Check out code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install locally transformers & other libs
|
||||||
|
run: |
|
||||||
|
apt install sudo
|
||||||
|
sudo -H pip install --upgrade pip
|
||||||
|
sudo -H pip uninstall -y transformers
|
||||||
|
sudo -H pip install -U -e ".[testing]"
|
||||||
|
MAX_JOBS=4 pip install flash-attn --no-build-isolation
|
||||||
|
pip install bitsandbytes
|
||||||
|
|
||||||
|
- name: NVIDIA-SMI
|
||||||
|
run: |
|
||||||
|
nvidia-smi
|
||||||
|
|
||||||
|
- name: Show installed libraries and their versions
|
||||||
|
run: pip freeze
|
||||||
|
|
||||||
|
- name: Run FA2 tests
|
||||||
|
id: run_fa2_tests
|
||||||
|
run:
|
||||||
|
pytest -rsfE -m "flash_attn_test" --make-reports=${{ matrix.model-name }}_fa2_tests/ tests/${{ matrix.model-name }}/test_modeling_*
|
||||||
|
|
||||||
|
- name: "Test suite reports artifacts: ${{ matrix.model-name }}_fa2_tests"
|
||||||
|
if: ${{ always() }}
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: ${{ matrix.model-name }}_fa2_tests
|
||||||
|
path: /transformers/reports/${{ matrix.model-name }}_fa2_tests
|
||||||
|
|
||||||
|
- name: Post to Slack
|
||||||
|
if: always()
|
||||||
|
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||||
|
with:
|
||||||
|
slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }}
|
||||||
|
title: 🤗 Results of the FA2 tests - ${{ matrix.model-name }}
|
||||||
|
status: ${{ steps.run_fa2_tests.conclusion}}
|
||||||
|
slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
||||||
|
|
||||||
|
- name: Run integration tests
|
||||||
|
id: run_integration_tests
|
||||||
|
if: always()
|
||||||
|
run:
|
||||||
|
pytest -rsfE -k "IntegrationTest" --make-reports=tests_integration_${{ matrix.model-name }} tests/${{ matrix.model-name }}/test_modeling_*
|
||||||
|
|
||||||
|
- name: "Test suite reports artifacts: tests_integration_${{ matrix.model-name }}"
|
||||||
|
if: ${{ always() }}
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: tests_integration_${{ matrix.model-name }}
|
||||||
|
path: /transformers/reports/tests_integration_${{ matrix.model-name }}
|
||||||
|
|
||||||
|
- name: Post to Slack
|
||||||
|
if: always()
|
||||||
|
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||||
|
with:
|
||||||
|
slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }}
|
||||||
|
title: 🤗 Results of the Integration tests - ${{ matrix.model-name }}
|
||||||
|
status: ${{ steps.run_integration_tests.conclusion}}
|
||||||
|
slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
||||||
|
|
||||||
|
- name: Tailscale # In order to be able to SSH when a test fails
|
||||||
|
if: ${{ runner.debug == '1'}}
|
||||||
|
uses: huggingface/tailscale-action@v1
|
||||||
|
with:
|
||||||
|
authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
|
||||||
|
slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
|
||||||
|
slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||||
|
waitForSSH: true
|
||||||
|
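As a rough sketch of the matrix logic diffed above (the bash loop on one side, the Python heredoc on the other), the mapping from changed files to the `matrix` output looks roughly like the following. This is illustrative only: `IMPORTANT_MODELS` is a placeholder stand-in for `utils/important_files.py`, and the sample file list is invented.

```python
import json
import os

# Hypothetical stand-in for utils/important_files.IMPORTANT_MODELS
IMPORTANT_MODELS = ["llama", "gemma3", "qwen2"]

# Core files under src/transformers/ that trigger all important models
CORE_FILES = {
    "modeling_utils.py", "modeling_rope_utils.py", "modeling_flash_attention_utils.py",
    "modeling_attn_mask_utils.py", "cache_utils.py", "masking_utils.py", "pytorch_utils.py",
}

def models_to_test(changed_files):
    """Mirror of the heredoc script: collect model dirs for touched important
    models, or return every important model when a core file changes."""
    selected = set()
    for path in changed_files:
        if not path.endswith(".py"):
            continue
        if path.startswith("src/transformers/models/"):
            model_dir = path[len("src/transformers/models/"):].split("/")[0]
            if model_dir in IMPORTANT_MODELS:
                selected.add(model_dir)
        elif path.startswith("src/transformers/generation/") or (
            path.startswith("src/transformers/") and os.path.basename(path) in CORE_FILES
        ):
            return sorted(IMPORTANT_MODELS)
    return sorted(selected)

# e.g. written as `matrix=["llama"]` to $GITHUB_OUTPUT
print(f"matrix={json.dumps(models_to_test(['src/transformers/models/llama/modeling_llama.py']))}")
```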
.github/workflows/self-comment-ci.yml (vendored, 14 lines changed)
@@ -29,7 +29,7 @@ jobs:
    runs-on: ubuntu-22.04
    name: Get PR number
    # For security: only allow team members to run
-   if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+   if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
@@ -185,7 +185,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.models) }}
-       machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+       machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@@ -239,9 +239,9 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-         if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+         if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
-         elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+         elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@@ -292,7 +292,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
-       machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+       machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@@ -338,9 +338,9 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-         if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+         if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
-         elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+         elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
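The `machine_type` normalization that recurs in these jobs (and again in the diffs below) boils down to a small lookup from runner group to report label. The sketch below restates it in Python for clarity, with the group names taken from the diffs themselves.

```python
# Sketch of the report-name normalization used in the workflows above:
# cache runner groups collapse to single-gpu / multi-gpu, anything else
# is passed through unchanged.
SINGLE_GPU_GROUPS = {"aws-g5-4xlarge-cache", "aws-g4dn-2xlarge-cache", "aws-g4dn-4xlarge-cache"}
MULTI_GPU_GROUPS = {"aws-g5-12xlarge-cache", "aws-g4dn-12xlarge-cache"}

def normalize_machine_type(group: str) -> str:
    if group in SINGLE_GPU_GROUPS:
        return "single-gpu"
    if group in MULTI_GPU_GROUPS:
        return "multi-gpu"
    return group

assert normalize_machine_type("aws-g5-12xlarge-cache") == "multi-gpu"
```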
.github/workflows/self-nightly-caller.yml (vendored, 61 lines changed)
@ -1,56 +1,43 @@
|
|||||||
name: Nvidia CI with nightly torch
|
name: Self-hosted runner (nightly-ci)
|
||||||
|
|
||||||
|
|
||||||
on:
|
on:
|
||||||
repository_dispatch:
|
repository_dispatch:
|
||||||
# triggered when the daily scheduled Nvidia CI is completed.
|
schedule:
|
||||||
# This way, we can compare the results more easily.
|
- cron: "17 2 * * *"
|
||||||
workflow_run:
|
|
||||||
workflows: ["Nvidia CI"]
|
|
||||||
branches: ["main"]
|
|
||||||
types: [completed]
|
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- run_ci_with_nightly_torch*
|
- run_nightly_ci*
|
||||||
|
|
||||||
# Used for `push` to easily modify the target workflow runs to compare against
|
|
||||||
env:
|
|
||||||
prev_workflow_run_id: ""
|
|
||||||
other_workflow_run_id: ""
|
|
||||||
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build_nightly_torch_ci_images:
|
build_nightly_ci_images:
|
||||||
name: Build CI Docker Images with nightly torch
|
name: Build Nightly CI Docker Images
|
||||||
|
if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci'))
|
||||||
uses: ./.github/workflows/build-nightly-ci-docker-images.yml
|
uses: ./.github/workflows/build-nightly-ci-docker-images.yml
|
||||||
with:
|
|
||||||
job: latest-with-torch-nightly-docker
|
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
setup:
|
|
||||||
name: Setup
|
|
||||||
runs-on: ubuntu-22.04
|
|
||||||
steps:
|
|
||||||
- name: Setup
|
|
||||||
run: |
|
|
||||||
mkdir "setup_values"
|
|
||||||
echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
|
|
||||||
echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
|
|
||||||
|
|
||||||
- name: Upload artifacts
|
|
||||||
uses: actions/upload-artifact@v4
|
|
||||||
with:
|
|
||||||
name: setup_values
|
|
||||||
path: setup_values
|
|
||||||
|
|
||||||
model-ci:
|
model-ci:
|
||||||
name: Model CI
|
name: Model CI
|
||||||
needs: build_nightly_torch_ci_images
|
needs: [build_nightly_ci_images]
|
||||||
uses: ./.github/workflows/self-scheduled.yml
|
uses: ./.github/workflows/self-scheduled.yml
|
||||||
with:
|
with:
|
||||||
job: run_models_gpu
|
job: run_models_gpu
|
||||||
slack_report_channel: "#transformers-ci-past-future"
|
slack_report_channel: "#transformers-ci-past-future"
|
||||||
|
runner: ci
|
||||||
docker: huggingface/transformers-all-latest-torch-nightly-gpu
|
docker: huggingface/transformers-all-latest-torch-nightly-gpu
|
||||||
ci_event: Nightly CI
|
ci_event: Nightly CI
|
||||||
report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly
|
secrets: inherit
|
||||||
commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }}
|
|
||||||
|
deepspeed-ci:
|
||||||
|
name: DeepSpeed CI
|
||||||
|
needs: [build_nightly_ci_images]
|
||||||
|
uses: ./.github/workflows/self-scheduled.yml
|
||||||
|
with:
|
||||||
|
job: run_torch_cuda_extensions_gpu
|
||||||
|
slack_report_channel: "#transformers-ci-past-future"
|
||||||
|
runner: ci
|
||||||
|
# test deepspeed nightly build with the latest release torch
|
||||||
|
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||||
|
ci_event: Nightly CI
|
||||||
|
working-directory-prefix: /workspace
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
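The `Setup` job on one side of this diff simply serializes the two workflow-run IDs into small text files that later report jobs download as the `setup_values` artifact. A minimal Python equivalent, with both IDs left as placeholder empty strings, would be:

```python
from pathlib import Path

# Minimal equivalent of the `Setup` step: persist the two run IDs so that
# downstream report jobs can pick them up from the `setup_values` artifact.
# Both values are placeholders here.
prev_workflow_run_id = ""
other_workflow_run_id = ""

out = Path("setup_values")
out.mkdir(exist_ok=True)
(out / "prev_workflow_run_id.txt").write_text(f"{prev_workflow_run_id}\n")
(out / "other_workflow_run_id.txt").write_text(f"{other_workflow_run_id}\n")
```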
.github/workflows/self-push-amd-mi300-caller.yml (vendored, new file, 25 lines)
@@ -0,0 +1,25 @@
+name: Self-hosted runner (AMD mi300 CI caller)
+
+on:
+  #workflow_run:
+  #  workflows: ["Self-hosted runner (push-caller)"]
+  #  branches: ["main"]
+  #  types: [completed]
+  push:
+    branches:
+      - run_amd_push_ci_caller*
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+      - "utils/**"
+
+jobs:
+  run_amd_ci:
+    name: AMD mi300
+    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci'))))
+    uses: ./.github/workflows/self-push-amd.yml
+    with:
+      gpu_flavor: mi300
+    secrets: inherit
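The `if:` condition of `run_amd_ci` can be read as a small predicate over the event name and branch name. The sketch below restates it in Python purely for readability; it is not used anywhere in the workflow.

```python
# Restatement of the `run_amd_ci` gating condition, for readability only.
def should_run_amd_push_ci(event_name: str, ref_name: str, cancelled: bool = False) -> bool:
    if cancelled:
        return False
    if event_name == "workflow_run":
        return True
    return event_name == "push" and (
        ref_name.startswith("run_amd_push_ci_caller") or ref_name.startswith("mi300-ci")
    )

assert should_run_amd_push_ci("push", "mi300-ci-test")
```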
.github/workflows/self-push.yml (vendored, 32 lines changed)
@ -31,12 +31,12 @@ jobs:
|
|||||||
name: Setup
|
name: Setup
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||||
runs-on:
|
runs-on:
|
||||||
group: '${{ matrix.machine_type }}'
|
group: '${{ matrix.machine_type }}'
|
||||||
container:
|
container:
|
||||||
image: huggingface/transformers-all-latest-gpu-push-ci
|
image: huggingface/transformers-all-latest-gpu-push-ci
|
||||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
outputs:
|
outputs:
|
||||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||||
test_map: ${{ steps.set-matrix.outputs.test_map }}
|
test_map: ${{ steps.set-matrix.outputs.test_map }}
|
||||||
@ -131,12 +131,12 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
||||||
machine_type: [aws-g5-4xlarge-cache]
|
machine_type: [aws-g4dn-2xlarge-cache]
|
||||||
runs-on:
|
runs-on:
|
||||||
group: '${{ matrix.machine_type }}'
|
group: '${{ matrix.machine_type }}'
|
||||||
container:
|
container:
|
||||||
image: huggingface/transformers-all-latest-gpu-push-ci
|
image: huggingface/transformers-all-latest-gpu-push-ci
|
||||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
env:
|
env:
|
||||||
# For the meaning of these environment variables, see the job `Setup`
|
# For the meaning of these environment variables, see the job `Setup`
|
||||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||||
@ -169,9 +169,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
echo "${{ matrix.machine_type }}"
|
echo "${{ matrix.machine_type }}"
|
||||||
|
|
||||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
|
||||||
machine_type=single-gpu
|
machine_type=single-gpu
|
||||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||||
machine_type=multi-gpu
|
machine_type=multi-gpu
|
||||||
else
|
else
|
||||||
machine_type=${{ matrix.machine_type }}
|
machine_type=${{ matrix.machine_type }}
|
||||||
@ -244,7 +244,7 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
||||||
machine_type: [aws-g5-12xlarge-cache]
|
machine_type: [aws-g4dn-12xlarge-cache]
|
||||||
runs-on:
|
runs-on:
|
||||||
group: '${{ matrix.machine_type }}'
|
group: '${{ matrix.machine_type }}'
|
||||||
container:
|
container:
|
||||||
@ -282,9 +282,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
echo "${{ matrix.machine_type }}"
|
echo "${{ matrix.machine_type }}"
|
||||||
|
|
||||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
|
||||||
machine_type=single-gpu
|
machine_type=single-gpu
|
||||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||||
machine_type=multi-gpu
|
machine_type=multi-gpu
|
||||||
else
|
else
|
||||||
machine_type=${{ matrix.machine_type }}
|
machine_type=${{ matrix.machine_type }}
|
||||||
@ -357,12 +357,12 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [aws-g5-4xlarge-cache]
|
machine_type: [aws-g4dn-2xlarge-cache]
|
||||||
runs-on:
|
runs-on:
|
||||||
group: '${{ matrix.machine_type }}'
|
group: '${{ matrix.machine_type }}'
|
||||||
container:
|
container:
|
||||||
image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
|
image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
|
||||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
env:
|
env:
|
||||||
# For the meaning of these environment variables, see the job `Setup`
|
# For the meaning of these environment variables, see the job `Setup`
|
||||||
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
CI_BRANCH_PUSH: ${{ github.event.ref }}
|
||||||
@ -395,9 +395,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
echo "${{ matrix.machine_type }}"
|
echo "${{ matrix.machine_type }}"
|
||||||
|
|
||||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
|
||||||
machine_type=single-gpu
|
machine_type=single-gpu
|
||||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||||
machine_type=multi-gpu
|
machine_type=multi-gpu
|
||||||
else
|
else
|
||||||
machine_type=${{ matrix.machine_type }}
|
machine_type=${{ matrix.machine_type }}
|
||||||
@ -467,7 +467,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [aws-g5-12xlarge-cache]
|
machine_type: [aws-g4dn-12xlarge-cache]
|
||||||
runs-on:
|
runs-on:
|
||||||
group: '${{ matrix.machine_type }}'
|
group: '${{ matrix.machine_type }}'
|
||||||
container:
|
container:
|
||||||
@ -505,9 +505,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
echo "${{ matrix.machine_type }}"
|
echo "${{ matrix.machine_type }}"
|
||||||
|
|
||||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
|
||||||
machine_type=single-gpu
|
machine_type=single-gpu
|
||||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||||
machine_type=multi-gpu
|
machine_type=multi-gpu
|
||||||
else
|
else
|
||||||
machine_type=${{ matrix.machine_type }}
|
machine_type=${{ matrix.machine_type }}
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
name: Self-hosted runner scale set (AMD mi325 scheduled CI caller)
|
name: Self-hosted runner scale set (AMD mi300 scheduled CI caller)
|
||||||
|
|
||||||
# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
|
# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
|
||||||
# For example, 1gpu scale set: amd-mi325-ci-1gpu
|
# For example, 1gpu scale set: amd-mi300-ci-1gpu
|
||||||
# 2gpu scale set: amd-mi325-ci-2gpu
|
# 2gpu scale set: amd-mi300-ci-2gpu
|
||||||
|
|
||||||
on:
|
on:
|
||||||
workflow_run:
|
workflow_run:
|
||||||
@ -20,11 +20,10 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
job: run_models_gpu
|
job: run_models_gpu
|
||||||
slack_report_channel: "#amd-hf-ci"
|
slack_report_channel: "#amd-hf-ci"
|
||||||
runner_scale_set: amd-mi325-ci
|
runner_scale_set: amd-mi300-ci
|
||||||
docker: huggingface/transformers-pytorch-amd-gpu
|
docker: huggingface/transformers-pytorch-amd-gpu
|
||||||
ci_event: Scheduled CI (AMD) - mi325
|
ci_event: Scheduled CI (AMD) - mi300
|
||||||
report_repo_id: optimum-amd/transformers_daily_ci
|
report_repo_id: optimum-amd/transformers_daily_ci
|
||||||
env_file: /etc/podinfo/gha-gpu-isolation-settings
|
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
torch-pipeline:
|
torch-pipeline:
|
||||||
@ -33,11 +32,10 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
job: run_pipelines_torch_gpu
|
job: run_pipelines_torch_gpu
|
||||||
slack_report_channel: "#amd-hf-ci"
|
slack_report_channel: "#amd-hf-ci"
|
||||||
runner_scale_set: amd-mi325-ci
|
runner_scale_set: amd-mi300-ci
|
||||||
docker: huggingface/transformers-pytorch-amd-gpu
|
docker: huggingface/transformers-pytorch-amd-gpu
|
||||||
ci_event: Scheduled CI (AMD) - mi325
|
ci_event: Scheduled CI (AMD) - mi300
|
||||||
report_repo_id: optimum-amd/transformers_daily_ci
|
report_repo_id: optimum-amd/transformers_daily_ci
|
||||||
env_file: /etc/podinfo/gha-gpu-isolation-settings
|
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
example-ci:
|
example-ci:
|
||||||
@ -46,11 +44,10 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
job: run_examples_gpu
|
job: run_examples_gpu
|
||||||
slack_report_channel: "#amd-hf-ci"
|
slack_report_channel: "#amd-hf-ci"
|
||||||
runner_scale_set: amd-mi325-ci
|
runner_scale_set: amd-mi300-ci
|
||||||
docker: huggingface/transformers-pytorch-amd-gpu
|
docker: huggingface/transformers-pytorch-amd-gpu
|
||||||
ci_event: Scheduled CI (AMD) - mi325
|
ci_event: Scheduled CI (AMD) - mi300
|
||||||
report_repo_id: optimum-amd/transformers_daily_ci
|
report_repo_id: optimum-amd/transformers_daily_ci
|
||||||
env_file: /etc/podinfo/gha-gpu-isolation-settings
|
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
deepspeed-ci:
|
deepspeed-ci:
|
||||||
@ -59,9 +56,8 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
job: run_torch_cuda_extensions_gpu
|
job: run_torch_cuda_extensions_gpu
|
||||||
slack_report_channel: "#amd-hf-ci"
|
slack_report_channel: "#amd-hf-ci"
|
||||||
runner_scale_set: amd-mi325-ci
|
runner_scale_set: amd-mi300-ci
|
||||||
docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
|
docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
|
||||||
ci_event: Scheduled CI (AMD) - mi325
|
ci_event: Scheduled CI (AMD) - mi300
|
||||||
report_repo_id: optimum-amd/transformers_daily_ci
|
report_repo_id: optimum-amd/transformers_daily_ci
|
||||||
env_file: /etc/podinfo/gha-gpu-isolation-settings
|
|
||||||
secrets: inherit
|
secrets: inherit
|
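As the comments in this caller point out, the effective runner scale set name is just the prefix passed by the caller plus a GPU-count suffix. A throwaway sketch, using names copied from the diff:

```python
# The runner scale set name is the caller's prefix plus a GPU-count suffix,
# e.g. "amd-mi325-ci" + "1gpu" -> "amd-mi325-ci-1gpu".
def runner_name(scale_set: str, machine_type: str) -> str:
    return f"{scale_set}-{machine_type}"

assert runner_name("amd-mi325-ci", "1gpu") == "amd-mi325-ci-1gpu"
```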
@ -1,63 +0,0 @@
|
|||||||
name: Self-hosted runner scale set (AMD mi355 scheduled CI caller)
|
|
||||||
|
|
||||||
# Note: For every job in this workflow, the name of the runner scale set is finalized in the runner yaml i.e. huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml
|
|
||||||
# For example, 1gpu : amd-mi355-ci-1gpu
|
|
||||||
# 2gpu : amd-mi355-ci-2gpu
|
|
||||||
|
|
||||||
on:
|
|
||||||
workflow_run:
|
|
||||||
workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
|
|
||||||
branches: ["main"]
|
|
||||||
types: [completed]
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- run_amd_scheduled_ci_caller*
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
model-ci:
|
|
||||||
name: Model CI
|
|
||||||
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
|
|
||||||
with:
|
|
||||||
job: run_models_gpu
|
|
||||||
slack_report_channel: "#amd-hf-ci"
|
|
||||||
runner_scale_set: amd-mi355-ci
|
|
||||||
docker: huggingface/testing-rocm7.0-preview
|
|
||||||
ci_event: Scheduled CI (AMD) - mi355
|
|
||||||
report_repo_id: optimum-amd/transformers_daily_ci
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
torch-pipeline:
|
|
||||||
name: Torch pipeline CI
|
|
||||||
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
|
|
||||||
with:
|
|
||||||
job: run_pipelines_torch_gpu
|
|
||||||
slack_report_channel: "#amd-hf-ci"
|
|
||||||
runner_scale_set: amd-mi355-ci
|
|
||||||
docker: huggingface/testing-rocm7.0-preview
|
|
||||||
ci_event: Scheduled CI (AMD) - mi355
|
|
||||||
report_repo_id: optimum-amd/transformers_daily_ci
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
example-ci:
|
|
||||||
name: Example CI
|
|
||||||
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
|
|
||||||
with:
|
|
||||||
job: run_examples_gpu
|
|
||||||
slack_report_channel: "#amd-hf-ci"
|
|
||||||
runner_scale_set: amd-mi355-ci
|
|
||||||
docker: huggingface/testing-rocm7.0-preview
|
|
||||||
ci_event: Scheduled CI (AMD) - mi355
|
|
||||||
report_repo_id: optimum-amd/transformers_daily_ci
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
deepspeed-ci:
|
|
||||||
name: DeepSpeed CI
|
|
||||||
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled_arc_scale_set.yaml@main
|
|
||||||
with:
|
|
||||||
job: run_torch_cuda_extensions_gpu
|
|
||||||
slack_report_channel: "#amd-hf-ci"
|
|
||||||
runner_scale_set: amd-mi355-ci
|
|
||||||
docker: huggingface/testing-rocm7.0-preview
|
|
||||||
ci_event: Scheduled CI (AMD) - mi355
|
|
||||||
report_repo_id: optimum-amd/transformers_daily_ci
|
|
||||||
secrets: inherit
|
|
.github/workflows/self-scheduled-caller.yml (vendored, 73 lines changed)
@ -1,4 +1,5 @@
|
|||||||
name: Nvidia CI
|
name: Self-hosted runner (scheduled)
|
||||||
|
|
||||||
|
|
||||||
on:
|
on:
|
||||||
repository_dispatch:
|
repository_dispatch:
|
||||||
@ -6,7 +7,7 @@ on:
|
|||||||
- cron: "17 2 * * *"
|
- cron: "17 2 * * *"
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- run_nvidia_ci*
|
- trigger-remove-script-datasets-in-tests
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
inputs:
|
inputs:
|
||||||
prev_workflow_run_id:
|
prev_workflow_run_id:
|
||||||
@ -21,10 +22,10 @@ on:
|
|||||||
default: ""
|
default: ""
|
||||||
|
|
||||||
|
|
||||||
# Used for `push` to easily modify the target workflow runs to compare against
|
# Used for `push` to easily modiffy the target workflow runs to compare against
|
||||||
env:
|
env:
|
||||||
prev_workflow_run_id: ""
|
prev_workflow_run_id: ""
|
||||||
other_workflow_run_id: ""
|
other_workflow_run_id: "15770139098"
|
||||||
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
@ -50,70 +51,8 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
job: run_models_gpu
|
job: run_models_gpu
|
||||||
slack_report_channel: "#transformers-ci-daily-models"
|
slack_report_channel: "#transformers-ci-daily-models"
|
||||||
docker: huggingface/transformers-all-latest-gpu
|
runner: daily-ci
|
||||||
ci_event: Daily CI
|
|
||||||
runner_type: "a10"
|
|
||||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
|
||||||
commit_sha: ${{ github.sha }}
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
torch-pipeline:
|
|
||||||
name: Torch pipeline CI
|
|
||||||
uses: ./.github/workflows/self-scheduled.yml
|
|
||||||
with:
|
|
||||||
job: run_pipelines_torch_gpu
|
|
||||||
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
|
|
||||||
docker: huggingface/transformers-pytorch-gpu
|
|
||||||
ci_event: Daily CI
|
|
||||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
|
||||||
commit_sha: ${{ github.sha }}
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
example-ci:
|
|
||||||
name: Example CI
|
|
||||||
uses: ./.github/workflows/self-scheduled.yml
|
|
||||||
with:
|
|
||||||
job: run_examples_gpu
|
|
||||||
slack_report_channel: "#transformers-ci-daily-examples"
|
|
||||||
docker: huggingface/transformers-all-latest-gpu
|
docker: huggingface/transformers-all-latest-gpu
|
||||||
ci_event: Daily CI
|
ci_event: Daily CI
|
||||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||||
commit_sha: ${{ github.sha }}
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
trainer-fsdp-ci:
|
|
||||||
name: Trainer/FSDP CI
|
|
||||||
uses: ./.github/workflows/self-scheduled.yml
|
|
||||||
with:
|
|
||||||
job: run_trainer_and_fsdp_gpu
|
|
||||||
slack_report_channel: "#transformers-ci-daily-training"
|
|
||||||
docker: huggingface/transformers-all-latest-gpu
|
|
||||||
ci_event: Daily CI
|
|
||||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
|
||||||
commit_sha: ${{ github.sha }}
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
deepspeed-ci:
|
|
||||||
name: DeepSpeed CI
|
|
||||||
uses: ./.github/workflows/self-scheduled.yml
|
|
||||||
with:
|
|
||||||
job: run_torch_cuda_extensions_gpu
|
|
||||||
slack_report_channel: "#transformers-ci-daily-training"
|
|
||||||
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
|
||||||
ci_event: Daily CI
|
|
||||||
working-directory-prefix: /workspace
|
|
||||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
|
||||||
commit_sha: ${{ github.sha }}
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
quantization-ci:
|
|
||||||
name: Quantization CI
|
|
||||||
uses: ./.github/workflows/self-scheduled.yml
|
|
||||||
with:
|
|
||||||
job: run_quantization_torch_gpu
|
|
||||||
slack_report_channel: "#transformers-ci-daily-quantization"
|
|
||||||
docker: huggingface/transformers-quantization-latest-gpu
|
|
||||||
ci_event: Daily CI
|
|
||||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
|
||||||
commit_sha: ${{ github.sha }}
|
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
.github/workflows/self-scheduled-intel-gaudi.yml (vendored, 342 lines changed)
@ -1,342 +0,0 @@
|
|||||||
name: Self-hosted runner (scheduled-intel-gaudi)
|
|
||||||
|
|
||||||
on:
|
|
||||||
workflow_call:
|
|
||||||
inputs:
|
|
||||||
job:
|
|
||||||
required: true
|
|
||||||
type: string
|
|
||||||
slack_report_channel:
|
|
||||||
required: true
|
|
||||||
type: string
|
|
||||||
runner_scale_set:
|
|
||||||
required: true
|
|
||||||
type: string
|
|
||||||
ci_event:
|
|
||||||
required: true
|
|
||||||
type: string
|
|
||||||
report_repo_id:
|
|
||||||
required: true
|
|
||||||
type: string
|
|
||||||
|
|
||||||
env:
|
|
||||||
NUM_SLICES: 2
|
|
||||||
RUN_SLOW: yes
|
|
||||||
PT_HPU_LAZY_MODE: 0
|
|
||||||
TRANSFORMERS_IS_CI: yes
|
|
||||||
PT_ENABLE_INT64_SUPPORT: 1
|
|
||||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
|
||||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
|
||||||
HF_HOME: /mnt/cache/.cache/huggingface
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
setup:
|
|
||||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
|
|
||||||
name: Setup
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
outputs:
|
|
||||||
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
|
||||||
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
|
||||||
quantization_matrix: ${{ steps.set-matrix.outputs.quantization_matrix }}
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Set up Python
|
|
||||||
uses: actions/setup-python@v5
|
|
||||||
with:
|
|
||||||
python-version: "3.10"
|
|
||||||
|
|
||||||
- id: set-matrix
|
|
||||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
|
|
||||||
name: Identify models to test
|
|
||||||
working-directory: tests
|
|
||||||
run: |
|
|
||||||
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
|
||||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
|
||||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
|
||||||
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
|
||||||
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
|
||||||
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
|
||||||
fi
|
|
||||||
|
|
||||||
- id: set-matrix-quantization
|
|
||||||
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
|
|
||||||
name: Identify quantization method to test
|
|
||||||
working-directory: tests
|
|
||||||
run: |
|
|
||||||
echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
run_models_gpu:
|
|
||||||
if: ${{ inputs.job == 'run_models_gpu' }}
|
|
||||||
name: " "
|
|
||||||
needs: setup
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
machine_type: [1gaudi, 2gaudi]
|
|
||||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
|
||||||
uses: ./.github/workflows/model_jobs_intel_gaudi.yml
|
|
||||||
with:
|
|
||||||
slice_id: ${{ matrix.slice_id }}
|
|
||||||
machine_type: ${{ matrix.machine_type }}
|
|
||||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
|
||||||
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
run_trainer_and_fsdp_gpu:
|
|
||||||
if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
|
|
||||||
name: " "
|
|
||||||
needs: setup
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
machine_type: [1gaudi, 2gaudi]
|
|
||||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
|
||||||
uses: ./.github/workflows/model_jobs_intel_gaudi.yml
|
|
||||||
with:
|
|
||||||
slice_id: ${{ matrix.slice_id }}
|
|
||||||
machine_type: ${{ matrix.machine_type }}
|
|
||||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
|
||||||
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
|
||||||
report_name_prefix: run_trainer_and_fsdp_gpu
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
run_pipelines_torch_gpu:
|
|
||||||
if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
|
|
||||||
name: Pipelines
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
machine_type: [1gaudi, 2gaudi]
|
|
||||||
runs-on:
|
|
||||||
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
|
||||||
container:
|
|
||||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
|
||||||
options: --runtime=habana
|
|
||||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
|
||||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
|
||||||
--env HABANA_VISIBLE_DEVICES
|
|
||||||
--env HABANA_VISIBLE_MODULES
|
|
||||||
--cap-add=sys_nice
|
|
||||||
--shm-size=64G
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
|
|
||||||
|
|
||||||
- name: HL-SMI
|
|
||||||
run: |
|
|
||||||
hl-smi
|
|
||||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
|
||||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
|
||||||
|
|
||||||
- name: Environment
|
|
||||||
run: python3 utils/print_env.py
|
|
||||||
|
|
||||||
- name: Show installed libraries and their versions
|
|
||||||
run: pip freeze
|
|
||||||
|
|
||||||
- name: Set `machine_type` for report and artifact names
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
|
|
||||||
machine_type=single-gpu
|
|
||||||
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
|
|
||||||
machine_type=multi-gpu
|
|
||||||
else
|
|
||||||
machine_type=${{ matrix.machine_type }}
|
|
||||||
fi
|
|
||||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
|
||||||
|
|
||||||
- name: Run all pipeline tests on Intel Gaudi
|
|
||||||
run: |
|
|
||||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
|
|
||||||
|
|
||||||
- name: Failure short reports
|
|
||||||
if: ${{ failure() }}
|
|
||||||
continue-on-error: true
|
|
||||||
run: |
|
|
||||||
cat reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
|
|
||||||
|
|
||||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
|
|
||||||
if: ${{ always() }}
|
|
||||||
uses: actions/upload-artifact@v4
|
|
||||||
with:
|
|
||||||
name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
|
|
||||||
path: reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
|
|
||||||
|
|
||||||
run_examples_gpu:
|
|
||||||
if: ${{ inputs.job == 'run_examples_gpu' }}
|
|
||||||
name: Examples directory
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
machine_type: [1gaudi]
|
|
||||||
runs-on:
|
|
||||||
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
|
||||||
container:
|
|
||||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
|
||||||
options: --runtime=habana
|
|
||||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
|
||||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
|
||||||
--env HABANA_VISIBLE_DEVICES
|
|
||||||
--env HABANA_VISIBLE_MODULES
|
|
||||||
--cap-add=sys_nice
|
|
||||||
--shm-size=64G
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
|
|
||||||
|
|
||||||
- name: HL-SMI
|
|
||||||
run: |
|
|
||||||
hl-smi
|
|
||||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
|
||||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
|
||||||
|
|
||||||
- name: Environment
|
|
||||||
run: |
|
|
||||||
python3 utils/print_env.py
|
|
||||||
|
|
||||||
- name: Show installed libraries and their versions
|
|
||||||
run: |
|
|
||||||
pip freeze
|
|
||||||
|
|
||||||
- name: Set `machine_type` for report and artifact names
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
|
|
||||||
machine_type=single-gpu
|
|
||||||
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
|
|
||||||
machine_type=multi-gpu
|
|
||||||
else
|
|
||||||
machine_type=${{ matrix.machine_type }}
|
|
||||||
fi
|
|
||||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
|
||||||
|
|
||||||
- name: Run examples tests on Intel Gaudi
|
|
||||||
run: |
|
|
||||||
pip install -r examples/pytorch/_tests_requirements.txt
|
|
||||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
|
|
||||||
|
|
||||||
- name: Failure short reports
|
|
||||||
if: ${{ failure() }}
|
|
||||||
continue-on-error: true
|
|
||||||
run: |
|
|
||||||
cat reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
|
|
||||||
|
|
||||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
|
|
||||||
if: ${{ always() }}
|
|
||||||
uses: actions/upload-artifact@v4
|
|
||||||
with:
|
|
||||||
name: ${{ env.machine_type }}_run_examples_gpu_test_reports
|
|
||||||
path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
|
|
||||||
|
|
||||||
run_torch_cuda_extensions_gpu:
|
|
||||||
if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
|
|
||||||
name: Intel Gaudi deepspeed tests
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
machine_type: [1gaudi, 2gaudi]
|
|
||||||
runs-on:
|
|
||||||
group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
|
||||||
container:
|
|
||||||
image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
|
||||||
options: --runtime=habana
|
|
||||||
-v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
|
|
||||||
--env OMPI_MCA_btl_vader_single_copy_mechanism=none
|
|
||||||
--env HABANA_VISIBLE_DEVICES
|
|
||||||
--env HABANA_VISIBLE_MODULES
|
|
||||||
--cap-add=sys_nice
|
|
||||||
--shm-size=64G
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
|
|
||||||
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
|
|
||||||
|
|
||||||
- name: HL-SMI
|
|
||||||
run: |
|
|
||||||
hl-smi
|
|
||||||
echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
|
|
||||||
echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
|
|
||||||
|
|
||||||
- name: Environment
|
|
||||||
run: |
|
|
||||||
python3 utils/print_env.py
|
|
||||||
|
|
||||||
- name: Show installed libraries and their versions
|
|
||||||
run: |
|
|
||||||
pip freeze
|
|
||||||
|
|
||||||
- name: Set `machine_type` for report and artifact names
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
|
|
||||||
machine_type=single-gpu
|
|
||||||
elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
|
|
||||||
machine_type=multi-gpu
|
|
||||||
else
|
|
||||||
machine_type=${{ matrix.machine_type }}
|
|
||||||
fi
|
|
||||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
|
||||||
|
|
||||||
- name: Run all deepspeed tests on intel Gaudi
|
|
||||||
run: |
|
|
||||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed -m "not not_device_test"
|
|
||||||
|
|
||||||
- name: Failure short reports
|
|
||||||
if: ${{ failure() }}
|
|
||||||
continue-on-error: true
|
|
||||||
run: |
|
|
||||||
cat reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
|
|
||||||
|
|
||||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
|
|
||||||
if: ${{ always() }}
|
|
||||||
uses: actions/upload-artifact@v4
|
|
||||||
with:
|
|
||||||
name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
|
|
||||||
path: reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
|
|
||||||
|
|
||||||
send_results:
|
|
||||||
name: Slack Report
|
|
||||||
needs:
|
|
||||||
[
|
|
||||||
setup,
|
|
||||||
run_models_gpu,
|
|
||||||
run_examples_gpu,
|
|
||||||
run_torch_cuda_extensions_gpu,
|
|
||||||
run_pipelines_torch_gpu,
|
|
||||||
run_trainer_and_fsdp_gpu,
|
|
||||||
]
|
|
||||||
if: ${{ always() }}
|
|
||||||
uses: ./.github/workflows/slack-report.yml
|
|
||||||
with:
|
|
||||||
job: ${{ inputs.job }}
|
|
||||||
setup_status: ${{ needs.setup.result }}
|
|
||||||
slack_report_channel: ${{ inputs.slack_report_channel }}
|
|
||||||
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
|
|
||||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
|
||||||
report_repo_id: ${{ inputs.report_repo_id }}
|
|
||||||
ci_event: ${{ inputs.ci_event }}
|
|
||||||
|
|
||||||
secrets: inherit
|
|
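The quantization matrix in the setup job above is produced by a fairly dense `python3 -c` one-liner. Spelled out, it is roughly the following; the only assumption is the `tests/quantization/<method>` directory layout that the one-liner itself relies on.

```python
import os

# Expanded form of the one-liner in the setup job: the quantization matrix is
# the sorted list of sub-directories under tests/quantization, expressed as
# "quantization/<name>" entries.
def quantization_matrix(tests_root: str = "tests") -> list[str]:
    quant_dir = os.path.join(tests_root, "quantization")
    candidates = [f"quantization/{x}" for x in os.listdir(quant_dir)]
    return sorted(d for d in candidates if os.path.isdir(os.path.join(tests_root, d)))

print(quantization_matrix())
```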
@ -1,67 +0,0 @@
|
|||||||
name: Self-hosted runner (Intel Gaudi3 scheduled CI caller)
|
|
||||||
|
|
||||||
on:
|
|
||||||
repository_dispatch:
|
|
||||||
workflow_dispatch:
|
|
||||||
schedule:
|
|
||||||
- cron: "17 2 * * *"
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
model-ci:
|
|
||||||
name: Model CI
|
|
||||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
|
||||||
with:
|
|
||||||
job: run_models_gpu
|
|
||||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
|
||||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
|
||||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
|
||||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
|
||||||
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
pipeline-ci:
|
|
||||||
name: Pipeline CI
|
|
||||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
|
||||||
with:
|
|
||||||
job: run_pipelines_torch_gpu
|
|
||||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
|
||||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
|
||||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
|
||||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
|
||||||
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
example-ci:
|
|
||||||
name: Example CI
|
|
||||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
|
||||||
with:
|
|
||||||
job: run_examples_gpu
|
|
||||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
|
||||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
|
||||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
|
||||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
|
||||||
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
deepspeed-ci:
|
|
||||||
name: DeepSpeed CI
|
|
||||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
|
||||||
with:
|
|
||||||
job: run_torch_cuda_extensions_gpu
|
|
||||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
|
||||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
|
||||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
|
||||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
|
||||||
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
trainer-fsdp-ci:
|
|
||||||
name: Trainer/FSDP CI
|
|
||||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
|
||||||
with:
|
|
||||||
job: run_trainer_and_fsdp_gpu
|
|
||||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
|
||||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
|
||||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
|
||||||
report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
|
|
||||||
secrets: inherit
|
|
.github/workflows/self-scheduled.yml (vendored, 80 lines changed)
@ -1,4 +1,4 @@
|
|||||||
name: Nvidia CI (job definitions)
|
name: Self-hosted runner (scheduled)
|
||||||
|
|
||||||
# Note that each job's dependencies go into a corresponding docker file.
|
# Note that each job's dependencies go into a corresponding docker file.
|
||||||
#
|
#
|
||||||
@ -15,6 +15,9 @@ on:
|
|||||||
slack_report_channel:
|
slack_report_channel:
|
||||||
required: true
|
required: true
|
||||||
type: string
|
type: string
|
||||||
|
runner:
|
||||||
|
required: true
|
||||||
|
type: string
|
||||||
docker:
|
docker:
|
||||||
required: true
|
required: true
|
||||||
type: string
|
type: string
|
||||||
@ -28,16 +31,7 @@ on:
|
|||||||
report_repo_id:
|
report_repo_id:
|
||||||
required: true
|
required: true
|
||||||
type: string
|
type: string
|
||||||
commit_sha:
|
|
||||||
required: false
|
|
||||||
type: string
|
|
||||||
runner_type:
|
|
||||||
required: false
|
|
||||||
type: string
|
|
||||||
models:
|
|
||||||
default: ""
|
|
||||||
required: false
|
|
||||||
type: string
|
|
||||||
|
|
||||||
env:
|
env:
|
||||||
HF_HOME: /mnt/cache
|
HF_HOME: /mnt/cache
|
||||||
@ -55,26 +49,25 @@ env:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
setup:
|
setup:
|
||||||
name: Setup
|
|
||||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
|
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
|
||||||
|
name: Setup
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||||
runs-on:
|
runs-on:
|
||||||
group: '${{ matrix.machine_type }}'
|
group: '${{ matrix.machine_type }}'
|
||||||
container:
|
container:
|
||||||
image: huggingface/transformers-all-latest-gpu
|
image: huggingface/transformers-all-latest-gpu
|
||||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
outputs:
|
outputs:
|
||||||
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
||||||
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
||||||
runner_map: ${{ steps.set-matrix.outputs.runner_map }}
|
|
||||||
quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
|
quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
|
||||||
steps:
|
steps:
|
||||||
- name: Update clone
|
- name: Update clone
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
run: |
|
run: |
|
||||||
git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
git fetch && git checkout ${{ github.sha }}
|
||||||
|
|
||||||
- name: Cleanup
|
- name: Cleanup
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
@ -93,9 +86,8 @@ jobs:
|
|||||||
working-directory: /transformers/tests
|
working-directory: /transformers/tests
|
||||||
run: |
|
run: |
|
||||||
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
||||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --models '${{ inputs.models }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||||
echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT
|
|
||||||
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
||||||
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
||||||
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
||||||
@ -119,18 +111,15 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [single-gpu, multi-gpu]
|
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||||
uses: ./.github/workflows/model_jobs.yml
|
uses: ./.github/workflows/model_jobs.yml
|
||||||
with:
|
with:
|
||||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||||
machine_type: ${{ matrix.machine_type }}
|
machine_type: ${{ matrix.machine_type }}
|
||||||
slice_id: ${{ matrix.slice_id }}
|
slice_id: ${{ matrix.slice_id }}
|
||||||
runner_map: ${{ needs.setup.outputs.runner_map }}
|
runner: ${{ inputs.runner }}
|
||||||
docker: ${{ inputs.docker }}
|
docker: ${{ inputs.docker }}
|
||||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
|
||||||
runner_type: ${{ inputs.runner_type }}
|
|
||||||
report_repo_id: ${{ inputs.report_repo_id }}
|
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
run_trainer_and_fsdp_gpu:
|
run_trainer_and_fsdp_gpu:
|
||||||
@ -140,16 +129,15 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||||
slice_id: [0, 1]
|
slice_id: [0, 1]
|
||||||
uses: ./.github/workflows/model_jobs.yml
|
uses: ./.github/workflows/model_jobs.yml
|
||||||
with:
|
with:
|
||||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||||
machine_type: ${{ matrix.machine_type }}
|
machine_type: ${{ matrix.machine_type }}
|
||||||
slice_id: ${{ matrix.slice_id }}
|
slice_id: ${{ matrix.slice_id }}
|
||||||
runner_map: ${{ needs.setup.outputs.runner_map }}
|
runner: ${{ inputs.runner }}
|
||||||
docker: ${{ inputs.docker }}
|
docker: ${{ inputs.docker }}
|
||||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
|
||||||
report_name_prefix: run_trainer_and_fsdp_gpu
|
report_name_prefix: run_trainer_and_fsdp_gpu
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
@ -159,7 +147,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||||
runs-on:
|
runs-on:
|
||||||
group: '${{ matrix.machine_type }}'
|
group: '${{ matrix.machine_type }}'
|
||||||
container:
|
container:
|
||||||
@ -168,7 +156,7 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- name: Update clone
|
- name: Update clone
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
run: git fetch && git checkout ${{ github.sha }}
|
||||||
|
|
||||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
@ -193,9 +181,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
echo "${{ matrix.machine_type }}"
|
echo "${{ matrix.machine_type }}"
|
||||||
|
|
||||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
|
||||||
machine_type=single-gpu
|
machine_type=single-gpu
|
||||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||||
machine_type=multi-gpu
|
machine_type=multi-gpu
|
||||||
else
|
else
|
||||||
machine_type=${{ matrix.machine_type }}
|
machine_type=${{ matrix.machine_type }}
|
||||||
@ -227,16 +215,16 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [aws-g5-4xlarge-cache]
|
machine_type: [aws-g4dn-4xlarge-cache]
|
||||||
runs-on:
|
runs-on:
|
||||||
group: '${{ matrix.machine_type }}'
|
group: '${{ matrix.machine_type }}'
|
||||||
container:
|
container:
|
||||||
image: huggingface/transformers-all-latest-gpu
|
image: huggingface/transformers-all-latest-gpu
|
||||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
steps:
|
steps:
|
||||||
- name: Update clone
|
- name: Update clone
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
run: git fetch && git checkout ${{ github.sha }}
|
||||||
|
|
||||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
@ -261,9 +249,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
echo "${{ matrix.machine_type }}"
|
echo "${{ matrix.machine_type }}"
|
||||||
|
|
||||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
|
||||||
machine_type=single-gpu
|
machine_type=single-gpu
|
||||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||||
machine_type=multi-gpu
|
machine_type=multi-gpu
|
||||||
else
|
else
|
||||||
machine_type=${{ matrix.machine_type }}
|
machine_type=${{ matrix.machine_type }}
|
||||||
@ -296,7 +284,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||||
runs-on:
|
runs-on:
|
||||||
group: '${{ matrix.machine_type }}'
|
group: '${{ matrix.machine_type }}'
|
||||||
container:
|
container:
|
||||||
@ -305,7 +293,7 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- name: Update clone
|
- name: Update clone
|
||||||
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
||||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
run: git fetch && git checkout ${{ github.sha }}
|
||||||
|
|
||||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||||
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
||||||
@ -358,9 +346,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
echo "${{ matrix.machine_type }}"
|
echo "${{ matrix.machine_type }}"
|
||||||
|
|
||||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
|
||||||
machine_type=single-gpu
|
machine_type=single-gpu
|
||||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||||
machine_type=multi-gpu
|
machine_type=multi-gpu
|
||||||
else
|
else
|
||||||
machine_type=${{ matrix.machine_type }}
|
machine_type=${{ matrix.machine_type }}
|
||||||
@ -395,7 +383,7 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
|
folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
|
||||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||||
runs-on:
|
runs-on:
|
||||||
group: '${{ matrix.machine_type }}'
|
group: '${{ matrix.machine_type }}'
|
||||||
container:
|
container:
|
||||||
@ -413,7 +401,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Update clone
|
- name: Update clone
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
run: git fetch && git checkout ${{ github.sha }}
|
||||||
|
|
||||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
@ -438,9 +426,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
echo "${{ matrix.machine_type }}"
|
echo "${{ matrix.machine_type }}"
|
||||||
|
|
||||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
|
||||||
machine_type=single-gpu
|
machine_type=single-gpu
|
||||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||||
machine_type=multi-gpu
|
machine_type=multi-gpu
|
||||||
else
|
else
|
||||||
machine_type=${{ matrix.machine_type }}
|
machine_type=${{ matrix.machine_type }}
|
||||||
@ -477,7 +465,6 @@ jobs:
|
|||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
fetch-depth: 2
|
fetch-depth: 2
|
||||||
ref: ${{ inputs.commit_sha || github.sha }}
|
|
||||||
|
|
||||||
- name: Install transformers
|
- name: Install transformers
|
||||||
run: pip install transformers
|
run: pip install transformers
|
||||||
@ -520,7 +507,7 @@ jobs:
|
|||||||
run_quantization_torch_gpu,
|
run_quantization_torch_gpu,
|
||||||
run_extract_warnings
|
run_extract_warnings
|
||||||
]
|
]
|
||||||
if: always() && !cancelled()
|
if: ${{ always() }}
|
||||||
uses: ./.github/workflows/slack-report.yml
|
uses: ./.github/workflows/slack-report.yml
|
||||||
with:
|
with:
|
||||||
job: ${{ inputs.job }}
|
job: ${{ inputs.job }}
|
||||||
@ -532,7 +519,6 @@ jobs:
|
|||||||
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
|
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
|
||||||
ci_event: ${{ inputs.ci_event }}
|
ci_event: ${{ inputs.ci_event }}
|
||||||
report_repo_id: ${{ inputs.report_repo_id }}
|
report_repo_id: ${{ inputs.report_repo_id }}
|
||||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
|
||||||
|
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
@ -543,7 +529,7 @@ jobs:
|
|||||||
uses: ./.github/workflows/check_failed_tests.yml
|
uses: ./.github/workflows/check_failed_tests.yml
|
||||||
with:
|
with:
|
||||||
docker: ${{ inputs.docker }}
|
docker: ${{ inputs.docker }}
|
||||||
start_sha: ${{ inputs.commit_sha || github.sha }}
|
start_sha: ${{ github.sha }}
|
||||||
job: ${{ inputs.job }}
|
job: ${{ inputs.job }}
|
||||||
slack_report_channel: ${{ inputs.slack_report_channel }}
|
slack_report_channel: ${{ inputs.slack_report_channel }}
|
||||||
ci_event: ${{ inputs.ci_event }}
|
ci_event: ${{ inputs.ci_event }}
|
||||||
|
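For reference, the setup job above builds its test matrix by echoing plain Python lists into `$GITHUB_OUTPUT`, which downstream jobs read back through `fromJSON`. A minimal standalone sketch of that step, not part of the diff and assuming `NUM_SLICES` is 3:

```bash
# Reproduce the slice_ids computation from the setup job locally.
python3 -c 'd = list(range(3)); print(d)'
# prints: [0, 1, 2]  -- the literal string later parsed by fromJSON(needs.setup.outputs.slice_ids)
```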
.github/workflows/slack-report.yml (16 changes, vendored)

@@ -24,10 +24,6 @@ on:
       report_repo_id:
         required: true
         type: string
-      commit_sha:
-        required: false
-        type: string

 env:
   TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@@ -36,7 +32,7 @@ jobs:
   send_results:
     name: Send results to webhook
     runs-on: ubuntu-22.04
-    if: always() && !cancelled()
+    if: always()
     steps:
       - name: Preliminary job status
         shell: bash
@@ -45,10 +41,6 @@ jobs:
           echo "Setup status: ${{ inputs.setup_status }}"

       - uses: actions/checkout@v4
-        with:
-          fetch-depth: 2
-          ref: ${{ inputs.commit_sha || github.sha }}

       - uses: actions/download-artifact@v4

       - name: Prepare some setup values
@@ -75,9 +67,7 @@ jobs:
           SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
           ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
           CI_EVENT: ${{ inputs.ci_event }}
-          # This `CI_TITLE` would be empty for `schedule` or `workflow_run` events.
-          CI_TITLE: ${{ github.event.head_commit.message }}
-          CI_SHA: ${{ inputs.commit_sha || github.sha }}
+          CI_SHA: ${{ github.sha }}
           CI_TEST_JOB: ${{ inputs.job }}
           SETUP_STATUS: ${{ inputs.setup_status }}
           REPORT_REPO_ID: ${{ inputs.report_repo_id }}
@@ -93,7 +83,7 @@ jobs:
             python utils/notification_service.py "${{ inputs.quantization_matrix }}"
           else
             python utils/notification_service.py "${{ inputs.folder_slices }}"
           fi

       # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
       - name: Failure table artifacts
.gitignore (3 changes, vendored)

@@ -167,6 +167,3 @@ tags

 # ruff
 .ruff_cache
-
-# modular conversion
-*.modular_backup
@@ -68,7 +68,8 @@ already reported** (use the search bar on GitHub under Issues). Your issue shoul

 Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:

-* Your **OS type and version** and **Python**, and **PyTorch** versions when applicable.
+* Your **OS type and version** and **Python**, **PyTorch** and
+  **TensorFlow** versions when applicable.
 * A short, self-contained, code snippet that allows us to reproduce the bug in
   less than 30s.
 * The *full* traceback if an exception is raised.
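The version information requested in the bullet list above does not have to be gathered by hand; a minimal sketch of the usual way to collect it (the `transformers-cli env` command prints the platform, Python, and framework versions in a copy-pasteable form):

```bash
# Run from any environment where transformers is installed, then paste the output into the issue.
transformers-cli env
```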
@@ -164,7 +165,8 @@ You'll need **[Python 3.9](https://github.com/huggingface/transformers/blob/main
 mode with the `-e` flag.

 Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
-failure with this command. If that's the case make sure to install Pytorch then do:
+failure with this command. If that's the case make sure to install the Deep Learning framework you are working with
+(PyTorch, TensorFlow and/or Flax) then do:

 ```bash
 pip install -e ".[quality]"
|
1
Makefile
1
Makefile
@ -52,7 +52,6 @@ repo-consistency:
|
|||||||
python utils/check_doctest_list.py
|
python utils/check_doctest_list.py
|
||||||
python utils/update_metadata.py --check-only
|
python utils/update_metadata.py --check-only
|
||||||
python utils/check_docstrings.py
|
python utils/check_docstrings.py
|
||||||
python utils/add_dates.py
|
|
||||||
|
|
||||||
# this target runs checks on all files
|
# this target runs checks on all files
|
||||||
|
|
||||||
|
README.md (12 changes)

@@ -44,7 +44,7 @@ limitations under the License.
 <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
 <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
 <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
-<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
+<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Рortuguês</a> |
 <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
 <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
 <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
@@ -80,7 +80,7 @@ Explore the [Hub](https://huggingface.com/) today to find a model and use Transf

 ## Installation

-Transformers works with Python 3.9+ and [PyTorch](https://pytorch.org/get-started/locally/) 2.2+.
+Transformers works with Python 3.9+ [PyTorch](https://pytorch.org/get-started/locally/) 2.1+, [TensorFlow](https://www.tensorflow.org/install/pip) 2.6+, and [Flax](https://flax.readthedocs.io/en/latest/) 0.4.1+.

 Create and activate a virtual environment with [venv](https://docs.python.org/3/library/venv.html) or [uv](https://docs.astral.sh/uv/), a fast Rust-based Python package and project manager.

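The virtual-environment sentence above boils down to a couple of shell commands; a minimal sketch for either tool (the environment name `.my-env` is arbitrary, not part of the diff):

```bash
# with venv
python -m venv .my-env
source .my-env/bin/activate
pip install transformers

# or with uv
uv venv .my-env
source .my-env/bin/activate
uv pip install transformers
```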
@@ -147,7 +147,7 @@ chat = [
     {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
 ]

-pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, device_map="auto")
+pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
 response = pipeline(chat, max_new_tokens=512)
 print(response[0]["generated_text"][-1]["content"])
 ```
@@ -242,7 +242,7 @@ pipeline(

 - This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
 - The training API is optimized to work with PyTorch models provided by Transformers. For generic machine learning loops, you should use another library like [Accelerate](https://huggingface.co/docs/accelerate).
-- The [example scripts](https://github.com/huggingface/transformers/tree/main/examples) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work.
+- The [example scripts]((https://github.com/huggingface/transformers/tree/main/examples)) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work.

 ## 100 projects using Transformers

@@ -280,8 +280,8 @@ Expand each modality below to see a few example models for various use cases.
 - Automatic mask generation with [SAM](https://huggingface.co/facebook/sam-vit-base)
 - Depth estimation with [DepthPro](https://huggingface.co/apple/DepthPro-hf)
 - Image classification with [DINO v2](https://huggingface.co/facebook/dinov2-base)
-- Keypoint detection with [SuperPoint](https://huggingface.co/magic-leap-community/superpoint)
-- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor)
+- Keypoint detection with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor)
+- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue)
 - Object detection with [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd)
 - Pose Estimation with [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple)
 - Universal segmentation with [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large)
@@ -14,7 +14,7 @@ Models uploaded on the Hugging Face Hub come in different formats. We heavily re
 models in the [`safetensors`](https://github.com/huggingface/safetensors) format (which is the default prioritized
 by the transformers library), as developed specifically to prevent arbitrary code execution on your system.

-To avoid loading models from unsafe formats (e.g. [pickle](https://docs.python.org/3/library/pickle.html), you should use the `use_safetensors` parameter. If doing so, in the event that no .safetensors file is present, transformers will error when loading the model.
+To avoid loading models from unsafe formats(e.g. [pickle](https://docs.python.org/3/library/pickle.html), you should use the `use_safetensors` parameter. If doing so, in the event that no .safetensors file is present, transformers will error when loading the model.

 ### Remote code

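A minimal sketch of the `use_safetensors` behaviour described in the hunk above (run from a shell; `openai-community/gpt2` is only an example checkpoint, assumed here to ship a `.safetensors` file, and loading fails if none is present):

```bash
python -c "
from transformers import AutoModel
# With use_safetensors=True, from_pretrained refuses to fall back to pickle-based weights.
model = AutoModel.from_pretrained('openai-community/gpt2', use_safetensors=True)
print(type(model).__name__)
"
```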
|
@ -288,7 +288,7 @@ Keywords: Music understanding, Music generation
|
|||||||
|
|
||||||
## [dalle-flow](https://github.com/jina-ai/dalle-flow)
|
## [dalle-flow](https://github.com/jina-ai/dalle-flow)
|
||||||
|
|
||||||
DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. It leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt.
|
DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. Itt leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt.
|
||||||
The preferred candidate is fed to GLID-3 XL for diffusion, which often enriches the texture and background. Finally, the candidate is upscaled to 1024x1024 via SwinIR.
|
The preferred candidate is fed to GLID-3 XL for diffusion, which often enriches the texture and background. Finally, the candidate is upscaled to 1024x1024 via SwinIR.
|
||||||
|
|
||||||
Keywords: High-definition image generation, Stable Diffusion, DALL-E Mega, GLID-3 XL, CLIP, SwinIR
|
Keywords: High-definition image generation, Stable Diffusion, DALL-E Mega, GLID-3 XL, CLIP, SwinIR
|
||||||
@ -526,7 +526,7 @@ Keywords: Model deployment, CLoud, Mobile, Edge
|
|||||||
|
|
||||||
## [underthesea](https://github.com/undertheseanlp/underthesea)
|
## [underthesea](https://github.com/undertheseanlp/underthesea)
|
||||||
|
|
||||||
[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules data sets and tutorials supporting research and development in Vietnamese Natural Language Processing. We provide extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing.
|
[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules data sets and tutorials supporting research and development in Vietnamese Natural Language Processing. We provides extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing.
|
||||||
|
|
||||||
Keywords: Vietnamese, NLP
|
Keywords: Vietnamese, NLP
|
||||||
|
|
||||||
|
benchmark/.gitignore (1 change, vendored)

@@ -1 +0,0 @@
-benchmark_results/
@@ -1,345 +0,0 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from logging import Logger
-import os
-from threading import Event, Thread
-from time import perf_counter, sleep
-from typing import Optional
-import sys
-
-# Add the parent directory to Python path to import benchmarks_entrypoint
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from benchmarks_entrypoint import MetricsRecorder
-
-import gpustat
-import psutil
-import psycopg2
-
-# Optional heavy ML dependencies - only required when actually running the benchmark
-try:
-    import torch
-    from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
-    TRANSFORMERS_AVAILABLE = True
-except ImportError:
-    TRANSFORMERS_AVAILABLE = False
-    torch = None
-    AutoModelForCausalLM = None
-    AutoTokenizer = None
-    GenerationConfig = None
-    StaticCache = None
-
-os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-os.environ["TOKENIZERS_PARALLELISM"] = "1"
-
-# Only set torch precision if torch is available
-if TRANSFORMERS_AVAILABLE:
-    torch.set_float32_matmul_precision("high")
-
-
-def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
-    p = psutil.Process(os.getpid())
-    while not continue_metric_collection.is_set():
-        with p.oneshot():
-            cpu_util = p.cpu_percent()
-            mem_megabytes = p.memory_info().rss / (1024 * 1024)
-        gpu_stats = gpustat.GPUStatCollection.new_query()
-        gpu_util = gpu_stats[0]["utilization.gpu"]
-        gpu_mem_megabytes = gpu_stats[0]["memory.used"]
-        metrics_recorder.collect_device_measurements(
-            benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes
-        )
-        sleep(0.01)
-
-
-def run_benchmark(
-    logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, metrics_recorder=None, num_tokens_to_generate=100
-):
-    # Check if required ML dependencies are available
-    if not TRANSFORMERS_AVAILABLE:
-        logger.error("Transformers and torch are required to run the LLaMA benchmark. Please install them with:")
-        logger.error("pip install torch transformers")
-        logger.error("Skipping LLaMA benchmark due to missing dependencies.")
-        return
-
-    continue_metric_collection = Event()
-    metrics_thread = None
-    model_id = "meta-llama/Llama-2-7b-hf"
-
-    # If no metrics_recorder is provided, create one for backward compatibility
-    if metrics_recorder is None:
-        try:
-            metrics_recorder = MetricsRecorder(
-                psycopg2.connect("dbname=metrics"), logger, repository, branch, commit_id, commit_msg, True
-            )
-            should_close_recorder = True
-        except Exception as e:
-            logger.error(f"Failed to create metrics recorder: {e}")
-            return
-    else:
-        should_close_recorder = False
-    try:
-        gpu_stats = gpustat.GPUStatCollection.new_query()
-        gpu_name = gpu_stats[0]["name"]
-        benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id})
-        logger.info(f"running benchmark #{benchmark_id} on {gpu_name} for {model_id}")
-        metrics_thread = Thread(
-            target=collect_metrics,
-            args=[benchmark_id, continue_metric_collection, metrics_recorder],
-        )
-        metrics_thread.start()
-        logger.info("started background thread to fetch device metrics")
-
-        os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence warnings when compiling
-
-        device = "cuda"
-
-        logger.info("downloading weights")
-        # This is to avoid counting download in model load time measurement
-        model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float16)
-        gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1)
-        logger.info("loading model")
-        start = perf_counter()
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, dtype=torch.float16, generation_config=gen_config
-        ).eval()
-        model.to(device)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        model_load_time = end - start
-        logger.info(f"loaded model in: {model_load_time}s")
-
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-        prompt = "Why dogs are so cute?"
-        inputs = tokenizer(prompt, return_tensors="pt").to(device)
-
-        # Specify the max length (including both the prompt and the response)
-        # When calling `generate` with `cache_implementation="static" later, this is also used to create a `StaticCache` object
-        # with sequence length = `max_length`. The longer the more you will re-use it
-        seq_length = inputs["input_ids"].shape[1]
-        model.generation_config.max_length = seq_length + num_tokens_to_generate
-        batch_size = inputs["input_ids"].shape[0]
-
-        # Copied from the gpt-fast repo
-        def multinomial_sample_one_no_sync(probs_sort):  # Does multinomial sampling without a cuda synchronization
-            q = torch.empty_like(probs_sort).exponential_(1)
-            return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
-
-        def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
-            logits = logits / max(temperature, 1e-5)
-
-            if top_k is not None:
-                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-                pivot = v.select(-1, -1).unsqueeze(-1)
-                logits = torch.where(logits < pivot, -float("Inf"), logits)
-            probs = torch.nn.functional.softmax(logits, dim=-1)
-            return probs
-
-        def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
-            probs = logits_to_probs(logits[0, -1], temperature, top_k)
-            idx_next = multinomial_sample_one_no_sync(probs)
-            return idx_next, probs
-
-        # First eager forward pass
-        logger.info("running first eager forward pass")
-        start = perf_counter()
-        outputs = model(**inputs)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        first_eager_fwd_pass_time = end - start
-        logger.info(f"completed first eager forward pass in: {first_eager_fwd_pass_time}s")
-
-        # Second eager forward pass (should be faster)
-        logger.info("running second eager forward pass")
-        start = perf_counter()
-        outputs = model(**inputs)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        second_eager_fwd_pass_time = end - start
-        logger.info(f"completed second eager forward pass in: {second_eager_fwd_pass_time}s")
-
-        # First eager generation
-        logger.info("running first eager generation")
-        start = perf_counter()
-        output = model.generate(**inputs)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        first_eager_generate_time = end - start
-        logger.info(f"completed first eager generation in: {first_eager_generate_time}s")
-        logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
-
-        # Second eager generation (should be faster)
-        logger.info("running second eager generation")
-        start = perf_counter()
-        output = model.generate(**inputs)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        second_eager_generate_time = end - start
-        logger.info(f"completed second eager generation in: {second_eager_generate_time}s")
-        logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
-
-        logger.info("running generation timing loop")
-
-        input_pos = torch.arange(0, seq_length, device=device)
-        inputs = inputs["input_ids"]
-
-        start = perf_counter()
-        with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
-            logits = model(inputs, position_ids=input_pos).logits
-            next_token, probs = sample(logits, temperature=0.6, top_k=5)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        time_to_first_token = end - start
-
-        input_pos = torch.tensor([seq_length], device=device, dtype=torch.int)
-        next_token = next_token.clone()
-        start = perf_counter()
-        with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
-            logits = model(next_token, position_ids=input_pos).logits
-            next_token, probs = sample(logits, temperature=0.6, top_k=5)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        time_to_second_token = end - start
-
-        input_pos = torch.tensor([seq_length + 1], device=device, dtype=torch.int)
-        next_token = next_token.clone()
-        start = perf_counter()
-        with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
-            logits = model(next_token, position_ids=input_pos).logits
-            next_token, probs = sample(logits, temperature=0.6, top_k=5)
-        torch.cuda.synchronize()
-        end = perf_counter()
-        time_to_third_token = end - start
-
-        logger.info("running longer generation timing loop")
-
-        total_time = 0
-        for i in range(20):
-            input_pos = torch.tensor([seq_length + 2 + i], device=device, dtype=torch.int)
-            next_token = next_token.clone()
-            start = perf_counter()
-            with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
-                logits = model(next_token, position_ids=input_pos).logits
-                next_token, probs = sample(logits, temperature=0.6, top_k=5)
-            torch.cuda.synchronize()
-            end = perf_counter()
-            total_time += end - start
-
-        mean_time_to_next_token = total_time / 20
-
-        logger.info("running compilation benchmarks")
-
-        # Now compile the model
-        model = torch.compile(model, mode="max-autotune", fullgraph=True)
-
-        # StaticCache for generation
-        with torch.device(device):
-            model.setup_caches(max_batch_size=batch_size, max_seq_len=seq_length + num_tokens_to_generate)
-
-        input_pos = torch.arange(0, seq_length, device=device)
-        inputs = tokenizer(prompt, return_tensors="pt").to(device)["input_ids"]
-
-        logger.info("compiling model")
-
-        model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float16, generation_config=gen_config)
-        model.to(device)
-        model = torch.compile(model, mode="max-autotune", fullgraph=True)
-
-        past_key_values = StaticCache(
-            model.config,
-            max_batch_size=batch_size,
-            device=device,
-            dtype=torch.float16,
-            max_cache_len=seq_length + 128,
-        )
-        # 1st call
-        start = perf_counter()
-        output = model.generate(**inputs, past_key_values=past_key_values)
-        end = perf_counter()
-        first_compile_generate_time = end - start
-        logger.info(f"completed first compile generation in: {first_compile_generate_time}s")
-        logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
-
-        past_key_values = StaticCache(
-            model.config,
-            max_batch_size=batch_size,
-            device=device,
-            dtype=torch.float16,
-            max_cache_len=seq_length + 128,
-        )
-        # 2nd call
-        start = perf_counter()
-        output = model.generate(**inputs, past_key_values=past_key_values)
-        end = perf_counter()
-        second_compile_generate_time = end - start
-        logger.info(f"completed second compile generation in: {second_compile_generate_time}s")
-        logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
-
-        past_key_values = StaticCache(
-            model.config,
-            max_batch_size=batch_size,
-            device=device,
-            dtype=torch.float16,
-            max_cache_len=seq_length + 128,
-        )
-        # 3rd call
-        start = perf_counter()
-        output = model.generate(**inputs, past_key_values=past_key_values)
-        end = perf_counter()
-        third_compile_generate_time = end - start
-        logger.info(f"completed third compile generation in: {third_compile_generate_time}s")
-        logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
-
-        past_key_values = StaticCache(
-            model.config,
-            max_batch_size=batch_size,
-            device=device,
-            dtype=torch.float16,
-            max_cache_len=seq_length + 128,
-        )
-        # 4th call
-        start = perf_counter()
-        output = model.generate(**inputs, past_key_values=past_key_values)
-        end = perf_counter()
-        fourth_compile_generate_time = end - start
-        logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s")
-        logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
-
-        metrics_recorder.collect_model_measurements(
-            benchmark_id,
-            {
-                "model_load_time": model_load_time,
-                "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
-                "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
-                "first_eager_generate_time_secs": first_eager_generate_time,
-                "second_eager_generate_time_secs": second_eager_generate_time,
-                "time_to_first_token_secs": time_to_first_token,
-                "time_to_second_token_secs": time_to_second_token,
-                "time_to_third_token_secs": time_to_third_token,
-                "time_to_next_token_mean_secs": mean_time_to_next_token,
-                "first_compile_generate_time_secs": first_compile_generate_time,
-                "second_compile_generate_time_secs": second_compile_generate_time,
-                "third_compile_generate_time_secs": third_compile_generate_time,
-                "fourth_compile_generate_time_secs": fourth_compile_generate_time,
-            },
-        )
-    except Exception as e:
-        logger.error(f"Caught exception: {e}")
-    continue_metric_collection.set()
-    if metrics_thread is not None:
-        metrics_thread.join()
-
-    # Only close the recorder if we created it locally
-    if should_close_recorder:
-        metrics_recorder.close()
@@ -1,35 +1,15 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 import argparse
 import importlib.util
 import logging
 import os
 import sys
-import json
-import uuid
-from datetime import datetime
-from typing import Dict, Tuple, Optional, List
+from typing import Dict, Tuple

-import pandas as pd
+from psycopg2.extensions import register_adapter
+from psycopg2.extras import Json

-try:
-    from psycopg2.extensions import register_adapter
-    from psycopg2.extras import Json
-    register_adapter(dict, Json)
-    PSYCOPG2_AVAILABLE = True
-except ImportError:
-    PSYCOPG2_AVAILABLE = False
+register_adapter(dict, Json)


 class ImportModuleException(Exception):
@@ -38,239 +18,61 @@ class ImportModuleException(Exception):

 class MetricsRecorder:
     def __init__(
-        self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str,
-        collect_csv_data: bool = True
+        self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str
     ):
         self.conn = connection
-        self.use_database = connection is not None
-        if self.use_database:
-            self.conn.autocommit = True
+        self.conn.autocommit = True
         self.logger = logger
         self.repository = repository
         self.branch = branch
         self.commit_id = commit_id
         self.commit_msg = commit_msg
-        self.collect_csv_data = collect_csv_data
-
-        # For CSV export - store all data in pandas DataFrames (only if CSV collection is enabled)
-        if self.collect_csv_data:
-            # Initialize empty DataFrames with proper schemas
-            self.benchmarks_df = pd.DataFrame(columns=[
-                'benchmark_id', 'repository', 'branch', 'commit_id', 'commit_message',
-                'metadata', 'created_at'
-            ])
-            self.device_measurements_df = pd.DataFrame(columns=[
-                'benchmark_id', 'cpu_util', 'mem_megabytes', 'gpu_util',
-                'gpu_mem_megabytes', 'time'
-            ])
-            self.model_measurements_df = pd.DataFrame(columns=[
-                'benchmark_id', 'time', 'model_load_time', 'first_eager_forward_pass_time_secs',
-                'second_eager_forward_pass_time_secs', 'first_eager_generate_time_secs',
-                'second_eager_generate_time_secs', 'time_to_first_token_secs',
-                'time_to_second_token_secs', 'time_to_third_token_secs',
-                'time_to_next_token_mean_secs', 'first_compile_generate_time_secs',
-                'second_compile_generate_time_secs', 'third_compile_generate_time_secs',
-                'fourth_compile_generate_time_secs'
-            ])
-        else:
-            self.benchmarks_df = None
-            self.device_measurements_df = None
-            self.model_measurements_df = None

-    def initialise_benchmark(self, metadata: dict[str, str]) -> str:
+    def initialise_benchmark(self, metadata: dict[str, str]) -> int:
         """
-        Creates a new benchmark, returns the benchmark id (UUID)
+        Creates a new benchmark, returns the benchmark id
         """
-        # Generate a unique UUID for this benchmark
-        benchmark_id = str(uuid.uuid4())
-        if self.use_database:
-            with self.conn.cursor() as cur:
-                cur.execute(
-                    "INSERT INTO benchmarks (benchmark_id, repository, branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s, %s, %s)",
-                    (benchmark_id, self.repository, self.branch, self.commit_id, self.commit_msg, metadata),
-                )
-                self.logger.debug(f"initialised benchmark #{benchmark_id}")
-
-        # Store benchmark data for CSV export (if enabled)
-        if self.collect_csv_data:
-            # Add row to pandas DataFrame
-            new_row = pd.DataFrame([{
-                'benchmark_id': benchmark_id,
-                'repository': self.repository,
-                'branch': self.branch,
-                'commit_id': self.commit_id,
-                'commit_message': self.commit_msg,
-                'metadata': json.dumps(metadata),
-                'created_at': datetime.utcnow().isoformat()
-            }])
-            self.benchmarks_df = pd.concat([self.benchmarks_df, new_row], ignore_index=True)
-
-        mode_info = []
-        if self.use_database:
-            mode_info.append("database")
-        if self.collect_csv_data:
-            mode_info.append("CSV")
-        mode_str = " + ".join(mode_info) if mode_info else "no storage"
-
-        self.logger.debug(f"initialised benchmark #{benchmark_id} ({mode_str} mode)")
-        return benchmark_id
+        # gpu_name: str, model_id: str
+        with self.conn.cursor() as cur:
+            cur.execute(
+                "INSERT INTO benchmarks (repository, branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s, %s) RETURNING benchmark_id",
+                (self.repository, self.branch, self.commit_id, self.commit_msg, metadata),
+            )
+            benchmark_id = cur.fetchone()[0]
+            logger.debug(f"initialised benchmark #{benchmark_id}")
+            return benchmark_id

-    def collect_device_measurements(self, benchmark_id: str, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes):
+    def collect_device_measurements(self, benchmark_id: int, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes):
         """
         Collect device metrics, such as CPU & GPU usage. These are "static", as in you cannot pass arbitrary arguments to the function.
         """
-        # Store device measurements for CSV export (if enabled)
-        if self.collect_csv_data:
-            # Add row to pandas DataFrame
-            new_row = pd.DataFrame([{
-                'benchmark_id': benchmark_id,
-                'cpu_util': cpu_util,
-                'mem_megabytes': mem_megabytes,
-                'gpu_util': gpu_util,
-                'gpu_mem_megabytes': gpu_mem_megabytes,
-                'time': datetime.utcnow().isoformat()
-            }])
-            self.device_measurements_df = pd.concat([self.device_measurements_df, new_row], ignore_index=True)
-
-        # Store in database if available
-        if self.use_database:
-            with self.conn.cursor() as cur:
-                cur.execute(
-                    "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
-                    (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
-                )
-
+        with self.conn.cursor() as cur:
+            cur.execute(
+                "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
+                (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
+            )
         self.logger.debug(
-            f"collected device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
+            f"inserted device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
         )

-    def collect_model_measurements(self, benchmark_id: str, measurements: dict[str, float]):
-        # Store model measurements for CSV export (if enabled)
-        if self.collect_csv_data:
-            # Add row to pandas DataFrame with flattened measurements
-            row_data = {
-                'benchmark_id': benchmark_id,
-                'time': datetime.utcnow().isoformat()
-            }
-            # Flatten the measurements dict into the row
-            row_data.update(measurements)
-            new_row = pd.DataFrame([row_data])
-            self.model_measurements_df = pd.concat([self.model_measurements_df, new_row], ignore_index=True)
-
-        # Store in database if available
-        if self.use_database:
-            with self.conn.cursor() as cur:
-                cur.execute(
-                    """
-                    INSERT INTO model_measurements (
-                        benchmark_id,
-                        measurements
-                    ) VALUES (%s, %s)
-                    """,
-                    (
-                        benchmark_id,
-                        measurements,
-                    ),
-                )
-
-        self.logger.debug(f"collected model measurements for benchmark #{benchmark_id}: {measurements}")
+    def collect_model_measurements(self, benchmark_id: int, measurements: dict[str, float]):
+        with self.conn.cursor() as cur:
+            cur.execute(
+                """
+                INSERT INTO model_measurements (
+                    benchmark_id,
+                    measurements
+                ) VALUES (%s, %s)
+                """,
+                (
+                    benchmark_id,
+                    measurements,
+                ),
+            )
+        self.logger.debug(f"inserted model measurements for benchmark #{benchmark_id}: {measurements}")

-    def export_to_csv(self, output_dir: str = "benchmark_results"):
-        """
-        Export all collected data to CSV files using pandas DataFrames
-        """
-        if not self.collect_csv_data:
-            self.logger.warning("CSV data collection is disabled - no CSV files will be generated")
-            return
-
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-            self.logger.info(f"Created output directory: {output_dir}")
-
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        files_created = []
-
-        # Export using pandas DataFrames
-        self._export_pandas_data(output_dir, timestamp, files_created)
-
-        self.logger.info(f"CSV export complete! Created {len(files_created)} files in {output_dir}")
-
-    def _export_pandas_data(self, output_dir: str, timestamp: str, files_created: list):
-        """
-        Export CSV files using pandas DataFrames
-        """
-        # Export benchmarks
-        benchmarks_file = os.path.join(output_dir, f"benchmarks_{timestamp}.csv")
-        self.benchmarks_df.to_csv(benchmarks_file, index=False)
-        files_created.append(benchmarks_file)
-        self.logger.info(f"Exported {len(self.benchmarks_df)} benchmark records to {benchmarks_file}")
-
-        # Export device measurements
-        device_file = os.path.join(output_dir, f"device_measurements_{timestamp}.csv")
-        self.device_measurements_df.to_csv(device_file, index=False)
-        files_created.append(device_file)
-        self.logger.info(f"Exported {len(self.device_measurements_df)} device measurement records to {device_file}")
-
-        # Export model measurements (already flattened)
-        model_file = os.path.join(output_dir, f"model_measurements_{timestamp}.csv")
-        self.model_measurements_df.to_csv(model_file, index=False)
-        files_created.append(model_file)
-        self.logger.info(f"Exported {len(self.model_measurements_df)} model measurement records to {model_file}")
-
-        # Create comprehensive summary using pandas operations
-        summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.csv")
-        self._create_summary(summary_file)
-        files_created.append(summary_file)
-
-    def _create_summary(self, summary_file: str):
-        """
-        Create a comprehensive summary CSV using pandas operations
-        """
-        if len(self.benchmarks_df) == 0:
-            # Create empty summary file
-            summary_df = pd.DataFrame()
-            summary_df.to_csv(summary_file, index=False)
-            self.logger.info(f"Created empty benchmark summary at {summary_file}")
-            return
-
-        # Start with benchmarks as the base
-        summary_df = self.benchmarks_df.copy()
-
-        # Add model measurements (join on benchmark_id)
-        if len(self.model_measurements_df) > 0:
-            # Drop 'time' column from model measurements to avoid conflicts
-            model_df = self.model_measurements_df.drop(columns=['time'], errors='ignore')
-            summary_df = summary_df.merge(model_df, on='benchmark_id', how='left')
-
-        # Calculate device measurement aggregates using pandas groupby
-        if len(self.device_measurements_df) > 0:
-            device_agg = self.device_measurements_df.groupby('benchmark_id').agg({
-                'cpu_util': ['mean', 'max', 'std', 'count'],
-                'mem_megabytes': ['mean', 'max', 'std'],
-                'gpu_util': ['mean', 'max', 'std'],
-                'gpu_mem_megabytes': ['mean', 'max', 'std']
-            }).round(3)
-
-            # Flatten column names
-            device_agg.columns = [f"{col[0]}_{col[1]}" for col in device_agg.columns]
-            device_agg = device_agg.reset_index()
-
-            # Rename count column to be more descriptive
-            if 'cpu_util_count' in device_agg.columns:
-                device_agg = device_agg.rename(columns={'cpu_util_count': 'device_measurement_count'})
-
-            # Merge with summary
|
|
||||||
summary_df = summary_df.merge(device_agg, on='benchmark_id', how='left')
|
|
||||||
|
|
||||||
# Export the comprehensive summary
|
|
||||||
summary_df.to_csv(summary_file, index=False)
|
|
||||||
self.logger.info(f"Created comprehensive benchmark summary with {len(summary_df)} records at {summary_file}")
|
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if self.use_database and self.conn:
|
self.conn.close()
|
||||||
self.conn.close()
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -283,7 +85,7 @@ handler.setFormatter(formatter)
|
|||||||
logger.addHandler(handler)
|
logger.addHandler(handler)
|
||||||
|
|
||||||
|
|
||||||
def parse_arguments() -> tuple[str, str, str, str, bool, str]:
|
def parse_arguments() -> tuple[str, str, str, str]:
|
||||||
"""
|
"""
|
||||||
Parse command line arguments for the benchmarking CLI.
|
Parse command line arguments for the benchmarking CLI.
|
||||||
"""
|
"""
|
||||||
@ -312,27 +114,10 @@ def parse_arguments() -> tuple[str, str, str, str, bool, str]:
|
|||||||
type=str,
|
type=str,
|
||||||
help="The commit message associated with the commit, truncated to 70 characters.",
|
help="The commit message associated with the commit, truncated to 70 characters.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--csv",
|
|
||||||
action="store_true",
|
|
||||||
default=False,
|
|
||||||
help="Enable CSV output files generation."
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--csv-output-dir",
|
|
||||||
type=str,
|
|
||||||
default="benchmark_results",
|
|
||||||
help="Directory for CSV output files (default: benchmark_results)."
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# CSV is disabled by default, only enabled when --csv is used
|
|
||||||
generate_csv = args.csv
|
|
||||||
|
|
||||||
return args.repository, args.branch, args.commit_id, args.commit_msg, generate_csv, args.csv_output_dir
|
return args.repository, args.branch, args.commit_id, args.commit_msg
|
||||||
|
|
||||||
|
|
||||||
def import_from_path(module_name, file_path):
|
def import_from_path(module_name, file_path):
|
||||||
@ -346,124 +131,22 @@ def import_from_path(module_name, file_path):
|
|||||||
raise ImportModuleException(f"failed to load python module: {e}")
|
raise ImportModuleException(f"failed to load python module: {e}")
|
||||||
|
|
||||||
|
|
||||||
def create_database_connection():
|
|
||||||
"""
|
|
||||||
Try to create a database connection. Returns None if connection fails.
|
|
||||||
"""
|
|
||||||
if not PSYCOPG2_AVAILABLE:
|
|
||||||
logger.warning("psycopg2 not available - running in CSV-only mode")
|
|
||||||
return None
|
|
||||||
|
|
||||||
try:
|
|
||||||
import psycopg2
|
|
||||||
conn = psycopg2.connect("dbname=metrics")
|
|
||||||
logger.info("Successfully connected to database")
|
|
||||||
return conn
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Failed to connect to database: {e}. Running in CSV-only mode")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def create_global_metrics_recorder(repository: str, branch: str, commit_id: str, commit_msg: str,
|
|
||||||
generate_csv: bool = False) -> MetricsRecorder:
|
|
||||||
"""
|
|
||||||
Create a global metrics recorder that will be used across all benchmarks.
|
|
||||||
"""
|
|
||||||
connection = create_database_connection()
|
|
||||||
recorder = MetricsRecorder(connection, logger, repository, branch, commit_id, commit_msg, generate_csv)
|
|
||||||
|
|
||||||
# Log the storage mode
|
|
||||||
storage_modes = []
|
|
||||||
if connection is not None:
|
|
||||||
storage_modes.append("database")
|
|
||||||
if generate_csv:
|
|
||||||
storage_modes.append("CSV")
|
|
||||||
|
|
||||||
if not storage_modes:
|
|
||||||
logger.warning("Running benchmarks with NO data storage (no database connection, CSV disabled)")
|
|
||||||
logger.warning("Use --csv flag to enable CSV output when database is unavailable")
|
|
||||||
else:
|
|
||||||
logger.info(f"Running benchmarks with: {' + '.join(storage_modes)} storage")
|
|
||||||
|
|
||||||
return recorder
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__))
|
benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__))
|
||||||
benches_folder_path = os.path.join(benchmarks_folder_path, "benches")
|
|
||||||
|
|
||||||
repository, branch, commit_id, commit_msg, generate_csv, csv_output_dir = parse_arguments()
|
repository, branch, commit_id, commit_msg = parse_arguments()
|
||||||
|
|
||||||
# Create a global metrics recorder
|
for entry in os.scandir(benchmarks_folder_path):
|
||||||
global_metrics_recorder = create_global_metrics_recorder(repository, branch, commit_id, commit_msg, generate_csv)
|
try:
|
||||||
|
|
||||||
successful_benchmarks = 0
|
|
||||||
failed_benchmarks = 0
|
|
||||||
|
|
||||||
# Automatically discover all benchmark modules in benches/ folder
|
|
||||||
benchmark_modules = []
|
|
||||||
|
|
||||||
if os.path.exists(benches_folder_path):
|
|
||||||
logger.debug(f"Scanning for benchmarks in: {benches_folder_path}")
|
|
||||||
for entry in os.scandir(benches_folder_path):
|
|
||||||
if not entry.name.endswith(".py"):
|
if not entry.name.endswith(".py"):
|
||||||
continue
|
continue
|
||||||
if entry.name.startswith("__"): # Skip __init__.py, __pycache__, etc.
|
if entry.path == __file__:
|
||||||
continue
|
continue
|
||||||
|
logger.debug(f"loading: {entry.name}")
|
||||||
# Check if the file has a run_benchmark function
|
module = import_from_path(entry.name.split(".")[0], entry.path)
|
||||||
try:
|
logger.info(f"running benchmarks in: {entry.name}")
|
||||||
logger.debug(f"checking if benches/{entry.name} has run_benchmark function")
|
module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
|
||||||
module = import_from_path(entry.name.split(".")[0], entry.path)
|
|
||||||
if hasattr(module, 'run_benchmark'):
|
|
||||||
benchmark_modules.append(entry.name)
|
|
||||||
logger.debug(f"discovered benchmark: {entry.name}")
|
|
||||||
else:
|
|
||||||
logger.debug(f"skipping {entry.name} - no run_benchmark function found")
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug(f"failed to check benches/{entry.name}: {e}")
|
|
||||||
else:
|
|
||||||
logger.warning(f"Benches directory not found: {benches_folder_path}")
|
|
||||||
|
|
||||||
if benchmark_modules:
|
|
||||||
logger.info(f"Discovered {len(benchmark_modules)} benchmark(s): {benchmark_modules}")
|
|
||||||
else:
|
|
||||||
logger.warning("No benchmark modules found in benches/ directory")
|
|
||||||
|
|
||||||
for module_name in benchmark_modules:
|
|
||||||
module_path = os.path.join(benches_folder_path, module_name)
|
|
||||||
try:
|
|
||||||
logger.debug(f"loading: {module_name}")
|
|
||||||
module = import_from_path(module_name.split(".")[0], module_path)
|
|
||||||
logger.info(f"running benchmarks in: {module_name}")
|
|
||||||
|
|
||||||
# Check if the module has an updated run_benchmark function that accepts metrics_recorder
|
|
||||||
try:
|
|
||||||
# Try the new signature first
|
|
||||||
module.run_benchmark(logger, repository, branch, commit_id, commit_msg, global_metrics_recorder)
|
|
||||||
except TypeError:
|
|
||||||
# Fall back to the old signature for backward compatibility
|
|
||||||
logger.warning(f"Module {module_name} using old run_benchmark signature - database connection will be created per module")
|
|
||||||
module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
|
|
||||||
|
|
||||||
successful_benchmarks += 1
|
|
||||||
except ImportModuleException as e:
|
except ImportModuleException as e:
|
||||||
logger.error(e)
|
logger.error(e)
|
||||||
failed_benchmarks += 1
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"error running benchmarks for {module_name}: {e}")
|
logger.error(f"error running benchmarks for {entry.name}: {e}")
|
||||||
failed_benchmarks += 1
|
|
||||||
|
|
||||||
# Export CSV results at the end (if enabled)
|
|
||||||
try:
|
|
||||||
if generate_csv:
|
|
||||||
global_metrics_recorder.export_to_csv(csv_output_dir)
|
|
||||||
logger.info(f"CSV reports have been generated and saved to the {csv_output_dir} directory")
|
|
||||||
else:
|
|
||||||
logger.info("CSV generation disabled - no CSV files created (use --csv to enable)")
|
|
||||||
|
|
||||||
logger.info(f"Benchmark run completed. Successful: {successful_benchmarks}, Failed: {failed_benchmarks}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to export CSV results: {e}")
|
|
||||||
finally:
|
|
||||||
global_metrics_recorder.close()
|
|
||||||
|
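With the CSV path removed, the entrypoint simply scans its own folder and calls each sibling module's `run_benchmark(logger, repository, branch, commit_id, commit_msg)`, and each module builds its own database-backed `MetricsRecorder`. A minimal sketch (not part of the diff) of a module compatible with that loop; the module and model names are hypothetical:

```python
# Hypothetical benchmark/my_bench.py, callable by the scandir loop above.
from logging import Logger

import psycopg2

from benchmarks_entrypoint import MetricsRecorder


def run_benchmark(logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str):
    # Same connection string the shipped llama.py benchmark uses
    recorder = MetricsRecorder(
        psycopg2.connect("dbname=metrics"), logger, repository, branch, commit_id, commit_msg
    )
    try:
        benchmark_id = recorder.initialise_benchmark({"model_id": "my-org/my-model"})  # hypothetical model id
        # ... run the actual workload, then store one flat dict of numbers ...
        recorder.collect_model_measurements(benchmark_id, {"model_load_time": 0.0})
    finally:
        recorder.close()
```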
@@ -19,7 +19,7 @@ backend:
   model: meta-llama/Llama-2-7b-hf
   cache_implementation: static
   torch_compile: true
-  dtype: float16
+  torch_dtype: float16
   torch_compile_config:
     backend: inductor
     mode: reduce-overhead
benchmark/init_db.sql (new file, 34 lines)
@@ -0,0 +1,34 @@
CREATE TABLE IF NOT EXISTS benchmarks (
  benchmark_id SERIAL PRIMARY KEY,
  repository VARCHAR(255),
  branch VARCHAR(255),
  commit_id VARCHAR(72),
  commit_message VARCHAR(70),
  metadata jsonb,
  created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
);

CREATE INDEX IF NOT EXISTS benchmarks_benchmark_id_idx ON benchmarks (benchmark_id);

CREATE INDEX IF NOT EXISTS benchmarks_branch_idx ON benchmarks (branch);

CREATE TABLE IF NOT EXISTS device_measurements (
  measurement_id SERIAL PRIMARY KEY,
  benchmark_id int REFERENCES benchmarks (benchmark_id),
  cpu_util double precision,
  mem_megabytes double precision,
  gpu_util double precision,
  gpu_mem_megabytes double precision,
  time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
);

CREATE INDEX IF NOT EXISTS device_measurements_branch_idx ON device_measurements (benchmark_id);

CREATE TABLE IF NOT EXISTS model_measurements (
  measurement_id SERIAL PRIMARY KEY,
  benchmark_id int REFERENCES benchmarks (benchmark_id),
  measurements jsonb,
  time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
);

CREATE INDEX IF NOT EXISTS model_measurements_branch_idx ON model_measurements (benchmark_id);
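A minimal sketch (not part of the diff) of applying this schema before a run. It assumes a local PostgreSQL database named "metrics", matching the `psycopg2.connect("dbname=metrics")` call used by the benchmark scripts, and assumes the file lives at `benchmark/init_db.sql`:

```python
# Apply benchmark/init_db.sql to the "metrics" database (assumed setup, not part of the diff).
import psycopg2


def init_metrics_db(sql_path: str = "benchmark/init_db.sql", dsn: str = "dbname=metrics") -> None:
    with open(sql_path) as f:
        schema = f.read()
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor() as cur:
            cur.execute(schema)  # the CREATE TABLE / CREATE INDEX IF NOT EXISTS statements are idempotent
        conn.commit()
    finally:
        conn.close()


if __name__ == "__main__":
    init_metrics_db()
```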
benchmark/llama.py (new file, 346 lines)
@@ -0,0 +1,346 @@
from logging import Logger
import os
from threading import Event, Thread
from time import perf_counter, sleep
from typing import Optional
from benchmarks_entrypoint import MetricsRecorder
import gpustat
import psutil
import psycopg2
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache


os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

os.environ["TOKENIZERS_PARALLELISM"] = "1"
torch.set_float32_matmul_precision("high")


def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
    p = psutil.Process(os.getpid())
    while not continue_metric_collection.is_set():
        with p.oneshot():
            cpu_util = p.cpu_percent()
            mem_megabytes = p.memory_info().rss / (1024 * 1024)
        gpu_stats = gpustat.GPUStatCollection.new_query()
        gpu_util = gpu_stats[0]["utilization.gpu"]
        gpu_mem_megabytes = gpu_stats[0]["memory.used"]
        metrics_recorder.collect_device_measurements(
            benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes
        )
        sleep(0.01)


def run_benchmark(
    logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100
):
    continue_metric_collection = Event()
    metrics_thread = None
    model_id = "meta-llama/Llama-2-7b-hf"
    metrics_recorder = MetricsRecorder(
        psycopg2.connect("dbname=metrics"), logger, repository, branch, commit_id, commit_msg
    )
    try:
        gpu_stats = gpustat.GPUStatCollection.new_query()
        gpu_name = gpu_stats[0]["name"]
        benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id})
        logger.info(f"running benchmark #{benchmark_id} on {gpu_name} for {model_id}")
        metrics_thread = Thread(
            target=collect_metrics,
            args=[benchmark_id, continue_metric_collection, metrics_recorder],
        )
        metrics_thread.start()
        logger.info("started background thread to fetch device metrics")

        os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence warnings when compiling

        device = "cuda"

        logger.info("downloading weights")
        # This is to avoid counting download in model load time measurement
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
        gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1)
        logger.info("loading model")
        start = perf_counter()
        model = AutoModelForCausalLM.from_pretrained(
            model_id, torch_dtype=torch.float16, generation_config=gen_config
        ).eval()
        model.to(device)
        torch.cuda.synchronize()
        end = perf_counter()
        model_load_time = end - start
        logger.info(f"loaded model in: {model_load_time}s")

        tokenizer = AutoTokenizer.from_pretrained(model_id)

        prompt = "Why dogs are so cute?"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Specify the max length (including both the prompt and the response)
        # When calling `generate` with `cache_implementation="static" later, this is also used to create a `StaticCache` object
        # with sequence length = `max_length`. The longer the more you will re-use it
        seq_length = inputs["input_ids"].shape[1]
        model.generation_config.max_length = seq_length + num_tokens_to_generate
        batch_size = inputs["input_ids"].shape[0]

        # Copied from the gpt-fast repo
        def multinomial_sample_one_no_sync(probs_sort):  # Does multinomial sampling without a cuda synchronization
            q = torch.empty_like(probs_sort).exponential_(1)
            return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)

        def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
            logits = logits / max(temperature, 1e-5)

            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                pivot = v.select(-1, -1).unsqueeze(-1)
                logits = torch.where(logits < pivot, -float("Inf"), logits)
            probs = torch.nn.functional.softmax(logits, dim=-1)
            return probs

        def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
            probs = logits_to_probs(logits[:, -1], temperature, top_k)
            idx_next = multinomial_sample_one_no_sync(probs)
            return idx_next, probs

        def decode_one_token(model, cur_token, cache_position, past_key_values):
            logits = model(
                cur_token,
                cache_position=cache_position,
                past_key_values=past_key_values,
                return_dict=False,
                use_cache=True,
            )[0]
            new_token = sample(logits, temperature=0.6, top_k=5)[0]
            return new_token

        #########
        # Eager #
        #########
        with torch.no_grad():
            past_key_values = StaticCache(
                model.config,
                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + num_tokens_to_generate,
            )
            cache_position = torch.arange(seq_length, device=device)
            start = perf_counter()
            model(
                **inputs,
                cache_position=cache_position,
                past_key_values=past_key_values,
                return_dict=False,
                use_cache=True,
            )
            end = perf_counter()
            first_eager_fwd_pass_time = end - start
            logger.info(f"completed first eager fwd pass in: {first_eager_fwd_pass_time}s")
            start = perf_counter()
            output = model.generate(**inputs, do_sample=False)
            end = perf_counter()
            first_eager_generate_time = end - start
            logger.info(f"completed first eager generation in: {first_eager_generate_time}s")
            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

            past_key_values = StaticCache(
                model.config,
                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + num_tokens_to_generate,
            )
            cache_position = torch.arange(seq_length, device=device)
            start = perf_counter()
            model(
                **inputs,
                cache_position=cache_position,
                past_key_values=past_key_values,
                return_dict=False,
                use_cache=True,
            )
            end = perf_counter()
            second_eager_fwd_pass_time = end - start
            logger.info(f"completed second eager fwd pass in: {second_eager_fwd_pass_time}s")
            start = perf_counter()
            model.generate(**inputs, do_sample=False)
            end = perf_counter()
            second_eager_generate_time = end - start
            logger.info(f"completed second eager generation in: {second_eager_generate_time}s")
            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

            torch.compiler.reset()

            ################
            # Forward pass #
            ################

            # `torch.compile(model, ...)` is not recommended as you compile callbacks
            # and full generate. We recommend compiling only the forward for now.
            # "reduce-overhead" will use cudagraphs.
            generated_ids = torch.zeros(
                (batch_size, num_tokens_to_generate + seq_length), dtype=torch.int, device=device
            )

            generated_ids[:, :seq_length] = inputs["input_ids"]
            decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True)
            # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
            # TODO use decode_one_token(model, input_id.clone(), cache_position) for verification
            past_key_values = StaticCache(
                model.config,
                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + num_tokens_to_generate + 10,
            )
            cache_position = torch.arange(seq_length, device=device)
            all_generated_tokens = []
            ### First compile, prefill
            start = perf_counter()
            next_token = decode_one_token(
                model, inputs["input_ids"], cache_position=cache_position, past_key_values=past_key_values
            )
            torch.cuda.synchronize()
            end = perf_counter()
            time_to_first_token = end - start
            logger.info(f"completed first compile generation in: {time_to_first_token}s")
            cache_position += 1
            all_generated_tokens += next_token.tolist()

            cache_position = torch.tensor([seq_length], device=device)
            ### First compile, decoding
            start = perf_counter()
            next_token = decode_one_token(
                model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values
            )
            torch.cuda.synchronize()
            end = perf_counter()
            time_to_second_token = end - start
            logger.info(f"completed second compile generation in: {time_to_second_token}s")
            cache_position += 1
            all_generated_tokens += next_token.tolist()

            ### Second compile, decoding
            start = perf_counter()
            next_token = decode_one_token(
                model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values
            )
            torch.cuda.synchronize()
            end = perf_counter()
            time_to_third_token = end - start
            logger.info(f"completed third compile forward in: {time_to_third_token}s")
            cache_position += 1
            all_generated_tokens += next_token.tolist()

            ### Using cuda graphs decoding

            start = perf_counter()
            for _ in range(1, num_tokens_to_generate):
                all_generated_tokens += next_token.tolist()
                next_token = decode_one_token(
                    model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values
                )
                cache_position += 1
            torch.cuda.synchronize()
            end = perf_counter()
            mean_time_to_next_token = (end - start) / num_tokens_to_generate
            logger.info(f"completed next compile generation in: {mean_time_to_next_token}s")
            logger.info(f"generated: {tokenizer.batch_decode(all_generated_tokens)}")

            ####################
            # Generate compile #
            ####################
            torch.compiler.reset()
            # we will not compile full generate as it' s to intensive, tho we measure full forward!

            past_key_values = StaticCache(
                model.config,
                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + 128,
            )

            # 1st call
            start = perf_counter()
            output = model.generate(**inputs, past_key_values=past_key_values)
            torch.cuda.synchronize()
            end = perf_counter()
            first_compile_generate_time = end - start
            logger.info(f"completed first compile generation in: {first_compile_generate_time}s")
            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

            past_key_values = StaticCache(
                model.config,
                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + 128,
            )
            # 2nd call
            start = perf_counter()
            output = model.generate(**inputs, past_key_values=past_key_values)
            torch.cuda.synchronize()
            end = perf_counter()
            second_compile_generate_time = end - start
            logger.info(f"completed second compile generation in: {second_compile_generate_time}s")
            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

            past_key_values = StaticCache(
                model.config,
                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + 128,
            )

            # 3rd call
            start = perf_counter()
            output = model.generate(**inputs, past_key_values=past_key_values)
            end = perf_counter()
            third_compile_generate_time = end - start
            logger.info(f"completed third compile generation in: {third_compile_generate_time}s")
            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

            past_key_values = StaticCache(
                model.config,
                max_batch_size=batch_size,
                device=device,
                dtype=torch.float16,
                max_cache_len=seq_length + 128,
            )
            # 4th call
            start = perf_counter()
            output = model.generate(**inputs, past_key_values=past_key_values)
            end = perf_counter()
            fourth_compile_generate_time = end - start
            logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s")
            logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")

            metrics_recorder.collect_model_measurements(
                benchmark_id,
                {
                    "model_load_time": model_load_time,
                    "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
                    "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
                    "first_eager_generate_time_secs": first_eager_generate_time,
                    "second_eager_generate_time_secs": second_eager_generate_time,
                    "time_to_first_token_secs": time_to_first_token,
                    "time_to_second_token_secs": time_to_second_token,
                    "time_to_third_token_secs": time_to_third_token,
                    "time_to_next_token_mean_secs": mean_time_to_next_token,
                    "first_compile_generate_time_secs": first_compile_generate_time,
                    "second_compile_generate_time_secs": second_compile_generate_time,
                    "third_compile_generate_time_secs": third_compile_generate_time,
                    "fourth_compile_generate_time_secs": fourth_compile_generate_time,
                },
            )
    except Exception as e:
        logger.error(f"Caught exception: {e}")
    continue_metric_collection.set()
    if metrics_thread is not None:
        metrics_thread.join()
    metrics_recorder.close()
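For reference, a minimal sketch (not part of the diff) of invoking this module's `run_benchmark` entrypoint directly rather than through `benchmarks_entrypoint.py`; the repository, branch, and commit values below are placeholders, and a running "metrics" PostgreSQL database plus a CUDA GPU are assumed:

```python
# Hypothetical driver for benchmark/llama.py's run_benchmark().
import logging

from llama import run_benchmark

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("llama-benchmark")

run_benchmark(
    logger,
    repository="huggingface/transformers",
    branch="main",
    commit_id="0000000",              # placeholder commit SHA
    commit_msg="local benchmark run",  # placeholder message
    num_tokens_to_generate=100,
)
```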
@@ -2,5 +2,4 @@ gpustat==1.1.1
 psutil==6.0.0
 psycopg2==2.9.9
 torch>=2.4.0
 hf_transfer
-pandas>=1.5.0
benchmark_v2/.gitignore (deleted, 1 line)
@@ -1 +0,0 @@
-benchmark_results/
(deleted file, 98 lines)
@@ -1,98 +0,0 @@
# Benchmarking v2

A comprehensive benchmarking framework for transformer models that supports multiple execution modes (eager, compiled, kernelized), detailed performance metrics collection, and structured output format.

## Quick Start

### Running All Benchmarks

```bash
# Run all benchmarks with default settings
python run_benchmarks.py

# Specify output directory
python run_benchmarks.py --output-dir my_results

# Run with custom parameters
python run_benchmarks.py \
    --warmup-iterations 5 \
    --measurement-iterations 10 \
    --num-tokens-to-generate 200
```

### Running Specific Benchmarks

```bash
# Include only specific benchmarks
python run_benchmarks.py --include llama

# Exclude specific benchmarks
python run_benchmarks.py --exclude old_benchmark
```

## Output Format

Results are saved as JSON files with the following structure:

```json
{
  "model_name": "llama_2_7b",
  "benchmark_scenarios": [
    {
      "scenario_name": "eager_variant",
      "metadata": {
        "timestamp": "2025-01-XX...",
        "commit_id": "abc123...",
        "hardware_info": {
          "gpu_name": "NVIDIA A100",
          "gpu_memory_total": 40960,
          "cpu_count": 64
        },
        "config": {
          "variant": "eager",
          "warmup_iterations": 3,
          "measurement_iterations": 5
        }
      },
      "measurements": {
        "latency": {
          "mean": 2.45,
          "median": 2.43,
          "std": 0.12,
          "min": 2.31,
          "max": 2.67,
          "p95": 2.61,
          "p99": 2.65
        },
        "time_to_first_token": {
          "mean": 0.15,
          "std": 0.02
        },
        "tokens_per_second": {
          "mean": 87.3,
          "unit": "tokens/sec"
        }
      },
      "gpu_metrics": {
        "gpu_utilization_mean": 85.2,
        "gpu_memory_used_mean": 12450
      }
    }
  ]
}
```

### Debug Mode

```bash
python run_benchmarks.py --log-level DEBUG
```

## Contributing

To add new benchmarks:

1. Create a new file in `benches/`
2. Implement the `ModelBenchmark` interface
3. Add a runner function (`run_<benchmark_name>` or `run_benchmark`)
4. run_benchmarks.py
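A sketch (not part of the deleted README) of what such a new benchmark module could look like, mirroring the deleted `benches/llama.py` rather than a guaranteed public API; the module, class, and model names are hypothetical, and `ModelBenchmark`/`BenchmarkRunner` come from the framework's (also deleted) `benchmark_framework` module:

```python
# Hypothetical benches/my_model.py for the removed benchmark_v2 framework.
import logging
from typing import Any, Dict, List

from benchmark_framework import BenchmarkRunner, ModelBenchmark


class MyModelBenchmark(ModelBenchmark):
    def get_scenario_configs(self) -> List[Dict[str, Any]]:
        # A single eager scenario is enough to get discovered and run
        return [{"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager"}]


def run_my_model(logger: logging.Logger, output_dir: str, **kwargs):
    # run_benchmarks.py discovers this module because it exposes run_<module_name>
    benchmark = MyModelBenchmark(logger)
    scenarios = benchmark.create_scenarios(model_id=kwargs.get("model_id", "my-org/my-model"))
    runner = BenchmarkRunner(logger, output_dir)
    results = runner.run_benchmark(benchmark, scenarios)
    return runner.save_results("my_model", results) if results else None
```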
(deleted file, 1 line)
@@ -1 +0,0 @@
-# Benchmark implementations directory
(deleted file, 156 lines)
@@ -1,156 +0,0 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import logging
from typing import Dict, Any, List

from benchmark_framework import ModelBenchmark

import torch

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "1"
torch.set_float32_matmul_precision("high")


class LLaMABenchmark(ModelBenchmark):
    """Simplified LLaMA model benchmark implementation using the ModelBenchmark base class."""

    def __init__(self, logger: logging.Logger):
        super().__init__(logger)
        self._default_prompt = "Why dogs are so cute?"  # Custom prompt for LLaMA

    def get_scenario_configs(self) -> List[Dict[str, Any]]:
        """
        Get LLaMA-specific scenario configurations.

        Returns:
            List of scenario configuration dictionaries
        """
        return [
            # Eager variants
            {"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},

            # Compiled variants
            {"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"},

            # Kernelized variant (if available)
            {"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"},
        ]

    def _is_kernelization_available(self) -> bool:
        """Check if kernelization is available for LLaMA."""
        try:
            from kernels import Mode, kernelize
            return True
        except ImportError:
            self.logger.debug("Kernelization not available: kernels module not found")
            return False

    def get_default_generation_config(self) -> Dict[str, Any]:
        """Get LLaMA-specific generation configuration."""
        return {
            "do_sample": False,
            "top_p": 1.0,
            "temperature": 1.0,
            "repetition_penalty": 1.0,
            "max_new_tokens": None,  # Will be set per scenario
        }

    def get_model_init_kwargs(self, config) -> Dict[str, Any]:
        """Get LLaMA-specific model initialization kwargs."""
        from benchmark_framework import BenchmarkConfig
        return {
            "torch_dtype": getattr(torch, config.torch_dtype),
            "attn_implementation": config.attn_implementation,
            "use_cache": True,
        }

    def get_default_torch_dtype(self) -> str:
        """Get default torch dtype for LLaMA."""
        return "float16"  # LLaMA works well with float16

    def get_default_device(self) -> str:
        """Get default device for LLaMA."""
        return "cuda"  # LLaMA prefers CUDA


def run_llama(logger, output_dir, **kwargs):
    """
    Run LLaMA benchmark with the given configuration.

    Args:
        logger: Logger instance
        output_dir: Output directory for results
        **kwargs: Additional configuration options

    Returns:
        Path to output file if successful
    """
    from benchmark_framework import BenchmarkRunner

    # Extract parameters with defaults
    model_id = kwargs.get('model_id', 'meta-llama/Llama-2-7b-hf')
    warmup_iterations = kwargs.get('warmup_iterations', 3)
    measurement_iterations = kwargs.get('measurement_iterations', 5)
    num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100)
    include_sdpa_variants = kwargs.get('include_sdpa_variants', True)
    device = kwargs.get('device', 'cuda')
    torch_dtype = kwargs.get('torch_dtype', 'float16')
    batch_size = kwargs.get('batch_size', 1)
    commit_id = kwargs.get('commit_id', None)

    logger.info(f"Starting LLaMA benchmark for model: {model_id}")
    logger.info(f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}")

    try:
        # Create benchmark instance
        benchmark = LLaMABenchmark(logger)

        # Create scenarios
        scenarios = benchmark.create_scenarios(
            model_id=model_id,
            warmup_iterations=warmup_iterations,
            measurement_iterations=measurement_iterations,
            num_tokens_to_generate=num_tokens_to_generate,
            include_sdpa_variants=include_sdpa_variants,
            device=device,
            torch_dtype=torch_dtype,
            batch_size=batch_size
        )

        logger.info(f"Created {len(scenarios)} benchmark scenarios")

        # Create runner and execute benchmarks
        runner = BenchmarkRunner(logger, output_dir)
        results = runner.run_benchmark(benchmark, scenarios, commit_id=commit_id)

        if not results:
            logger.warning("No successful benchmark results")
            return None

        # Save results
        model_name = model_id.split('/')[-1]  # Extract model name from ID
        output_file = runner.save_results(model_name, results)

        logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}")
        return output_file

    except Exception as e:
        logger.error(f"LLaMA benchmark failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        raise
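A minimal sketch (not part of the diff) of calling the deleted module's `run_llama()` entrypoint directly; `run_benchmarks.py` normally did this via discovery, and the values below are the defaults taken from the code above:

```python
# Hypothetical direct call into the removed benchmark_v2 benches module.
import logging

from llama import run_llama  # i.e. the deleted benches/llama.py shown above

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("benchmark_v2")

output_file = run_llama(
    logger,
    output_dir="benchmark_results",
    model_id="meta-llama/Llama-2-7b-hf",
    warmup_iterations=3,
    measurement_iterations=5,
    num_tokens_to_generate=100,
)
print(f"results written to {output_file}")
```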
(File diff suppressed because it is too large.)
(deleted file, 6 lines)
@@ -1,6 +0,0 @@
-numpy>=1.21.0
-psutil>=5.8.0
-gpustat>=1.0.0
-torch>=2.0.0
-transformers>=4.30.0
-datasets>=2.10.0
(deleted file, 385 lines)
@@ -1,385 +0,0 @@
#!/usr/bin/env python3
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Top-level benchmarking script that automatically discovers and runs all benchmarks
in the ./benches directory, organizing outputs into model-specific subfolders.
"""

import argparse
import importlib.util
import logging
import os
import sys
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional


def setup_logging(log_level: str = "INFO", enable_file_logging: bool = False) -> logging.Logger:
    """Setup logging configuration."""
    numeric_level = getattr(logging, log_level.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f'Invalid log level: {log_level}')

    handlers = [logging.StreamHandler(sys.stdout)]

    if enable_file_logging:
        handlers.append(
            logging.FileHandler(f'benchmark_run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
        )

    logging.basicConfig(
        level=numeric_level,
        format='[%(levelname)s - %(asctime)s] %(name)s: %(message)s',
        handlers=handlers
    )

    return logging.getLogger(__name__)


def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
    """
    Discover all benchmark modules in the benches directory.

    Returns:
        List of dictionaries containing benchmark module info
    """
    benchmarks = []
    benches_path = Path(benches_dir)

    if not benches_path.exists():
        raise FileNotFoundError(f"Benches directory not found: {benches_dir}")

    for py_file in benches_path.glob("*.py"):
        if py_file.name.startswith("__"):
            continue

        module_name = py_file.stem

        try:
            # Import the module
            spec = importlib.util.spec_from_file_location(module_name, py_file)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)

            # Check if it has a benchmark runner function
            if hasattr(module, f'run_{module_name}'):
                benchmarks.append({
                    'name': module_name,
                    'path': str(py_file),
                    'module': module,
                    'runner_function': getattr(module, f'run_{module_name}')
                })
            elif hasattr(module, 'run_benchmark'):
                benchmarks.append({
                    'name': module_name,
                    'path': str(py_file),
                    'module': module,
                    'runner_function': getattr(module, 'run_benchmark')
                })
            else:
                logging.warning(f"No runner function found in {py_file}")

        except Exception as e:
            logging.error(f"Failed to import {py_file}: {e}")

    return benchmarks


def run_single_benchmark(
    benchmark_info: Dict[str, Any],
    output_dir: str,
    logger: logging.Logger,
    **kwargs
) -> Optional[str]:
    """
    Run a single benchmark and return the output file path.

    Args:
        benchmark_info: Dictionary containing benchmark module info
        output_dir: Base output directory
        logger: Logger instance
        **kwargs: Additional arguments to pass to the benchmark

    Returns:
        Path to the output file if successful, None otherwise
    """
    benchmark_name = benchmark_info['name']
    runner_func = benchmark_info['runner_function']

    logger.info(f"Running benchmark: {benchmark_name}")

    try:
        # Check function signature to determine what arguments to pass
        import inspect
        sig = inspect.signature(runner_func)

        # Prepare arguments based on function signature
        func_kwargs = {
            'logger': logger,
            'output_dir': output_dir
        }

        # Add other kwargs if the function accepts them
        for param_name in sig.parameters:
            if param_name in kwargs:
                func_kwargs[param_name] = kwargs[param_name]

        # Filter kwargs to only include parameters the function accepts
        # If function has **kwargs, include all provided kwargs
        has_var_kwargs = any(param.kind == param.VAR_KEYWORD for param in sig.parameters.values())
        if has_var_kwargs:
            valid_kwargs = {**func_kwargs, **kwargs}
        else:
            valid_kwargs = {k: v for k, v in func_kwargs.items()
                            if k in sig.parameters}

        # Run the benchmark
        result = runner_func(**valid_kwargs)

        if isinstance(result, str):
            # Function returned a file path
            return result
        else:
            logger.info(f"Benchmark {benchmark_name} completed successfully")
            return "completed"

    except Exception as e:
        logger.error(f"Benchmark {benchmark_name} failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return None


def generate_summary_report(
    output_dir: str,
    benchmark_results: Dict[str, Any],
    logger: logging.Logger
) -> str:
    """Generate a summary report of all benchmark runs."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")

    summary_data = {
        "run_metadata": {
            "timestamp": datetime.utcnow().isoformat(),
            "total_benchmarks": len(benchmark_results),
            "successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
            "failed_benchmarks": len([r for r in benchmark_results.values() if r is None])
        },
        "benchmark_results": benchmark_results,
        "output_directory": output_dir
    }

    with open(summary_file, 'w') as f:
        json.dump(summary_data, f, indent=2, default=str)

    logger.info(f"Summary report saved to: {summary_file}")
    return summary_file


def main():
    """Main entry point for the benchmarking script."""
    parser = argparse.ArgumentParser(
        description="Run all benchmarks in the ./benches directory"
    )

    parser.add_argument(
        "--output-dir",
        type=str,
        default="benchmark_results",
        help="Base output directory for benchmark results (default: benchmark_results)"
    )

    parser.add_argument(
        "--benches-dir",
        type=str,
        default="./benches",
        help="Directory containing benchmark implementations (default: ./benches)"
    )

    parser.add_argument(
        "--log-level",
        type=str,
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        default="INFO",
        help="Logging level (default: INFO)"
    )

    parser.add_argument(
        "--model-id",
        type=str,
        help="Specific model ID to benchmark (if supported by benchmarks)"
    )

    parser.add_argument(
        "--warmup-iterations",
        type=int,
        default=3,
        help="Number of warmup iterations (default: 3)"
    )

    parser.add_argument(
        "--measurement-iterations",
        type=int,
        default=5,
        help="Number of measurement iterations (default: 5)"
    )

    parser.add_argument(
        "--num-tokens-to-generate",
        type=int,
        default=100,
        help="Number of tokens to generate in benchmarks (default: 100)"
    )

    parser.add_argument(
        "--include",
        type=str,
        nargs="*",
        help="Only run benchmarks matching these names"
    )

    parser.add_argument(
        "--exclude",
        type=str,
        nargs="*",
        help="Exclude benchmarks matching these names"
    )

    parser.add_argument(
        "--enable-mock",
        action="store_true",
        help="Enable mock benchmark (skipped by default)"
    )

    parser.add_argument(
        "--enable-file-logging",
        action="store_true",
        help="Enable file logging (disabled by default)"
    )

    parser.add_argument(
        "--commit-id",
        type=str,
        help="Git commit ID for metadata (if not provided, will auto-detect from git)"
    )

    args = parser.parse_args()

    # Setup logging
    logger = setup_logging(args.log_level, args.enable_file_logging)

    logger.info("Starting benchmark discovery and execution")
    logger.info(f"Output directory: {args.output_dir}")
    logger.info(f"Benches directory: {args.benches_dir}")

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    try:
        # Discover benchmarks
        benchmarks = discover_benchmarks(args.benches_dir)
        logger.info(f"Discovered {len(benchmarks)} benchmark(s): {[b['name'] for b in benchmarks]}")

        if not benchmarks:
            logger.warning("No benchmarks found!")
            return 1

        # Filter benchmarks based on include/exclude
        filtered_benchmarks = benchmarks

        if args.include:
            filtered_benchmarks = [b for b in filtered_benchmarks
                                   if any(pattern in b['name'] for pattern in args.include)]
            logger.info(f"Filtered to include: {[b['name'] for b in filtered_benchmarks]}")

        if args.exclude:
            filtered_benchmarks = [b for b in filtered_benchmarks
                                   if not any(pattern in b['name'] for pattern in args.exclude)]
            logger.info(f"After exclusion: {[b['name'] for b in filtered_benchmarks]}")

        if not filtered_benchmarks:
            logger.warning("No benchmarks remaining after filtering!")
            return 1

        # Prepare common kwargs for benchmarks
        benchmark_kwargs = {
            'warmup_iterations': args.warmup_iterations,
            'measurement_iterations': args.measurement_iterations,
            'num_tokens_to_generate': args.num_tokens_to_generate
        }

        if args.model_id:
            benchmark_kwargs['model_id'] = args.model_id

        # Add enable_mock flag for mock benchmark
        benchmark_kwargs['enable_mock'] = args.enable_mock

        # Add commit_id if provided
        if args.commit_id:
            benchmark_kwargs['commit_id'] = args.commit_id

        # Run benchmarks
        benchmark_results = {}
        successful_count = 0

        for benchmark_info in filtered_benchmarks:
            result = run_single_benchmark(
                benchmark_info,
                args.output_dir,
                logger,
                **benchmark_kwargs
            )

            benchmark_results[benchmark_info['name']] = result

            if result is not None:
                successful_count += 1

        # Generate summary report
        summary_file = generate_summary_report(args.output_dir, benchmark_results, logger)

        # Final summary
        total_benchmarks = len(filtered_benchmarks)
        failed_count = total_benchmarks - successful_count

        logger.info("=" * 60)
        logger.info("BENCHMARK RUN SUMMARY")
        logger.info("=" * 60)
        logger.info(f"Total benchmarks: {total_benchmarks}")
        logger.info(f"Successful: {successful_count}")
        logger.info(f"Failed: {failed_count}")
        logger.info(f"Output directory: {args.output_dir}")
        logger.info(f"Summary report: {summary_file}")

        if failed_count > 0:
            logger.warning(f"{failed_count} benchmark(s) failed. Check logs for details.")
            return 1
        else:
            logger.info("All benchmarks completed successfully!")
            return 0

    except Exception as e:
        logger.error(f"Benchmark run failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return 1


if __name__ == "__main__":
    sys.exit(main())

conftest.py
@@ -23,17 +23,12 @@ from os.path import abspath, dirname, join
 import _pytest
 import pytest
 
-from transformers.testing_utils import (
-    HfDoctestModule,
-    HfDocTestParser,
-    is_torch_available,
-    patch_torch_compile_force_graph,
-)
+from transformers.testing_utils import HfDoctestModule, HfDocTestParser
 
 
 NOT_DEVICE_TESTS = {
     "test_tokenization",
-    "test_tokenization_mistral_common",
+    "test_processor",
     "test_processing",
     "test_beam_constraints",
     "test_configuration_utils",
@@ -88,8 +83,6 @@ def pytest_configure(config):
     config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
     config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
     config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
-    config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
-    config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
 
 
 def pytest_collection_modifyitems(items):
@@ -134,14 +127,3 @@ class CustomOutputChecker(OutputChecker):
 doctest.OutputChecker = CustomOutputChecker
 _pytest.doctest.DoctestModule = HfDoctestModule
 doctest.DocTestParser = HfDocTestParser
-
-if is_torch_available():
-    import torch
-
-    # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
-    # We set it to `False` for CI. See https://github.com/pytorch/pytorch/issues/157274#issuecomment-3090791615
-    torch.backends.cudnn.allow_tf32 = False
-
-    # patch `torch.compile`: if `TORCH_COMPILE_FORCE_FULLGRAPH=1` (or values considered as true, e.g. yes, y, etc.),
-    # the patched version will always run with `fullgraph=True`.
-    patch_torch_compile_force_graph()
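The `patch_torch_compile_force_graph` helper removed above is only described by its comment here; its actual implementation in `transformers.testing_utils` is not shown in this diff. A minimal sketch of what such a patch could look like, assuming it simply wraps `torch.compile`:

```python
import functools
import os

import torch


def patch_torch_compile_force_graph():
    """Hypothetical sketch: force fullgraph=True in every torch.compile call
    when TORCH_COMPILE_FORCE_FULLGRAPH is set to a truthy value."""
    if os.environ.get("TORCH_COMPILE_FORCE_FULLGRAPH", "").lower() not in {"1", "true", "yes", "y"}:
        return

    original_compile = torch.compile

    @functools.wraps(original_compile)
    def compile_with_fullgraph(*args, **kwargs):
        kwargs["fullgraph"] = True  # override whatever the caller passed
        return original_compile(*args, **kwargs)

    torch.compile = compile_with_fullgraph
```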
@@ -4,7 +4,7 @@ USER root
 ARG REF=main
 RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
-RUN pip install uv && uv pip install --no-cache-dir -U pip setuptools GitPython
+RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
 RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 # tensorflow pin matching setup.py
 RUN uv pip install --no-cache-dir pypi-kenlm
@@ -2,9 +2,9 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler git-lfs curl
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler
 ENV UV_PYTHON=/usr/local/bin/python
-RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
 
 RUN wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz
 RUN tar xvf jumanpp-2.0.0-rc3.tar.xz
@@ -15,20 +15,12 @@ RUN mv catch.hpp ../libs/
 RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
 RUN make install -j 10
 
-WORKDIR /
-
 RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,spacy,ftfy,rjieba]" unidic unidic-lite
+RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
 # spacy is not used so not tested. Causes to failures. TODO fix later
-RUN uv run python -m unidic download
+RUN python3 -m unidic download
 
-# fetch test data and hub objects within CircleCI docker images to reduce even more connections
-# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
-# the data are downloaded to the directory `/test_data` and during CircleCI's CI runtime, we need to move them to the root of `transformers`
-RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
-
-
 RUN uv pip uninstall transformers
 
 RUN apt-get clean && rm -rf /var/lib/apt/lists/*

docker/examples-tf.dockerfile (new file)
@@ -0,0 +1,13 @@
+FROM python:3.9-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
+USER root
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git
+RUN apt-get install -y g++ cmake
+ENV UV_PYTHON=/usr/local/bin/python
+RUN pip --no-cache-dir install uv && uv venv
+RUN uv pip install --no-cache-dir -U pip setuptools albumentations seqeval
+RUN uv pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
+RUN uv pip install --no-cache-dir "protobuf==3.20.3"
+RUN uv pip uninstall transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
|
|||||||
ENV PYTHONDONTWRITEBYTECODE=1
|
ENV PYTHONDONTWRITEBYTECODE=1
|
||||||
ARG REF=main
|
ARG REF=main
|
||||||
USER root
|
USER root
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
|
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
|
||||||
ENV UV_PYTHON=/usr/local/bin/python
|
ENV UV_PYTHON=/usr/local/bin/python
|
||||||
RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
|
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||||
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
|
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
|
||||||
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||||
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
|
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
|
||||||
|
|
||||||
# fetch test data and hub objects within CircleCI docker images to reduce even more connections
|
|
||||||
# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
|
|
||||||
# the data are downloaded to the directory `/test_data` and during CircleCI's CI runtime, we need to move them to the root of `transformers`
|
|
||||||
RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
|
|
||||||
|
|
||||||
|
|
||||||
RUN uv pip uninstall transformers
|
RUN uv pip uninstall transformers
|
||||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
|
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||||
|
@@ -2,23 +2,16 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1 g++ tesseract-ocr git-lfs curl
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
 ENV UV_PYTHON=/usr/local/bin/python
-RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
 RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir --no-deps timm accelerate
-RUN uv pip install -U --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
+RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
 # RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
 RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose' 'dataset'
 # RUN git clone https://github.com/facebookresearch/detectron2.git
 # RUN python3 -m pip install --no-cache-dir -e detectron2
 RUN uv pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3' --no-build-isolation
 
-# fetch test data and hub objects within CircleCI docker images to reduce even more connections
-# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
-# the data are downloaded to the directory `/test_data` and during CircleCI's CI runtime, we need to move them to the root of `transformers`
-RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
-
-
 RUN uv pip uninstall transformers
 RUN apt-get clean && rm -rf /var/lib/apt/lists/*

docker/jax-light.dockerfile (new file)
@@ -0,0 +1,10 @@
+FROM python:3.9-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
+USER root
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake
+ENV UV_PYTHON=/usr/local/bin/python
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]"
+RUN uv pip uninstall transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean

docker/pipeline-tf.dockerfile (new file)
@@ -0,0 +1,10 @@
+FROM python:3.9-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
+USER root
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake g++
+ENV UV_PYTHON=/usr/local/bin/python
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]"
+RUN uv pip install --no-cache-dir "protobuf==3.20.3" tensorflow_probability
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
@@ -2,17 +2,10 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg curl
+RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
-RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
-
-# fetch test data and hub objects within CircleCI docker images to reduce even more connections
-# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
-# the data are downloaded to the directory `/test_data` and during CircleCI's CI runtime, we need to move them to the root of `transformers`
-RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
-
-
 RUN uv pip uninstall transformers
@@ -2,8 +2,8 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
 RUN apt-get update && apt-get install -y time git
 ENV UV_PYTHON=/usr/local/bin/python
-RUN pip install uv
+RUN pip install uv && uv venv
 RUN uv pip install --no-cache-dir -U pip setuptools GitPython "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ruff]" urllib3
 RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/*

docker/tf-light.dockerfile (new file)
@@ -0,0 +1,12 @@
+FROM python:3.9-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
+USER root
+RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ pkg-config openssh-client git
+RUN apt-get install -y cmake
+ENV UV_PYTHON=/usr/local/bin/python
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN uv pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
+RUN uv pip install --no-cache-dir "protobuf==3.20.3"
+RUN uv pip uninstall transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean

docker/torch-jax-light.dockerfile (new file)
@@ -0,0 +1,16 @@
+FROM python:3.9-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
+USER root
+RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
+ENV UV_PYTHON=/usr/local/bin/python
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN uv pip install --no-deps accelerate
+RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]"
+
+
+# RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax,testing,sentencepiece,flax-speech,vision]"
+
+RUN uv pip uninstall transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
@@ -2,16 +2,10 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
+RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
-RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
-
-# fetch test data and hub objects within CircleCI docker images to reduce even more connections
-# we don't need a full clone of `transformers` to run `fetch_hub_objects_for_ci.py`
-# the data are downloaded to the directory `/test_data` and during CircleCI's CI runtime, we need to move them to the root of `transformers`
-RUN mkdir test_data && cd test_data && curl -O https://raw.githubusercontent.com/huggingface/transformers/${REF}/utils/fetch_hub_objects_for_ci.py && python3 fetch_hub_objects_for_ci.py
-
 RUN uv pip uninstall transformers

docker/torch-tf-light.dockerfile (new file)
@@ -0,0 +1,19 @@
+FROM python:3.9-slim
+ENV PYTHONDONTWRITEBYTECODE=1
+ARG REF=main
+RUN echo ${REF}
+USER root
+RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
+ENV UV_PYTHON=/usr/local/bin/python
+RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
+RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN git lfs install
+
+RUN uv pip install --no-cache-dir pypi-kenlm
+RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,sentencepiece,vision,testing]"
+RUN uv pip install --no-cache-dir "protobuf==3.20.3" librosa
+
+
+RUN uv pip uninstall transformers
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
 # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
 # to be used as arguments for docker build (so far).
 
-ARG PYTORCH='2.8.0'
+ARG PYTORCH='2.7.1'
 # Example: `cu102`, `cu113`, etc.
 ARG CUDA='cu126'
 # Disable kernel mapping for now until all tests pass
@@ -26,16 +26,11 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
 # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
 # 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
 #    Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability
 
 RUN python3 -m pip uninstall -y flax jax
 
-RUN python3 -m pip install --no-cache-dir -U timm
-
-RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"
-
-RUN python3 -m pip install --no-cache-dir pytesseract
+RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 
 RUN python3 -m pip install -U "itsdangerous<2.1.0"
 
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
@@ -44,8 +39,6 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef
 
 # For bettertransformer
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
-# For kernels
-RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/kernels@main#egg=kernels
 
 # For video model testing
 RUN python3 -m pip install --no-cache-dir av
@@ -56,14 +49,15 @@ RUN python3 -m pip install --no-cache-dir bitsandbytes
 # Some tests require quanto
 RUN python3 -m pip install --no-cache-dir quanto
 
-# After using A10 as CI runner, let's run FA2 tests
-RUN [ "$PYTORCH" != "pre" ] && python3 -m pip uninstall -y ninja && python3 -m pip install --no-cache-dir ninja && python3 -m pip install flash-attn --no-cache-dir --no-build-isolation || echo "Don't install FA2 with nightly torch"
-
-# TODO (ydshieh): check this again
 # `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests
 # (`deformable_detr`, `rwkv`, `mra`)
 RUN python3 -m pip uninstall -y ninja
 
+# For `dinat` model
+# The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent)
+# pin `0.17.4` otherwise `cannot import name 'natten2dav' from 'natten.functional'`
+RUN python3 -m pip install --no-cache-dir natten==0.17.4+torch250cu121 -f https://shi-labs.com/natten/wheels
+
 # For `nougat` tokenizer
 RUN python3 -m pip install --no-cache-dir python-Levenshtein
 
@@ -17,7 +17,6 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
     jupyter \
     tensorflow \
     torch
-RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/kernels@main#egg=kernels
 
 RUN git clone https://github.com/NVIDIA/apex
 RUN cd apex && \
@@ -1,4 +1,4 @@
-FROM rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
+FROM rocm/pytorch:rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.6.0
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
@@ -20,10 +20,8 @@ WORKDIR /
 ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
 
-# Install transformers
-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video,audio]
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
 
-# Remove tensorflow and flax as they are no longer supported by transformers
 RUN python3 -m pip uninstall -y tensorflow flax
 
 # When installing in editable mode, `transformers` is not recognized as a package.
@@ -34,7 +32,4 @@ RUN cd transformers && python3 setup.py develop
 RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y
 
 # `kernels` may causes many failing tests
 RUN python3 -m pip uninstall -y kernels
-
-# On ROCm, torchcodec is required to decode audio files and 0.4 or 0.6 fails
-RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
@@ -4,7 +4,7 @@ LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
 
-ARG PYTORCH='2.8.0'
+ARG PYTORCH='2.7.1'
 # Example: `cu102`, `cu113`, etc.
 ARG CUDA='cu126'
 
@@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p
 # Install latest release PyTorch
 # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
 # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
-RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
+RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
 
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
 
@@ -19,7 +19,7 @@ RUN python3 -m pip uninstall -y torch torchvision torchaudio
 # Install **nightly** release PyTorch (flag `--pre`)
 # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
 # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
-RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
+RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
 
 # `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
 RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'
@@ -11,7 +11,7 @@ ARG REF=main
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
 
 # If set to nothing, will install the latest version
-ARG PYTORCH='2.8.0'
+ARG PYTORCH='2.7.1'
 ARG TORCH_VISION=''
 ARG TORCH_AUDIO=''
 # Example: `cu102`, `cu113`, etc.
@@ -1,93 +0,0 @@
-FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu22.04 AS base
-LABEL maintainer="Hugging Face"
-
-SHELL ["/bin/bash", "-c"]
-
-ARG PYTHON_VER=3.11
-ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get remove -y python3.10 && apt-get autoremove -y
-RUN apt-get update && \
-    apt-get install -y software-properties-common && \
-    add-apt-repository -y ppa:deadsnakes/ppa && \
-    apt-get update && \
-    apt-get install -y python$PYTHON_VER python$PYTHON_VER-dev python3-pip && \
-    ln -sf /usr/bin/python$PYTHON_VER /usr/bin/python3 && \
-    ln -sf /usr/bin/python3 /usr/bin/python && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN apt-get update && \
-    apt-get -y install \
-    apt-utils \
-    build-essential \
-    ca-certificates \
-    clinfo \
-    curl \
-    git \
-    git-lfs \
-    vim \
-    numactl \
-    gnupg2 \
-    gpg-agent \
-    zlib1g-dev \
-    rsync \
-    sudo \
-    libnl-genl-3-200 \
-    xpu-smi \
-    unzip \
-    ffmpeg \
-    tesseract-ocr \
-    espeak-ng \
-    wget \
-    ncurses-term && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-
-RUN apt-get update && \
-    apt-get install -y \
-    linux-headers-$(uname -r) \
-    linux-modules-extra-$(uname -r) \
-    flex bison \
-    intel-fw-gpu intel-i915-dkms xpu-smi \
-    intel-opencl-icd libze-intel-gpu1 libze1 \
-    intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
-    libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
-    libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
-    mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc \
-    libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN pip install --upgrade pip
-RUN pip install triton==3.3.0
-
-RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
-
-RUN pip install evaluate torchdata pyctcdecode pytesseract decord galore-torch fire scipy scikit-learn sentencepiece sacremoses nltk rouge_score librosa soundfile g2p_en mpi4py requests_mock
-RUN pip install pretty_midi essentia resampy Levenshtein av sacrebleu phonemizer invisible_watermark schedulefree
-RUN pip install gguf hqq compressed_tensors gptqmodel mergekit autoawq deepspeed torchao onnx
-RUN pip install hf_transfer huggingface-hub hf-doc-builder datasets optimum-quanto timm transformers accelerate optimum peft
-
-RUN pip install git+https://github.com/linkedin/Liger-Kernel.git --extra-index-url https://download.pytorch.org/whl/test/xpu
-
-# install bitsandbytes
-RUN pip install git+https://github.com/bitsandbytes-foundation/bitsandbytes.git
-
-ENV OCL_ICD_VENDORS=/etc/OpenCL/vendors
-ENV FI_PROVIDER_PATH=${I_MPI_ROOT}/lib/libfabric/prov:/usr/lib/x86_64-linux-gnu/libfabric
-ENV CCL_ROOT=/usr/local
-ENV CCL_ATL_TRANSPORT=ofi
-ENV I_MPI_ROOT=/usr/local
-ENV CLASSPATH=${I_MPI_ROOT}/lib/mpi.jar
-ENV PATH=${I_MPI_ROOT}/bin/libfabric:${PATH}
-ENV LD_LIBRARY_PATH=${I_MPI_ROOT}/lib/libfabric:${LD_LIBRARY_PATH}
-
-RUN touch /entrypoint.sh
-RUN chmod +x /entrypoint.sh
-RUN echo "#!/bin/bash" >> /entrypoint.sh
-RUN echo "source /opt/intel/oneapi/setvars.sh --force && /bin/bash" >> /entrypoint.sh
-
-ENTRYPOINT ["/entrypoint.sh"]
@@ -26,7 +26,7 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch';
 RUN echo torch=$VERSION
 # `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
 # Currently, let's just use their latest releases (when `torch` is installed with a release version)
-RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
+RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
 
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
 
@@ -78,10 +78,6 @@ RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submod
 # RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1
 # RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git
 
-# Add fp-quant for quantization testing
-# Requires py3.11 but our CI runs on 3.9
-# RUN python3 -m pip install --no-cache-dir "fp-quant>=0.1.6"
-
 # Add compressed-tensors for quantization testing
 RUN python3 -m pip install --no-cache-dir compressed-tensors
 
@@ -97,9 +93,6 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
 # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
 RUN python3 -m pip uninstall -y kernels
 
-# Uninstall flash-attn installed by autoawq, it causes issues here : https://github.com/huggingface/transformers/actions/runs/15915442841/job/44892146131
-RUN python3 -m pip uninstall -y flash-attn
-
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
@@ -20,21 +20,22 @@ To generate the documentation, you first have to build it. Several packages are
 you can install them with the following command, at the root of the code repository:
 
 ```bash
-pip install -e ".[dev]"
+pip install -e ".[docs]"
 ```
 
-> [!NOTE]
-> This command might fail for some OS that are missing dependencies. Check step 4 in [Create a Pull Request](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#create-a-pull-request) to workaround it.
-
 Then you need to install our special tool that builds the documentation:
 
 ```bash
 pip install git+https://github.com/huggingface/doc-builder
 ```
 
-> [!NOTE]
-> You only need to generate the documentation to inspect it locally (if you're planning changes and want to
-> check how they look before committing for instance). You don't have to commit the built documentation.
+---
+**NOTE**
+
+You only need to generate the documentation to inspect it locally (if you're planning changes and want to
+check how they look before committing for instance). You don't have to commit the built documentation.
+
+---
 
 ## Building the documentation
 
@@ -71,8 +72,12 @@ doc-builder preview transformers docs/source/en/
 
 The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment to a link where the documentation with your changes lives.
 
-> [!NOTE]
-> The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` & restart `preview` command (`ctrl-c` to stop it & call `doc-builder preview ...` again).
+---
+**NOTE**
+
+The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` & restart `preview` command (`ctrl-c` to stop it & call `doc-builder preview ...` again).
+
+---
 
 ## Adding a new element to the navigation bar
 
@@ -159,9 +164,6 @@ These classes should be added using our Markdown syntax. Usually as follows:
 [[autodoc]] XXXConfig
 ```
 
-> [!IMPORTANT]
-> Always add a blank line after `[[autodoc]]` to ensure it passes the CI/CD checks.
-
 This will include every public method of the configuration that is documented. If for some reason you wish for a method
 not to be displayed in the documentation, you can do so by specifying which methods should be in the docs:
 
@@ -304,7 +304,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 checkpoint = "NousResearch/Hermes-2-Pro-Llama-3-8B"
 
 tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.bfloat16, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto")
 
 ```python
 messages = [
@@ -25,7 +25,7 @@ chat = [
 import torch
 from transformers import pipeline
 
-pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, device_map="auto")
+pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
 response = pipe(chat, max_new_tokens=512)
 print(response[0]['generated_text'][-1]['content'])
 ```
@@ -126,7 +126,7 @@ chat = [
 ]
 
 # 1: Load the model and tokenizer
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", dtype=torch.bfloat16)
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
 
 # 2: Apply the chat template
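The steps after "2: Apply the chat template" fall outside this hunk. A minimal sketch of what that part of the guide implies, with illustrative values rather than the guide's exact snippet:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

chat = [{"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}]

# 2: apply the chat template to turn the message list into a single prompt string
formatted = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# 3: tokenize, generate a reply, and decode only the newly generated tokens
inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```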
@@ -164,7 +164,7 @@ print("Decoded output:\n", decoded_output)
 
 ### Memory considerations
 
-By default, Hugging Face classes like [`TextGenerationPipeline`] or [`AutoModelForCausalLM`] load the model in "float32" precision. This means it needs 4 bytes (32 bits) per parameter, so an "8B" model with 8 billion parameters will need ~32GB of memory. However, this can be wasteful! Most modern language models are trained in "bfloat16" precision, which uses only 2 bytes per parameter. If your hardware supports it (Nvidia 30xx/Axxx or newer), you can load the model in "bfloat16" precision, using the "dtype" argument as we did above.
+By default, Hugging Face classes like [`TextGenerationPipeline`] or [`AutoModelForCausalLM`] load the model in "float32" precision. This means it needs 4 bytes (32 bits) per parameter, so an "8B" model with 8 billion parameters will need ~32GB of memory. However, this can be wasteful! Most modern language models are trained in "bfloat16" precision, which uses only 2 bytes per parameter. If your hardware supports it (Nvidia 30xx/Axxx or newer), you can load the model in "bfloat16" precision, using the "torch_dtype" argument as we did above.
 
 It is also possible to go below 16 bits using "quantization", a method that compresses the model weights in a lossy way. This allows each parameter to be squeezed down to 8 bits, 4 bits, or even less. Note that, especially at 4 bits, the model's output quality may be negatively affected, but this is often a trade-off worth making to fit a larger and more capable chat model in memory. Let's see how we can apply this using the `bitsandbytes` library:
 
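The `bitsandbytes` snippet itself is outside this hunk; a minimal 4-bit loading sketch consistent with the paragraph above (model name and settings are illustrative, not taken from the guide verbatim):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Store the weights in 4 bits, but run the matmuls in bfloat16
quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="auto",
    quantization_config=quant_config,
)
```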
@@ -280,7 +280,7 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict())
 Now, to push the model to the Hub, make sure you are logged in. Either run this in your terminal:
 
 ```bash
-hf auth login
+huggingface-cli login
 ```
 
 Or from a notebook:
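The notebook alternative referenced by "Or from a notebook:" is not shown in this hunk; it is typically the `huggingface_hub` login helper:

```python
# Prompts for a Hugging Face access token inside Jupyter/Colab
from huggingface_hub import notebook_login

notebook_login()
```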
@@ -13,11 +13,11 @@
 
 In this guide, we will go over the effective techniques for efficient LLM deployment:
 
-1. **Lower Precision:** Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](/main_classes/quantization), can achieve computational advantages without a considerable decline in model performance.
+1. **Lower Precision:** Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](/main_classes/quantization.md), can achieve computational advantages without a considerable decline in model performance.
 
 2. **Flash Attention:** Flash Attention is a variation of the attention algorithm that not only provides a more memory-efficient approach, but also realizes increased efficiency due to optimized GPU memory utilization.
 
-3. **Architectural Innovations:** Since LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. The most important advancements in model architectures here are [ALiBi](https://huggingface.co/papers/2108.12409), [Rotary embeddings](https://huggingface.co/papers/2104.09864), [Multi-Query Attention (MQA)](https://huggingface.co/papers/1911.02150) and [Grouped-Query-Attention (GQA)](https://huggingface.co/papers/2305.13245).
+3. **Architectural Innovations:** Since LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. The most important advancements in model architectures here are [ALiBi](https://huggingface.co/papers/2108.12409), [Rotary embeddings](https://huggingface.co/papers/2104.09864), [Multi-Query Attention (MQA)](https://huggingface.co/papers/1911.02150) and [Grouped-Query-Attention (GQA)]((https://huggingface.co/papers/2305.13245)).
 
 Throughout this guide, we will offer an analysis of autoregressive generation from a tensor's perspective. We take a deep dive into the pros and cons of adopting lower precision, present a comprehensive exploration of the latest attention algorithms, and discuss improved LLM architectures. We support the explanations with practical examples highlighting each optimization.
 
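To make the first two techniques concrete, here is a small illustrative loader (not taken from the guide; the argument names assume a recent transformers release and an installed flash-attn package):

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "bigcode/octocoder",
    device_map="auto",
    torch_dtype=torch.bfloat16,               # 1. lower precision: bfloat16 instead of float32
    attn_implementation="flash_attention_2",  # 2. Flash Attention kernels, when available
)
```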
@@ -73,7 +73,7 @@ model = AutoModelForCausalLM.from_pretrained("bigscience/bloom", device_map="aut
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 import torch
 
-model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", dtype=torch.bfloat16, device_map="auto", pad_token_id=0)
+model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto", pad_token_id=0)
 tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")
 
 pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
@@ -114,7 +114,7 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
 
 > Almost all models are trained in bfloat16 nowadays; there is no reason to run the model in full float32 precision if [your GPU supports bfloat16](https://discuss.pytorch.org/t/bfloat16-native-support/117155/5). Float32 won't give better inference results than the precision that was used to train the model.
 
-If you're unsure in which format the model weights are stored on the Hub, you can always look into the checkpoint's configuration under `"dtype"`, e.g. [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9/config.json#L21). It is recommended to set the model to the same precision type as written in the configuration when loading with `from_pretrained(..., dtype=...)`, except when the original type is float32, in which case `float16` or `bfloat16` can be used for inference.
+If you're unsure in which format the model weights are stored on the Hub, you can always look into the checkpoint's configuration under `"torch_dtype"`, e.g. [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9/config.json#L21). It is recommended to set the model to the same precision type as written in the configuration when loading with `from_pretrained(..., torch_dtype=...)`, except when the original type is float32, in which case `float16` or `bfloat16` can be used for inference.
 
 
 Let's define a `flush(...)` function to free all allocated memory so that we can accurately measure the peak allocated GPU memory.
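The body of `flush(...)` is outside this hunk; a plausible sketch of such a helper is:

```python
import gc

import torch


def flush():
    # Release Python-side references and CUDA caches, and reset the peak-memory
    # counter so that torch.cuda.max_memory_allocated() reflects only the next run.
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
```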
@@ -389,7 +389,7 @@ long_prompt = 10 * system_prompt + prompt
 We run our model again in bfloat16 precision.
 
 ```python
-model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", dtype=torch.bfloat16, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto")
 tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")
 
 pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
@@ -41,7 +41,7 @@ picture-in-picture" allowfullscreen></iframe>
 Before sharing a model on the Hub, you will need your Hugging Face account credentials. If you have access to a terminal, run the following command in the virtual environment where 🤗 Transformers is installed. This will store your access token in your Hugging Face cache folder (`~/.cache/` by default):
 
 ```bash
-hf auth login
+huggingface-cli login
 ```
 
 If you are using a notebook like Jupyter or Colaboratory, make sure you have the [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) library installed. This library allows you to interact programmatically with the Hub.
@@ -39,6 +39,7 @@
 | [How to fine-tune a model on summarization](https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on XSUM. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)|
 | [How to train a language model from scratch](https://github.com/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train a Transformer model on custom data | [](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)|
 | [How to generate text](https://github.com/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| How to use different decoding methods for language generation with transformers | [](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)|
+| [How to generate text (with constraints)](https://github.com/huggingface/blog/blob/main/notebooks/53_constrained_beam_search.ipynb)| How to guide language generation with user-provided constraints | [](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/53_constrained_beam_search.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/53_constrained_beam_search.ipynb)|
 | [Reformer](https://github.com/huggingface/blog/blob/main/notebooks/03_reformer.ipynb)| How Reformer pushes the limits of language modeling | [](https://colab.research.google.com/github/patrickvonplaten/blog/blob/main/notebooks/03_reformer.ipynb)| [](https://studiolab.sagemaker.aws/import/github/patrickvonplaten/blog/blob/main/notebooks/03_reformer.ipynb)|
 
 #### Computer Vision[[pytorch-cv]]
@ -90,7 +90,7 @@ out = transcriber(...) # سيتم الرجوع إلى استخدام `my_parame
|
|||||||
transcriber = pipeline(model="openai/whisper-large-v2", device=0)
|
transcriber = pipeline(model="openai/whisper-large-v2", device=0)
|
||||||
```
|
```
|
||||||
|
|
||||||
إذا كان النموذج كبيرًا جدًا بالنسبة لوحدة معالجة الرسومات (GPU) واحدة، وأنت تستخدم PyTorch، فيمكنك تعيين `dtype='float16'` لتمكين الاستدلال بدقة FP16. عادةً ما لا يتسبب ذلك في حدوث انخفاضات كبيرة في الأداء، ولكن تأكد من تقييمه على نماذجك!
|
إذا كان النموذج كبيرًا جدًا بالنسبة لوحدة معالجة الرسومات (GPU) واحدة، وأنت تستخدم PyTorch، فيمكنك تعيين `torch_dtype='float16'` لتمكين الاستدلال بدقة FP16. عادةً ما لا يتسبب ذلك في حدوث انخفاضات كبيرة في الأداء، ولكن تأكد من تقييمه على نماذجك!
|
||||||
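A minimal sketch of half-precision pipeline inference (use `dtype=` on newer releases, `torch_dtype=` on older ones):

```python
import torch
from transformers import pipeline

# FP16 inference for a model that is tight on GPU memory.
transcriber = pipeline(
    model="openai/whisper-large-v2",
    device=0,
    torch_dtype=torch.float16,  # newer versions: dtype=torch.float16
)
```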
|
|
||||||
بدلاً من ذلك، يمكنك تعيين `device_map="auto"` لتحديد كيفية تحميل مخزنات النموذج وتخزينها تلقائيًا. يتطلب استخدام معامل `device_map` مكتبة 🤗 [Accelerate](https://huggingface.co/docs/accelerate):
|
بدلاً من ذلك، يمكنك تعيين `device_map="auto"` لتحديد كيفية تحميل مخزنات النموذج وتخزينها تلقائيًا. يتطلب استخدام معامل `device_map` مكتبة 🤗 [Accelerate](https://huggingface.co/docs/accelerate):
|
||||||
|
|
||||||
@ -273,7 +273,7 @@ pip install pytesseract
|
|||||||
import torch
|
import torch
|
||||||
from transformers import pipeline
|
from transformers import pipeline
|
||||||
|
|
||||||
pipe = pipeline(model="facebook/opt-1.3b", dtype=torch.bfloat16, device_map="auto")
|
pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
|
||||||
output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
|
output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
|
|||||||
يمكن لجميع النصوص البرمجية رفع نموذجك النهائي إلى [مركز النماذج](https://huggingface.co/models). تأكد من تسجيل الدخول إلى Hugging Face قبل البدء:
|
يمكن لجميع النصوص البرمجية رفع نموذجك النهائي إلى [مركز النماذج](https://huggingface.co/models). تأكد من تسجيل الدخول إلى Hugging Face قبل البدء:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
hf auth login
|
huggingface-cli login
|
||||||
```
|
```
|
||||||
|
|
||||||
ثم أضف المعلمة `push_to_hub` إلى النص البرمجي. ستقوم هذه المعلمة بإنشاء مستودع باستخدام اسم مستخدم Hugging Face واسم المجلد المحدد في `output_dir`.
|
ثم أضف المعلمة `push_to_hub` إلى النص البرمجي. ستقوم هذه المعلمة بإنشاء مستودع باستخدام اسم مستخدم Hugging Face واسم المجلد المحدد في `output_dir`.
|
||||||
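For instance, with the summarization script referenced in this hunk, the relevant flags might look like the sketch below (illustrative values only; the script still needs its usual dataset and training arguments):

```bash
python examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path google-t5/t5-small \
    --do_train \
    --output_dir my-awesome-summarizer \
    --push_to_hub   # creates <your-username>/my-awesome-summarizer on the Hub
```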
|
@ -56,7 +56,7 @@ Dateien lassen sich auch in einem Repository leicht bearbeiten, und Sie können
|
|||||||
Bevor Sie ein Modell für den Hub freigeben, benötigen Sie Ihre Hugging Face-Anmeldedaten. Wenn Sie Zugang zu einem Terminal haben, führen Sie den folgenden Befehl in der virtuellen Umgebung aus, in der 🤗 Transformers installiert ist. Dadurch werden Ihre Zugangsdaten in Ihrem Hugging Face-Cache-Ordner (standardmäßig `~/.cache/`) gespeichert:
|
Bevor Sie ein Modell für den Hub freigeben, benötigen Sie Ihre Hugging Face-Anmeldedaten. Wenn Sie Zugang zu einem Terminal haben, führen Sie den folgenden Befehl in der virtuellen Umgebung aus, in der 🤗 Transformers installiert ist. Dadurch werden Ihre Zugangsdaten in Ihrem Hugging Face-Cache-Ordner (standardmäßig `~/.cache/`) gespeichert:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
hf auth login
|
huggingface-cli login
|
||||||
```
|
```
|
||||||
|
|
||||||
Wenn Sie ein Notebook wie Jupyter oder Colaboratory verwenden, stellen Sie sicher, dass Sie die [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) Bibliothek installiert haben. Diese Bibliothek ermöglicht Ihnen die programmatische Interaktion mit dem Hub.
|
Wenn Sie ein Notebook wie Jupyter oder Colaboratory verwenden, stellen Sie sicher, dass Sie die [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) Bibliothek installiert haben. Diese Bibliothek ermöglicht Ihnen die programmatische Interaktion mit dem Hub.
|
||||||
|
@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
|
|||||||
Alle Skripte können Ihr endgültiges Modell in den [Model Hub](https://huggingface.co/models) hochladen. Stellen Sie sicher, dass Sie bei Hugging Face angemeldet sind, bevor Sie beginnen:
|
Alle Skripte können Ihr endgültiges Modell in den [Model Hub](https://huggingface.co/models) hochladen. Stellen Sie sicher, dass Sie bei Hugging Face angemeldet sind, bevor Sie beginnen:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
hf auth login
|
huggingface-cli login
|
||||||
```
|
```
|
||||||
|
|
||||||
Dann fügen Sie dem Skript das Argument `push_to_hub` hinzu. Mit diesem Argument wird ein Repository mit Ihrem Hugging Face-Benutzernamen und dem in `output_dir` angegebenen Ordnernamen erstellt.
|
Dann fügen Sie dem Skript das Argument `push_to_hub` hinzu. Mit diesem Argument wird ein Repository mit Ihrem Hugging Face-Benutzernamen und dem in `output_dir` angegebenen Ordnernamen erstellt.
|
||||||
|
@ -473,6 +473,13 @@ Hier ist zum Beispiel ein Test, der nur ausgeführt werden muss, wenn 2 oder meh
|
|||||||
def test_example_with_multi_gpu():
|
def test_example_with_multi_gpu():
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Wenn ein Test `tensorflow` benötigt, verwenden Sie den Dekorator `require_tf`. Zum Beispiel:
|
||||||
|
|
||||||
|
```python no-style
|
||||||
|
@require_tf
|
||||||
|
def test_tf_thing_with_tensorflow():
|
||||||
|
```
|
||||||
|
|
||||||
Diese Dekors können gestapelt werden. Wenn zum Beispiel ein Test langsam ist und mindestens eine GPU unter pytorch benötigt, können Sie
|
Diese Dekors können gestapelt werden. Wenn zum Beispiel ein Test langsam ist und mindestens eine GPU unter pytorch benötigt, können Sie
|
||||||
wie Sie ihn einrichten können:
|
wie Sie ihn einrichten können:
|
||||||
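The stacked-decorator setup that sentence refers to typically looks like the sketch below (assuming the `require_torch_gpu` and `slow` markers from `transformers.testing_utils`):

```python no-style
@require_torch_gpu
@slow
def test_example_slow_on_gpu():
    ...
```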
|
|
||||||
@ -1197,6 +1204,9 @@ if torch.cuda.is_available():
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
np.random.seed(seed)
|
np.random.seed(seed)
|
||||||
|
|
||||||
|
# tf RNG
|
||||||
|
tf.random.set_seed(seed)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Tests debuggen
|
### Tests debuggen
|
||||||
|
@ -17,12 +17,12 @@
|
|||||||
title: Customizing model components
|
title: Customizing model components
|
||||||
- local: model_sharing
|
- local: model_sharing
|
||||||
title: Sharing
|
title: Sharing
|
||||||
- local: modular_transformers
|
|
||||||
title: Contributing a new model to Transformers
|
|
||||||
- local: add_new_model
|
- local: add_new_model
|
||||||
title: Legacy model contribution
|
title: Adding a new model to Transformers
|
||||||
|
- local: modular_transformers
|
||||||
|
title: Modular Transformers
|
||||||
- local: auto_docstring
|
- local: auto_docstring
|
||||||
title: Documenting a model
|
title: Document your models
|
||||||
- local: attention_interface
|
- local: attention_interface
|
||||||
title: Customizing attention function
|
title: Customizing attention function
|
||||||
title: Models
|
title: Models
|
||||||
@ -72,6 +72,8 @@
|
|||||||
title: Caching
|
title: Caching
|
||||||
- local: kv_cache
|
- local: kv_cache
|
||||||
title: KV cache strategies
|
title: KV cache strategies
|
||||||
|
- local: serving
|
||||||
|
title: Serving
|
||||||
- local: llm_tutorial_optimization
|
- local: llm_tutorial_optimization
|
||||||
title: Getting the most out of LLMs
|
title: Getting the most out of LLMs
|
||||||
- local: perplexity
|
- local: perplexity
|
||||||
@ -81,42 +83,30 @@
|
|||||||
- local: conversations
|
- local: conversations
|
||||||
title: Chat basics
|
title: Chat basics
|
||||||
- local: chat_templating
|
- local: chat_templating
|
||||||
title: Chat templates
|
title: Templates
|
||||||
- local: chat_templating_multimodal
|
- local: chat_templating_multimodal
|
||||||
title: Multimodal chat templates
|
title: Multimodal templates
|
||||||
- local: chat_extras
|
|
||||||
title: Tool use
|
|
||||||
- local: chat_templating_writing
|
- local: chat_templating_writing
|
||||||
title: Writing a chat template
|
title: Template writing
|
||||||
|
- local: chat_extras
|
||||||
|
title: Tools and RAG
|
||||||
title: Chat with models
|
title: Chat with models
|
||||||
- sections:
|
|
||||||
- local: serving
|
|
||||||
title: Serving LLMs, VLMs, and other chat-based models
|
|
||||||
- local: jan
|
|
||||||
title: Jan
|
|
||||||
- local: cursor
|
|
||||||
title: Cursor
|
|
||||||
- local: tiny_agents
|
|
||||||
title: Tiny-Agents CLI and MCP tools
|
|
||||||
- local: open_webui
|
|
||||||
title: Open WebUI
|
|
||||||
title: Serving
|
|
||||||
- sections:
|
- sections:
|
||||||
- local: perf_torch_compile
|
- local: perf_torch_compile
|
||||||
title: torch.compile
|
title: torch.compile
|
||||||
- local: perf_infer_gpu_one
|
- local: perf_infer_gpu_one
|
||||||
title: GPU
|
title: GPU
|
||||||
- local: perf_infer_gpu_multi
|
- local: perf_infer_gpu_multi
|
||||||
title: Distributed inference
|
title: Distributed GPU inference
|
||||||
- local: perf_infer_cpu
|
- local: perf_infer_cpu
|
||||||
title: CPU
|
title: CPU
|
||||||
|
- local: tf_xla
|
||||||
|
title: XLA
|
||||||
title: Optimization
|
title: Optimization
|
||||||
- local: agents
|
- local: agents
|
||||||
title: Agents
|
title: Agents
|
||||||
- local: tools
|
- local: tools
|
||||||
title: Tools
|
title: Tools
|
||||||
- local: transformers_as_backend
|
|
||||||
title: Inference server backends
|
|
||||||
title: Inference
|
title: Inference
|
||||||
- isExpanded: false
|
- isExpanded: false
|
||||||
sections:
|
sections:
|
||||||
@ -151,6 +141,8 @@
|
|||||||
title: GPU
|
title: GPU
|
||||||
- local: perf_train_cpu
|
- local: perf_train_cpu
|
||||||
title: CPU
|
title: CPU
|
||||||
|
- local: perf_train_tpu_tf
|
||||||
|
title: TPU
|
||||||
- local: perf_train_special
|
- local: perf_train_special
|
||||||
title: Apple Silicon
|
title: Apple Silicon
|
||||||
- local: perf_train_gaudi
|
- local: perf_train_gaudi
|
||||||
@ -189,8 +181,6 @@
|
|||||||
title: FBGEMM
|
title: FBGEMM
|
||||||
- local: quantization/finegrained_fp8
|
- local: quantization/finegrained_fp8
|
||||||
title: Fine-grained FP8
|
title: Fine-grained FP8
|
||||||
- local: quantization/fp_quant
|
|
||||||
title: FP-Quant
|
|
||||||
- local: gguf
|
- local: gguf
|
||||||
title: GGUF
|
title: GGUF
|
||||||
- local: quantization/gptq
|
- local: quantization/gptq
|
||||||
@ -373,10 +363,6 @@
|
|||||||
- sections:
|
- sections:
|
||||||
- local: model_doc/albert
|
- local: model_doc/albert
|
||||||
title: ALBERT
|
title: ALBERT
|
||||||
- local: model_doc/apertus
|
|
||||||
title: Apertus
|
|
||||||
- local: model_doc/arcee
|
|
||||||
title: Arcee
|
|
||||||
- local: model_doc/bamba
|
- local: model_doc/bamba
|
||||||
title: Bamba
|
title: Bamba
|
||||||
- local: model_doc/bart
|
- local: model_doc/bart
|
||||||
@ -445,10 +431,6 @@
|
|||||||
title: DiffLlama
|
title: DiffLlama
|
||||||
- local: model_doc/distilbert
|
- local: model_doc/distilbert
|
||||||
title: DistilBERT
|
title: DistilBERT
|
||||||
- local: model_doc/doge
|
|
||||||
title: Doge
|
|
||||||
- local: model_doc/dots1
|
|
||||||
title: dots1
|
|
||||||
- local: model_doc/dpr
|
- local: model_doc/dpr
|
||||||
title: DPR
|
title: DPR
|
||||||
- local: model_doc/electra
|
- local: model_doc/electra
|
||||||
@ -457,16 +439,10 @@
|
|||||||
title: Encoder Decoder Models
|
title: Encoder Decoder Models
|
||||||
- local: model_doc/ernie
|
- local: model_doc/ernie
|
||||||
title: ERNIE
|
title: ERNIE
|
||||||
- local: model_doc/ernie4_5
|
|
||||||
title: Ernie4_5
|
|
||||||
- local: model_doc/ernie4_5_moe
|
|
||||||
title: Ernie4_5_MoE
|
|
||||||
- local: model_doc/ernie_m
|
- local: model_doc/ernie_m
|
||||||
title: ErnieM
|
title: ErnieM
|
||||||
- local: model_doc/esm
|
- local: model_doc/esm
|
||||||
title: ESM
|
title: ESM
|
||||||
- local: model_doc/exaone4
|
|
||||||
title: EXAONE-4.0
|
|
||||||
- local: model_doc/falcon
|
- local: model_doc/falcon
|
||||||
title: Falcon
|
title: Falcon
|
||||||
- local: model_doc/falcon3
|
- local: model_doc/falcon3
|
||||||
@ -497,8 +473,6 @@
|
|||||||
title: GLM
|
title: GLM
|
||||||
- local: model_doc/glm4
|
- local: model_doc/glm4
|
||||||
title: glm4
|
title: glm4
|
||||||
- local: model_doc/glm4_moe
|
|
||||||
title: glm4_moe
|
|
||||||
- local: model_doc/openai-gpt
|
- local: model_doc/openai-gpt
|
||||||
title: GPT
|
title: GPT
|
||||||
- local: model_doc/gpt_neo
|
- local: model_doc/gpt_neo
|
||||||
@ -513,8 +487,6 @@
|
|||||||
title: GPT2
|
title: GPT2
|
||||||
- local: model_doc/gpt_bigcode
|
- local: model_doc/gpt_bigcode
|
||||||
title: GPTBigCode
|
title: GPTBigCode
|
||||||
- local: model_doc/gpt_oss
|
|
||||||
title: GptOss
|
|
||||||
- local: model_doc/gptsan-japanese
|
- local: model_doc/gptsan-japanese
|
||||||
title: GPTSAN Japanese
|
title: GPTSAN Japanese
|
||||||
- local: model_doc/gpt-sw3
|
- local: model_doc/gpt-sw3
|
||||||
@ -533,10 +505,6 @@
|
|||||||
title: HerBERT
|
title: HerBERT
|
||||||
- local: model_doc/hgnet_v2
|
- local: model_doc/hgnet_v2
|
||||||
title: HGNet-V2
|
title: HGNet-V2
|
||||||
- local: model_doc/hunyuan_v1_dense
|
|
||||||
title: HunYuanDenseV1
|
|
||||||
- local: model_doc/hunyuan_v1_moe
|
|
||||||
title: HunYuanMoEV1
|
|
||||||
- local: model_doc/ibert
|
- local: model_doc/ibert
|
||||||
title: I-BERT
|
title: I-BERT
|
||||||
- local: model_doc/jamba
|
- local: model_doc/jamba
|
||||||
@ -547,8 +515,6 @@
|
|||||||
title: Jukebox
|
title: Jukebox
|
||||||
- local: model_doc/led
|
- local: model_doc/led
|
||||||
title: LED
|
title: LED
|
||||||
- local: model_doc/lfm2
|
|
||||||
title: LFM2
|
|
||||||
- local: model_doc/llama
|
- local: model_doc/llama
|
||||||
title: LLaMA
|
title: LLaMA
|
||||||
- local: model_doc/llama2
|
- local: model_doc/llama2
|
||||||
@ -593,8 +559,6 @@
|
|||||||
title: MobileBERT
|
title: MobileBERT
|
||||||
- local: model_doc/modernbert
|
- local: model_doc/modernbert
|
||||||
title: ModernBert
|
title: ModernBert
|
||||||
- local: model_doc/modernbert-decoder
|
|
||||||
title: ModernBERTDecoder
|
|
||||||
- local: model_doc/mpnet
|
- local: model_doc/mpnet
|
||||||
title: MPNet
|
title: MPNet
|
||||||
- local: model_doc/mpt
|
- local: model_doc/mpt
|
||||||
@ -677,8 +641,6 @@
|
|||||||
title: RoFormer
|
title: RoFormer
|
||||||
- local: model_doc/rwkv
|
- local: model_doc/rwkv
|
||||||
title: RWKV
|
title: RWKV
|
||||||
- local: model_doc/seed_oss
|
|
||||||
title: Seed-Oss
|
|
||||||
- local: model_doc/splinter
|
- local: model_doc/splinter
|
||||||
title: Splinter
|
title: Splinter
|
||||||
- local: model_doc/squeezebert
|
- local: model_doc/squeezebert
|
||||||
@ -691,8 +653,6 @@
|
|||||||
title: SwitchTransformers
|
title: SwitchTransformers
|
||||||
- local: model_doc/t5
|
- local: model_doc/t5
|
||||||
title: T5
|
title: T5
|
||||||
- local: model_doc/t5gemma
|
|
||||||
title: T5Gemma
|
|
||||||
- local: model_doc/t5v1.1
|
- local: model_doc/t5v1.1
|
||||||
title: T5v1.1
|
title: T5v1.1
|
||||||
- local: model_doc/tapex
|
- local: model_doc/tapex
|
||||||
@ -719,8 +679,6 @@
|
|||||||
title: XLM-V
|
title: XLM-V
|
||||||
- local: model_doc/xlnet
|
- local: model_doc/xlnet
|
||||||
title: XLNet
|
title: XLNet
|
||||||
- local: model_doc/xlstm
|
|
||||||
title: xLSTM
|
|
||||||
- local: model_doc/yoso
|
- local: model_doc/yoso
|
||||||
title: YOSO
|
title: YOSO
|
||||||
- local: model_doc/zamba
|
- local: model_doc/zamba
|
||||||
@ -729,8 +687,6 @@
|
|||||||
title: Zamba2
|
title: Zamba2
|
||||||
title: Text models
|
title: Text models
|
||||||
- sections:
|
- sections:
|
||||||
- local: model_doc/aimv2
|
|
||||||
title: Aimv2
|
|
||||||
- local: model_doc/beit
|
- local: model_doc/beit
|
||||||
title: BEiT
|
title: BEiT
|
||||||
- local: model_doc/bit
|
- local: model_doc/bit
|
||||||
@ -747,12 +703,6 @@
|
|||||||
title: D-FINE
|
title: D-FINE
|
||||||
- local: model_doc/dab-detr
|
- local: model_doc/dab-detr
|
||||||
title: DAB-DETR
|
title: DAB-DETR
|
||||||
- local: model_doc/deepseek_v2
|
|
||||||
title: DeepSeek-V2
|
|
||||||
- local: model_doc/deepseek_vl
|
|
||||||
title: DeepseekVL
|
|
||||||
- local: model_doc/deepseek_vl_hybrid
|
|
||||||
title: DeepseekVLHybrid
|
|
||||||
- local: model_doc/deformable_detr
|
- local: model_doc/deformable_detr
|
||||||
title: Deformable DETR
|
title: Deformable DETR
|
||||||
- local: model_doc/deit
|
- local: model_doc/deit
|
||||||
@ -773,26 +723,18 @@
|
|||||||
title: DINOV2
|
title: DINOV2
|
||||||
- local: model_doc/dinov2_with_registers
|
- local: model_doc/dinov2_with_registers
|
||||||
title: DINOv2 with Registers
|
title: DINOv2 with Registers
|
||||||
- local: model_doc/dinov3
|
|
||||||
title: DINOv3
|
|
||||||
- local: model_doc/dit
|
- local: model_doc/dit
|
||||||
title: DiT
|
title: DiT
|
||||||
- local: model_doc/dpt
|
- local: model_doc/dpt
|
||||||
title: DPT
|
title: DPT
|
||||||
- local: model_doc/efficientformer
|
- local: model_doc/efficientformer
|
||||||
title: EfficientFormer
|
title: EfficientFormer
|
||||||
- local: model_doc/efficientloftr
|
|
||||||
title: EfficientLoFTR
|
|
||||||
- local: model_doc/efficientnet
|
- local: model_doc/efficientnet
|
||||||
title: EfficientNet
|
title: EfficientNet
|
||||||
- local: model_doc/eomt
|
|
||||||
title: EoMT
|
|
||||||
- local: model_doc/focalnet
|
- local: model_doc/focalnet
|
||||||
title: FocalNet
|
title: FocalNet
|
||||||
- local: model_doc/glpn
|
- local: model_doc/glpn
|
||||||
title: GLPN
|
title: GLPN
|
||||||
- local: model_doc/hgnet_v2
|
|
||||||
title: HGNet-V2
|
|
||||||
- local: model_doc/hiera
|
- local: model_doc/hiera
|
||||||
title: Hiera
|
title: Hiera
|
||||||
- local: model_doc/ijepa
|
- local: model_doc/ijepa
|
||||||
@ -891,8 +833,6 @@
|
|||||||
title: CSM
|
title: CSM
|
||||||
- local: model_doc/dac
|
- local: model_doc/dac
|
||||||
title: dac
|
title: dac
|
||||||
- local: model_doc/dia
|
|
||||||
title: Dia
|
|
||||||
- local: model_doc/encodec
|
- local: model_doc/encodec
|
||||||
title: EnCodec
|
title: EnCodec
|
||||||
- local: model_doc/fastspeech2_conformer
|
- local: model_doc/fastspeech2_conformer
|
||||||
@ -901,8 +841,6 @@
|
|||||||
title: GraniteSpeech
|
title: GraniteSpeech
|
||||||
- local: model_doc/hubert
|
- local: model_doc/hubert
|
||||||
title: Hubert
|
title: Hubert
|
||||||
- local: model_doc/kyutai_speech_to_text
|
|
||||||
title: Kyutai Speech-To-Text
|
|
||||||
- local: model_doc/mctct
|
- local: model_doc/mctct
|
||||||
title: MCTCT
|
title: MCTCT
|
||||||
- local: model_doc/mimi
|
- local: model_doc/mimi
|
||||||
@ -953,8 +891,6 @@
|
|||||||
title: WavLM
|
title: WavLM
|
||||||
- local: model_doc/whisper
|
- local: model_doc/whisper
|
||||||
title: Whisper
|
title: Whisper
|
||||||
- local: model_doc/xcodec
|
|
||||||
title: X-Codec
|
|
||||||
- local: model_doc/xls_r
|
- local: model_doc/xls_r
|
||||||
title: XLS-R
|
title: XLS-R
|
||||||
- local: model_doc/xlsr_wav2vec2
|
- local: model_doc/xlsr_wav2vec2
|
||||||
@ -997,8 +933,6 @@
|
|||||||
title: CLIPSeg
|
title: CLIPSeg
|
||||||
- local: model_doc/clvp
|
- local: model_doc/clvp
|
||||||
title: CLVP
|
title: CLVP
|
||||||
- local: model_doc/cohere2_vision
|
|
||||||
title: Cohere2Vision
|
|
||||||
- local: model_doc/colpali
|
- local: model_doc/colpali
|
||||||
title: ColPali
|
title: ColPali
|
||||||
- local: model_doc/colqwen2
|
- local: model_doc/colqwen2
|
||||||
@ -1011,22 +945,12 @@
|
|||||||
title: Donut
|
title: Donut
|
||||||
- local: model_doc/emu3
|
- local: model_doc/emu3
|
||||||
title: Emu3
|
title: Emu3
|
||||||
- local: model_doc/evolla
|
|
||||||
title: Evolla
|
|
||||||
- local: model_doc/flava
|
- local: model_doc/flava
|
||||||
title: FLAVA
|
title: FLAVA
|
||||||
- local: model_doc/florence2
|
|
||||||
title: Florence2
|
|
||||||
- local: model_doc/gemma3
|
- local: model_doc/gemma3
|
||||||
title: Gemma3
|
title: Gemma3
|
||||||
- local: model_doc/gemma3n
|
|
||||||
title: Gemma3n
|
|
||||||
- local: model_doc/git
|
- local: model_doc/git
|
||||||
title: GIT
|
title: GIT
|
||||||
- local: model_doc/glm4v
|
|
||||||
title: glm4v
|
|
||||||
- local: model_doc/glm4v_moe
|
|
||||||
title: glm4v_moe
|
|
||||||
- local: model_doc/got_ocr2
|
- local: model_doc/got_ocr2
|
||||||
title: GOT-OCR2
|
title: GOT-OCR2
|
||||||
- local: model_doc/granitevision
|
- local: model_doc/granitevision
|
||||||
@ -1051,8 +975,6 @@
|
|||||||
title: Janus
|
title: Janus
|
||||||
- local: model_doc/kosmos-2
|
- local: model_doc/kosmos-2
|
||||||
title: KOSMOS-2
|
title: KOSMOS-2
|
||||||
- local: model_doc/kosmos2_5
|
|
||||||
title: KOSMOS-2.5
|
|
||||||
- local: model_doc/layoutlm
|
- local: model_doc/layoutlm
|
||||||
title: LayoutLM
|
title: LayoutLM
|
||||||
- local: model_doc/layoutlmv2
|
- local: model_doc/layoutlmv2
|
||||||
@ -1066,7 +988,7 @@
|
|||||||
- local: model_doc/llama4
|
- local: model_doc/llama4
|
||||||
title: Llama4
|
title: Llama4
|
||||||
- local: model_doc/llava
|
- local: model_doc/llava
|
||||||
title: LLaVA
|
title: Llava
|
||||||
- local: model_doc/llava_next
|
- local: model_doc/llava_next
|
||||||
title: LLaVA-NeXT
|
title: LLaVA-NeXT
|
||||||
- local: model_doc/llava_next_video
|
- local: model_doc/llava_next_video
|
||||||
@ -1077,24 +999,18 @@
|
|||||||
title: LXMERT
|
title: LXMERT
|
||||||
- local: model_doc/matcha
|
- local: model_doc/matcha
|
||||||
title: MatCha
|
title: MatCha
|
||||||
- local: model_doc/metaclip_2
|
|
||||||
title: MetaCLIP 2
|
|
||||||
- local: model_doc/mgp-str
|
- local: model_doc/mgp-str
|
||||||
title: MGP-STR
|
title: MGP-STR
|
||||||
- local: model_doc/mistral3
|
- local: model_doc/mistral3
|
||||||
title: Mistral3
|
title: Mistral3
|
||||||
- local: model_doc/mllama
|
- local: model_doc/mllama
|
||||||
title: mllama
|
title: mllama
|
||||||
- local: model_doc/mm-grounding-dino
|
|
||||||
title: MM Grounding DINO
|
|
||||||
- local: model_doc/nougat
|
- local: model_doc/nougat
|
||||||
title: Nougat
|
title: Nougat
|
||||||
- local: model_doc/omdet-turbo
|
- local: model_doc/omdet-turbo
|
||||||
title: OmDet-Turbo
|
title: OmDet-Turbo
|
||||||
- local: model_doc/oneformer
|
- local: model_doc/oneformer
|
||||||
title: OneFormer
|
title: OneFormer
|
||||||
- local: model_doc/ovis2
|
|
||||||
title: Ovis2
|
|
||||||
- local: model_doc/owlvit
|
- local: model_doc/owlvit
|
||||||
title: OWL-ViT
|
title: OWL-ViT
|
||||||
- local: model_doc/owlv2
|
- local: model_doc/owlv2
|
||||||
@ -1103,8 +1019,6 @@
|
|||||||
title: PaliGemma
|
title: PaliGemma
|
||||||
- local: model_doc/perceiver
|
- local: model_doc/perceiver
|
||||||
title: Perceiver
|
title: Perceiver
|
||||||
- local: model_doc/perception_lm
|
|
||||||
title: PerceptionLM
|
|
||||||
- local: model_doc/phi4_multimodal
|
- local: model_doc/phi4_multimodal
|
||||||
title: Phi4 Multimodal
|
title: Phi4 Multimodal
|
||||||
- local: model_doc/pix2struct
|
- local: model_doc/pix2struct
|
||||||
@ -1119,10 +1033,6 @@
|
|||||||
title: Qwen2Audio
|
title: Qwen2Audio
|
||||||
- local: model_doc/qwen2_vl
|
- local: model_doc/qwen2_vl
|
||||||
title: Qwen2VL
|
title: Qwen2VL
|
||||||
- local: model_doc/sam2
|
|
||||||
title: SAM2
|
|
||||||
- local: model_doc/sam2_video
|
|
||||||
title: SAM2 Video
|
|
||||||
- local: model_doc/sam
|
- local: model_doc/sam
|
||||||
title: Segment Anything
|
title: Segment Anything
|
||||||
- local: model_doc/sam_hq
|
- local: model_doc/sam_hq
|
||||||
@ -1133,8 +1043,6 @@
|
|||||||
title: SigLIP
|
title: SigLIP
|
||||||
- local: model_doc/siglip2
|
- local: model_doc/siglip2
|
||||||
title: SigLIP2
|
title: SigLIP2
|
||||||
- local: model_doc/smollm3
|
|
||||||
title: SmolLM3
|
|
||||||
- local: model_doc/smolvlm
|
- local: model_doc/smolvlm
|
||||||
title: SmolVLM
|
title: SmolVLM
|
||||||
- local: model_doc/speech-encoder-decoder
|
- local: model_doc/speech-encoder-decoder
|
||||||
@ -1161,8 +1069,6 @@
|
|||||||
title: Vision Text Dual Encoder
|
title: Vision Text Dual Encoder
|
||||||
- local: model_doc/visual_bert
|
- local: model_doc/visual_bert
|
||||||
title: VisualBERT
|
title: VisualBERT
|
||||||
- local: model_doc/voxtral
|
|
||||||
title: Voxtral
|
|
||||||
- local: model_doc/xclip
|
- local: model_doc/xclip
|
||||||
title: X-CLIP
|
title: X-CLIP
|
||||||
title: Multimodal models
|
title: Multimodal models
|
||||||
@ -1220,3 +1126,4 @@
|
|||||||
title: Environment Variables
|
title: Environment Variables
|
||||||
title: Reference
|
title: Reference
|
||||||
title: API
|
title: API
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@ rendered properly in your Markdown viewer.
|
|||||||
|
|
||||||
-->
|
-->
|
||||||
|
|
||||||
# Legacy model contribution
|
# Adding a new model to Transformers
|
||||||
|
|
||||||
> [!TIP]
|
> [!TIP]
|
||||||
> Try adding new models with a more [modular](./modular_transformers) approach first. This makes it significantly easier to contribute a model to Transformers!
|
> Try adding new models with a more [modular](./modular_transformers) approach first. This makes it significantly easier to contribute a model to Transformers!
|
||||||
|
@ -100,18 +100,19 @@ pipeline("This is the best meal I've ever had")
|
|||||||
|
|
||||||
Register the new task your pipeline supports in the `PIPELINE_REGISTRY`. The registry defines:
|
Register the new task your pipeline supports in the `PIPELINE_REGISTRY`. The registry defines:
|
||||||
|
|
||||||
- The supported PyTorch model class with `pt_model`
|
- the machine learning framework the pipeline supports with either `pt_model` or `tf_model` (add both to ensure it works with either framework)
|
||||||
- a default model which should come from a specific revision (branch, or commit hash) where the model works as expected with `default`
|
- a default model which should come from a specific revision (branch, or commit hash) where the model works as expected with `default`
|
||||||
- the expected input with `type`
|
- the expected input with `type`
|
||||||
|
|
||||||
```py
|
```py
|
||||||
from transformers.pipelines import PIPELINE_REGISTRY
|
from transformers.pipelines import PIPELINE_REGISTRY
|
||||||
from transformers import AutoModelForSequenceClassification
|
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
|
||||||
|
|
||||||
PIPELINE_REGISTRY.register_pipeline(
|
PIPELINE_REGISTRY.register_pipeline(
|
||||||
"new-task",
|
"new-task",
|
||||||
pipeline_class=MyPipeline,
|
pipeline_class=MyPipeline,
|
||||||
pt_model=AutoModelForSequenceClassification,
|
pt_model=AutoModelForSequenceClassification,
|
||||||
|
tf_model=TFAutoModelForSequenceClassification,
|
||||||
default={"pt": ("user/awesome-model", "branch-name")},
|
default={"pt": ("user/awesome-model", "branch-name")},
|
||||||
type="text",
|
type="text",
|
||||||
)
|
)
|
||||||
@ -127,7 +128,7 @@ It's faster to upload your pipeline code to the Hub because it doesn't require a
|
|||||||
|
|
||||||
Add your pipeline code to the Hub in a Python file.
|
Add your pipeline code to the Hub in a Python file.
|
||||||
|
|
||||||
For example, a custom pipeline for sentence pair classification might look like the code below.
|
For example, a custom pipeline for sentence pair classification might look like the code below. The implementation works for PyTorch and TensorFlow models.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -167,12 +168,13 @@ Save the code in a file named `pair_classification.py`, and import and register
|
|||||||
```py
|
```py
|
||||||
from pair_classification import PairClassificationPipeline
|
from pair_classification import PairClassificationPipeline
|
||||||
from transformers.pipelines import PIPELINE_REGISTRY
|
from transformers.pipelines import PIPELINE_REGISTRY
|
||||||
from transformers import AutoModelForSequenceClassification
|
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
|
||||||
|
|
||||||
PIPELINE_REGISTRY.register_pipeline(
|
PIPELINE_REGISTRY.register_pipeline(
|
||||||
"pair-classification",
|
"pair-classification",
|
||||||
pipeline_class=PairClassificationPipeline,
|
pipeline_class=PairClassificationPipeline,
|
||||||
pt_model=AutoModelForSequenceClassification,
|
pt_model=AutoModelForSequenceClassification,
|
||||||
|
tf_model=TFAutoModelForSequenceClassification,
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -185,6 +187,9 @@ The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5f
|
|||||||
"pt": [
|
"pt": [
|
||||||
"AutoModelForSequenceClassification"
|
"AutoModelForSequenceClassification"
|
||||||
],
|
],
|
||||||
|
"tf": [
|
||||||
|
"TFAutoModelForSequenceClassification"
|
||||||
|
],
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
```
|
```
|
||||||
@ -214,11 +219,11 @@ Add your pipeline code as a new module to the [pipelines](https://github.com/hug
|
|||||||
|
|
||||||
Next, add a new test for the pipeline in [transformers/tests/pipelines](https://github.com/huggingface/transformers/tree/main/tests/pipelines). You can look at the other tests for examples of how to test your pipeline.
|
Next, add a new test for the pipeline in [transformers/tests/pipelines](https://github.com/huggingface/transformers/tree/main/tests/pipelines). You can look at the other tests for examples of how to test your pipeline.
|
||||||
|
|
||||||
The [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function should be very generic and run on the models defined in [model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L48). This is important for testing future compatibility with new models.
|
The [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function should be very generic and run on the models defined in [model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L48) and [tf_model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L49). This is important for testing future compatibility with new models.
|
||||||
|
|
||||||
You'll also notice `ANY` is used throughout the [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function. The models are random, so you can't check the actual values. Using `ANY` allows the test to match the output of the pipeline type instead.
|
You'll also notice `ANY` is used throughout the [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function. The models are random, so you can't check the actual values. Using `ANY` allows the test to match the output of the pipeline type instead.
|
||||||
|
|
||||||
Finally, you should also implement the following 4 tests.
|
Finally, you should also implement the following 4 tests.
|
||||||
|
|
||||||
1. [test_small_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L59), use a small model for these pipelines to make sure they return the correct outputs. The results don't have to make sense. Each pipeline should return the same result.
|
1. [test_small_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L59) and [test_small_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L150), use a small model for these pipelines to make sure they return the correct outputs. The results don't have to make sense. Each pipeline should return the same result.
|
||||||
1. [test_large_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L187), use a realistic model for these pipelines to make sure they return meaningful results. These tests are slow and should be marked as slow.
|
1. [test_large_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L187) and [test_large_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L220), use a realistic model for these pipelines to make sure they return meaningful results. These tests are slow and should be marked as slow.
|
||||||
|
@ -14,9 +14,5 @@ rendered properly in your Markdown viewer.
|
|||||||
|
|
||||||
-->
|
-->
|
||||||
|
|
||||||
# Agents
|
|
||||||
|
|
||||||
(deprecated)
|
|
||||||
|
|
||||||
> [!WARNING]
|
> [!WARNING]
|
||||||
> Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
|
> Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
|
||||||
|
@ -60,11 +60,11 @@ You will see it prints "I just entered the attention computation" as many times
|
|||||||
|
|
||||||
## Dynamically switching attention function
|
## Dynamically switching attention function
|
||||||
|
|
||||||
You could dynamically change the model's attention function as well:
|
You could dynamically change the model's attention function as well, by overriding the `config._attn_implementation` field:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Back to use original sdpa implementation
|
# Back to use original sdpa implementation
|
||||||
model.set_attn_implementation("sdpa")
|
model.config._attn_implementation = "sdpa"
|
||||||
|
|
||||||
model(torch.ones(1, 5, dtype=int))
|
model(torch.ones(1, 5, dtype=int))
|
||||||
```
|
```
|
||||||
@ -72,34 +72,6 @@ model(torch.ones(1, 5, dtype=int))
|
|||||||
and it will stop printing the statements, as it now uses the `sdpa` attention.
|
and it will stop printing the statements, as it now uses the `sdpa` attention.
|
||||||
This allows you to quickly change an attention function without needing to reload the model!
|
This allows you to quickly change an attention function without needing to reload the model!
|
||||||
|
|
||||||
## Different attention per backbone in multimodal models
|
|
||||||
|
|
||||||
For multimodal models, different attention functions may work better for each backbone module. For example, some vision backbones perform better in fp32, but are incompatible with FlashAttention. To continue using FlashAttention while keeping the vision encoder in fp32, create a dict and map each config to an attention implementation as shown below.
|
|
||||||
|
|
||||||
```python
|
|
||||||
from transformers import AutoModelForImageTextToText
|
|
||||||
|
|
||||||
model_id = "facebook/chameleon-7b"
|
|
||||||
|
|
||||||
attention_implementation_per_backbone = {"vision_config": "sdpa", "text_config": "flash_attention_2"}
|
|
||||||
model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation=attention_implementation_per_backbone)
|
|
||||||
|
|
||||||
# NOTE: keys in the attention implementation have to be the same as the sub-config names
|
|
||||||
for key in attention_implementation_per_backbone:
|
|
||||||
assert key in model.config.sub_configs, f"Invalid key in `attention_implementation`"
|
|
||||||
|
|
||||||
# You can omit certain backbones - the default attention function (SDPA) will be used
|
|
||||||
# This is equivalent to the previous example
|
|
||||||
model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation={"text_config": "flash_attention_2"})
|
|
||||||
|
|
||||||
|
|
||||||
# Set the same attention implementation for all backbones with single string, same as in non-multimodal models
|
|
||||||
model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager")
|
|
||||||
|
|
||||||
# Alternatively use a dict with an empty key for global configuration
|
|
||||||
model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation={"": "eager"})
|
|
||||||
```
|
|
||||||
|
|
||||||
## What about new args needed in my custom attention function?
|
## What about new args needed in my custom attention function?
|
||||||
|
|
||||||
But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
|
But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
|
||||||
|
@ -14,26 +14,43 @@ rendered properly in your Markdown viewer.
|
|||||||
|
|
||||||
-->
|
-->
|
||||||
|
|
||||||
# Documenting a model
|
# Utilizing the @auto_docstring Decorator
|
||||||
|
|
||||||
The `@auto_docstring` decorator in Transformers generates consistent docstrings for model classes and their methods. It reduces boilerplate by automatically including standard argument descriptions while also allowing overrides to add new or custom arguments. [Contributing a new model](./modular_transformers) is easier because you don't need to manually add the standard docstrings and can focus only on documenting new arguments.
|
The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.
|
||||||
|
|
||||||
This guide describes how to use the `@auto_docstring` decorator and how it works.
|
---
|
||||||
|
|
||||||
## @auto_docstring
|
## 📜 How it Works
|
||||||
|
|
||||||
Start by importing the decorator in the modeling file (`modular_model.py` or `modeling_model.py`).
|
The `@auto_docstring` decorator constructs docstrings by:
|
||||||
|
|
||||||
|
1. **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
|
||||||
|
2. **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
|
||||||
|
3. **Overriding or Adding Arguments Descriptions:**
|
||||||
|
* **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
|
||||||
|
* **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
|
||||||
|
4. **Adding Classes and Functions Introduction:**
|
||||||
|
* **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
|
||||||
|
* **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
|
||||||
|
5. **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`.
|
||||||
|
6. **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
|
||||||
|
7. **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extract field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
|
||||||
|
8. **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
|
||||||
|
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🚀 How to Use @auto_docstring
|
||||||
|
|
||||||
|
### 1. Importing the Decorator
|
||||||
|
Import the decorator into your modeling file:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from ...utils import auto_docstring
|
from ...utils import auto_docstring
|
||||||
```
|
```
|
||||||
|
|
||||||
Select whether you'd like to apply `@auto_docstring` to a class or function below to see how to use it.
|
### 2. Applying to Classes
|
||||||
|
Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.
|
||||||
<hfoptions id="type">
|
|
||||||
<hfoption id="classes">
|
|
||||||
|
|
||||||
Place `@auto_docstring` directly above the class definition. The decorator derives parameter descriptions from the `__init__` method's signature and docstring.
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from transformers.modeling_utils import PreTrainedModel
|
from transformers.modeling_utils import PreTrainedModel
|
||||||
@ -56,7 +73,9 @@ class MyAwesomeModel(PreTrainedModel):
|
|||||||
# ... other methods
|
# ... other methods
|
||||||
```
|
```
|
||||||
|
|
||||||
Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to add a custom introduction and the `custom_args` parameter to describe custom arguments.
|
#### Advanced Class Decoration:
|
||||||
|
|
||||||
|
Arguments can be passed directly to `@auto_docstring` for more control:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@auto_docstring(
|
@auto_docstring(
|
||||||
@ -64,9 +83,9 @@ Arguments can also be passed directly to `@auto_docstring` for more control. Use
|
|||||||
It builds upon the standard Transformer architecture with unique modifications.""",
|
It builds upon the standard Transformer architecture with unique modifications.""",
|
||||||
custom_args="""
|
custom_args="""
|
||||||
custom_parameter (`type`, *optional*, defaults to `default_value`):
|
custom_parameter (`type`, *optional*, defaults to `default_value`):
|
||||||
A concise description for custom_parameter if not defined or overriding the description in `auto_docstring.py`.
|
A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
|
||||||
internal_helper_arg (`type`, *optional*, defaults to `default_value`):
|
internal_helper_arg (`type`, *optional*, defaults to `default_value`):
|
||||||
A concise description for internal_helper_arg if not defined or overriding the description in `auto_docstring.py`.
|
A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
class MySpecialModel(PreTrainedModel):
|
class MySpecialModel(PreTrainedModel):
|
||||||
@ -74,7 +93,7 @@ class MySpecialModel(PreTrainedModel):
|
|||||||
# ...
|
# ...
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also choose to only use `custom_intro` and define the custom arguments directly in the class.
|
Or:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@auto_docstring(
|
@auto_docstring(
|
||||||
@ -85,44 +104,15 @@ class MySpecialModel(PreTrainedModel):
|
|||||||
def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
|
def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
|
||||||
r"""
|
r"""
|
||||||
custom_parameter (`type`, *optional*, defaults to `default_value`):
|
custom_parameter (`type`, *optional*, defaults to `default_value`):
|
||||||
A concise description for custom_parameter if not defined or overriding the description in `auto_docstring.py`.
|
A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
|
||||||
internal_helper_arg (`type`, *optional*, defaults to `default_value`):
|
internal_helper_arg (`type`, *optional*, defaults to `default_value`):
|
||||||
A concise description for internal_helper_arg if not defined or overriding the description in `auto_docstring.py`.
|
A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
|
||||||
"""
|
"""
|
||||||
# ...
|
# ...
|
||||||
```
|
```
|
||||||
|
|
||||||
You should also use the `@auto_docstring` decorator for classes that inherit from [`~utils.ModelOutput`].
|
### 3. Applying to Functions (e.g., `forward` method)
|
||||||
|
Apply the decorator above method definitions, such as the `forward` method.
|
||||||
```python
|
|
||||||
@dataclass
|
|
||||||
@auto_docstring(
|
|
||||||
custom_intro="""
|
|
||||||
Custom model outputs with additional fields.
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
class MyModelOutput(ImageClassifierOutput):
|
|
||||||
r"""
|
|
||||||
loss (`torch.FloatTensor`, *optional*):
|
|
||||||
The loss of the model.
|
|
||||||
custom_field (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
|
|
||||||
A custom output field specific to this model.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Standard fields like hidden_states, logits, attentions etc. can be automatically documented if the description is the same as the standard arguments.
|
|
||||||
# However, given that the loss docstring is often different per model, you should document it in the docstring above.
|
|
||||||
loss: Optional[torch.FloatTensor] = None
|
|
||||||
logits: Optional[torch.FloatTensor] = None
|
|
||||||
hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
|
|
||||||
attentions: Optional[tuple[torch.FloatTensor, ...]] = None
|
|
||||||
# Custom fields need to be documented in the docstring above
|
|
||||||
custom_field: Optional[torch.FloatTensor] = None
|
|
||||||
```
|
|
||||||
|
|
||||||
</hfoption>
|
|
||||||
<hfoption id="functions">
|
|
||||||
|
|
||||||
Place `@auto_docstring` directly above the method definition. The decorator derives parameter descriptions from the function signature.
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@auto_docstring
|
@auto_docstring
|
||||||
@ -141,10 +131,9 @@ Place `@auto_docstring` directly above the method definition. The decorator deri
|
|||||||
# ...
|
# ...
|
||||||
```
|
```
|
||||||
|
|
||||||
Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to add a custom introduction and the `custom_args` parameter to describe custom arguments.
|
#### Advanced Function Decoration:
|
||||||
|
|
||||||
The `Returns` and `Examples` parts of the docstring can also be manually specified.
|
|
||||||
|
|
||||||
|
Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
MODEL_COMMON_CUSTOM_ARGS = r"""
|
MODEL_COMMON_CUSTOM_ARGS = r"""
|
||||||
@ -191,117 +180,100 @@ class MyModel(PreTrainedModel):
|
|||||||
# ...
|
# ...
|
||||||
```
|
```
|
||||||
|
|
||||||
</hfoption>
|
---
|
||||||
</hfoptions>
|
|
||||||
|
|
||||||
## Documenting arguments
|
### ✍️ Documenting Arguments: Approach & Priority
|
||||||
|
|
||||||
There are some rules for documenting different types of arguments and they're listed below.
|
1. **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
|
||||||
|
* `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.
|
||||||
- Standard arguments (`input_ids`, `attention_mask`, `pixel_values`, etc.) are defined and retrieved from `auto_docstring.py`. It is the single source of truth for standard arguments and should not be redefined locally if an argument's description and shape are the same as an argument in `auto_docstring.py`.
|
|
||||||
|
|
||||||
If a standard argument behaves differently in your model, then you can override it locally in a `r""" """` block. This local definition has a higher priority. For example, the `labels` argument is often customized per model and typically requires overriding.
|
|
||||||
|
|
||||||
|
|
||||||
- New or custom arguments should be documented within an `r""" """` block after the signature if it is a function or in the `__init__` method's docstring if it is a class.
|
|
||||||
|
|
||||||
```py
|
|
||||||
argument_name (`type`, *optional*, defaults to `X`):
|
|
||||||
Description of the argument.
|
|
||||||
Explain its purpose, expected shape/type if complex, and default behavior.
|
|
||||||
This can span multiple lines.
|
|
||||||
```
|
|
||||||
|
|
||||||
|
2. **New or Custom Arguments:**
|
||||||
|
* **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
|
||||||
|
* **Format:**
|
||||||
|
```
|
||||||
|
argument_name (`type`, *optional*, defaults to `X`):
|
||||||
|
Description of the argument.
|
||||||
|
Explain its purpose, expected shape/type if complex, and default behavior.
|
||||||
|
This can span multiple lines.
|
||||||
|
```
|
||||||
* Include `type` in backticks.
|
* Include `type` in backticks.
|
||||||
* Add *optional* if the argument is not required or has a default value.
|
* Add "*optional*" if the argument is not required (has a default value).
|
||||||
* Add "defaults to X" if it has a default value. You don't need to add "defaults to `None`" if the default value is `None`.
|
* Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).
|
||||||
|
|
||||||
These arguments can also be passed to `@auto_docstring` as a `custom_args` argument. It is used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
|
3. **Overriding Standard Arguments:**
|
||||||
|
* If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
|
||||||
|
* The `labels` argument is often customized per model and typically requires a specific docstring.
|
||||||
|
|
||||||
```py
|
4. **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
|
||||||
class MyModel(PreTrainedModel):
|
* New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
|
||||||
# ...
|
|
||||||
@auto_docstring(
|
|
||||||
custom_intro="""
|
|
||||||
This is a custom introduction for the function.
|
|
||||||
"""
|
|
||||||
custom_args=r"""
|
|
||||||
common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
|
|
||||||
Description of common_arg_1
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Checking the docstrings
|
---
|
||||||
|
|
||||||
Transformers includes a utility script to validate the docstrings when you open a Pull Request which triggers CI (continuous integration) checks. The script checks for the following criteria.
|
### Usage with [modular files](./modular_transformers)
|
||||||
|
|
||||||
* Ensures `@auto_docstring` is applied to relevant model classes and public methods.
|
When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:
|
||||||
* Ensures arguments are complete and consistent. It checks that documented arguments exist in the signature and verifies whether the types and default values in the docstring match the signature. Arguments that aren't known standard arguments or that lack a local description are flagged.
|
|
||||||
* Reminds you to complete placeholders like `<fill_type>` and `<fill_docstring>`.
|
|
||||||
* Ensures docstrings are formatted according to the expected docstring style.
|
|
||||||
|
|
||||||
You can run this check locally - before committing - by running the following command.
|
- **For standalone models in modular files:**
|
||||||
|
Apply the `@auto_docstring` decorator just as you would in regular modeling files.
|
||||||
|
|
||||||
|
- **For models inheriting from other library models:**
|
||||||
|
- When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
|
||||||
|
- If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
|
||||||
|
|
||||||
|
> **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
|
||||||
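To make this concrete, here is a hypothetical modular-file sketch; `MyNewModelForCausalLM` and `my_other_decorator` are placeholders standing in for your model and for whatever decorators the parent method actually carries:

```py
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from transformers.utils import auto_docstring


def my_other_decorator(fn):
    # Placeholder for a real decorator that the parent `forward` already has.
    return fn


class MyNewModelForCausalLM(LlamaForCausalLM):
    # Re-apply every decorator from the parent definition alongside the customized
    # @auto_docstring, otherwise they are dropped from the generated modeling file.
    @my_other_decorator
    @auto_docstring(custom_intro="Customized introduction for this model's forward pass.")
    def forward(self, *args, **kwargs):
        return super().forward(*args, **kwargs)
```
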
|
|
||||||
|
|
||||||
|
**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ✅ Checking Your Docstrings with `check_auto_docstrings`
|
||||||
|
|
||||||
|
The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
|
||||||
|
|
||||||
|
#### What it Checks:
|
||||||
|
|
||||||
|
* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
|
||||||
|
* **Argument Completeness & Consistency:**
|
||||||
|
* Flags arguments in the signature that are not known standard arguments and lack a local description.
|
||||||
|
* Ensures documented arguments exist in the signature. (TODO)
|
||||||
|
* Verifies that types and default values in the docstring match the signature. (TODO)
|
||||||
|
* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
|
||||||
|
* **Formatting:** Adherence to the expected docstring style.
|
||||||
|
|
||||||
|
#### Running the Check Locally:
|
||||||
|
|
||||||
|
Run this check locally before committing. The common command is:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
make fix-copies
|
make fix-copies
|
||||||
```
|
```
|
||||||
|
|
||||||
`make fix-copies` runs several other checks as well. If you don't need those checks, run the command below to only perform docstring and auto-docstring checks.
|
Alternatively, to only perform docstrings and auto-docstring checks, you can use:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python utils/check_docstrings.py # to only check files included in the diff without fixing them
|
python utils/check_docstrings.py # to only check files included in the diff without fixing them
|
||||||
# python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
|
# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
|
||||||
# python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
|
# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
|
||||||
```
|
```
|
||||||
|
|
||||||
## modular_model.py files
|
#### Workflow with the Checker:
|
||||||
|
|
||||||
When working with modular files (`modular_model.py`), follow the guidelines below for applying `@auto_docstring`.
|
1. Add `@auto_docstring(...)` to the class or method.
|
||||||
|
2. For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
|
||||||
|
3. Run `make fix-copies` (or the `check_docstrings.py` utility).
|
||||||
|
* For unrecognized arguments lacking documentation, the utility will create placeholder entries.
|
||||||
|
4. Manually edit these placeholders with accurate types and descriptions.
|
||||||
|
5. Re-run the check to ensure all issues are resolved.
|
||||||
|
|
||||||
- For standalone models in modular files, apply `@auto_docstring` like you would in a `modeling_model.py` file.
|
---
|
||||||
- For models that inherit from other library models, `@auto_docstring` is automatically carried over to the generated modeling file. You don't need to add `@auto_docstring` in your modular file.
|
|
||||||
|
|
||||||
If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file. Make sure to **include all other decorators** that are present in the original function or class.
|
## 🔑 Key Takeaways & Best Practices
|
||||||
|
|
||||||
> [!WARNING]
|
* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary methods (e.g., `forward`, `get_text_features`, etc.).
|
||||||
> When overriding any decorator in a modular file, you must include **all** decorators that were applied to that function or class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
|
* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
|
||||||
|
* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
|
||||||
## How it works
|
|
||||||
|
|
||||||
The `@auto_docstring` decorator automatically generates docstrings by:
|
|
||||||
|
|
||||||
1. Inspecting the signature (arguments, types, defaults) of the decorated class' `__init__` method or the decorated function.
|
|
||||||
2. Retrieving the predefined docstrings for common arguments (`input_ids`, `attention_mask`, etc.) from internal library sources like [`ModelArgs`], [`ImageProcessorArgs`], and the `auto_docstring.py` file.
|
|
||||||
3. Adding argument descriptions in one of two ways as shown below.
|
|
||||||
|
|
||||||
| method | description | usage |
|
|
||||||
|---|---|---|
|
|
||||||
| `r""" """` | add custom docstring content directly to a method signature or within the `__init__` docstring | document new arguments or override standard descriptions |
|
|
||||||
| `custom_args` | add custom docstrings for specific arguments directly in `@auto_docstring` | define docstring for new arguments once if they're repeated in multiple places in the modeling file |
|
|
||||||
|
|
||||||
4. Adding class and function descriptions. For model classes with standard naming patterns, like `ModelForCausalLM`, or classes that belong to a pipeline, `@auto_docstring` automatically generates the appropriate descriptions with `ClassDocstring` from `auto_docstring.py`.
|
|
||||||
|
|
||||||
`@auto_docstring` also accepts the `custom_intro` argument to describe a class or function.
|
|
||||||
|
|
||||||
5. Using a templating system to allow predefined docstrings to include dynamic information from Transformers' [auto_modules](https://github.com/huggingface/transformers/tree/main/src/transformers/models/auto) such as `{{processor_class}}` and `{{config_class}}`.
|
|
||||||
|
|
||||||
6. Finding appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
|
|
||||||
|
|
||||||
7. Adding return values to the docstring. For methods like `forward`, the decorator automatically generates the `Returns` field in the docstring based on the method's return type annotation.
|
|
||||||
|
|
||||||
For example, if a method returns a [`~transformers.utils.ModelOutput`] subclass, `@auto_docstring` extracts the field descriptions from the class' docstring to create a comprehensive return value description. You can also manually specify a custom `Returns` field in a function's docstring.
|
|
||||||
|
|
||||||
8. Unrolling kwargs typed with the unpack operator. For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the `TypedDict` and adds each parameter to the function's docstring.
|
|
||||||
|
|
||||||
Currently only supported for [`FastImageProcessorKwargs`].
|
|
||||||
|
|
||||||
## Best practices
|
|
||||||
|
|
||||||
Follow the best practices below to help maintain consistent and informative documentation for Transformers!
|
|
||||||
|
|
||||||
* Use `@auto_docstring` for new PyTorch model classes ([`PreTrainedModel`] subclasses) and their primary methods like `forward` or `get_text_features`.
|
|
||||||
* For classes, `@auto_docstring` retrieves parameter descriptions from the `__init__` method's docstring.
|
|
||||||
* Rely on standard docstrings and do not redefine common arguments unless their behavior is different in your model.
|
|
||||||
* Document new or custom arguments clearly.
|
* Document new or custom arguments clearly.
|
||||||
* Run `check_docstrings` locally and iteratively.
|
* Run `check_docstrings` locally and iteratively.
|
||||||
|
|
||||||
|
By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.
|
||||||
|
@ -15,7 +15,6 @@ rendered properly in your Markdown viewer.
|
|||||||
-->
|
-->
|
||||||
|
|
||||||
# Caching
|
# Caching
|
||||||
|
|
||||||
Imagine you're having a conversation with someone, and instead of remembering what they previously said, they have to start from scratch every time you respond. This would be slow and inefficient, right?
|
Imagine you're having a conversation with someone, and instead of remembering what they previously said, they have to start from scratch every time you respond. This would be slow and inefficient, right?
|
||||||
|
|
||||||
You can extend this analogy to transformer models. Autoregressive model generation can be slow because it makes a prediction one token at a time. Each new prediction is dependent on all the previous context.
|
You can extend this analogy to transformer models. Autoregressive model generation can be slow because it makes a prediction one token at a time. Each new prediction is dependent on all the previous context.
|
||||||
@ -83,37 +82,41 @@ When you use Transformers' [`Cache`] class, the self-attention module performs s
|
|||||||
|
|
||||||
## Cache storage implementation
|
## Cache storage implementation
|
||||||
|
|
||||||
Caches are structured as a list of layers, where each layer contains a key and value cache. The key and value caches are tensors with the shape `[batch_size, num_heads, seq_len, head_dim]`.
|
The actual storage of key-value pairs varies between cache implementations. As an example, consider the [`DynamicCache`].
|
||||||
|
|
||||||
Layers can be of different types (e.g. `DynamicLayer`, `StaticLayer`, `SlidingWindowLayer`), which mostly changes how sequence length is handled and how the cache is updated.
|
|
||||||
|
|
||||||
The simplest is a `DynamicLayer` that grows as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token:
|
In [`DynamicCache`], the key-value pairs are stored as two lists of tensors. Each tensor in the lists has the shape `[batch_size, num_heads, seq_len, head_dim]`.
|
||||||
|
- `key_cache`: A list of tensors, one for each layer.
|
||||||
|
- `value_cache`: A list of tensors, one for each layer.
|
||||||
|
|
||||||
|
When new tokens are processed:
|
||||||
|
|
||||||
|
1. For each layer, the new key and value states are concatenated with the existing cache.
|
||||||
```py
|
```py
|
||||||
cache.layers[idx].keys = torch.cat([cache.layers[idx].keys, key_states], dim=-2)
|
self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
|
||||||
cache.layers[idx].values = torch.cat([cache.layers[idx].values, value_states], dim=-2)
|
self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
|
||||||
```
|
```
|
||||||
|
|
||||||
Other layer types like `StaticLayer` and `SlidingWindowLayer` have a fixed sequence length that is set when the cache is created. This makes them compatible with `torch.compile`. In the case of `SlidingWindowLayer`, existing tokens are shifted out of the cache when a new token is added.
|
2. The cache grows dynamically as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token.
|
||||||
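For instance, you can opt into a fixed-length cache through `generate` with `cache_implementation="static"`. A rough sketch is shown below (it assumes you have access to the gated Llama 2 checkpoint and enough memory to load it):

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)

# "static" pre-allocates fixed-shape key/value tensors for every layer, which is what
# makes the cache compatible with torch.compile
outputs = model.generate(**inputs, max_new_tokens=20, cache_implementation="static")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
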
|
|
||||||
|
3. The cache maintains a count of seen tokens through `self._seen_tokens`. This is updated when the first layer processes a new token.
|
||||||
|
|
||||||
The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.
|
The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
import torch
|
import torch
|
||||||
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache, infer_device
|
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
|
||||||
|
|
||||||
device = f"{infer_device()}:0"
|
|
||||||
|
|
||||||
model_id = "meta-llama/Llama-2-7b-chat-hf"
|
model_id = "meta-llama/Llama-2-7b-chat-hf"
|
||||||
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map=device)
|
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||||
|
|
||||||
past_key_values = DynamicCache(config=model.config)
|
past_key_values = DynamicCache()
|
||||||
messages = [{"role": "user", "content": "Hello, what's your name."}]
|
messages = [{"role": "user", "content": "Hello, what's your name."}]
|
||||||
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
|
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0")
|
||||||
|
|
||||||
generated_ids = inputs.input_ids
|
generated_ids = inputs.input_ids
|
||||||
cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device=model.device)
|
cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device="cuda:0")
|
||||||
max_new_tokens = 10
|
max_new_tokens = 10
|
||||||
|
|
||||||
for _ in range(max_new_tokens):
|
for _ in range(max_new_tokens):
|
||||||
@ -131,36 +134,6 @@ for _ in range(max_new_tokens):
|
|||||||
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
|
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
|
||||||
"[INST] Hello, what's your name. [/INST] Hello! My name is LLaMA,"
|
"[INST] Hello, what's your name. [/INST] Hello! My name is LLaMA,"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Cache position
|
|
||||||
|
|
||||||
The cache position tracks where to insert new tokens in the attention cache. It represents the *absolute* position of each token in the context, independent of padding or batch structure. Suppose you already cached `N` tokens and are now processing `K` new tokens. The cache position for the new tokens will range from `N` to `N + K - 1`. In other words, you're processing tokens at positions `[N, N + 1, N + 2, ..., N + K - 1]`.
|
|
||||||
|
|
||||||
Cache position is used internally for two purposes:
|
|
||||||
|
|
||||||
1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven’t been cached yet are passed to the model's `forward`.
|
|
||||||
2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches that pre-allocate a specific cache length.
|
|
||||||
|
|
||||||
The generation loop usually takes care of the cache position, but if you're writing a custom generation method, it is important that cache positions are accurate since they are used to write and read key/value states into fixed slots.
|
|
||||||
|
|
||||||
|
|
||||||
```py
|
|
||||||
import torch
|
|
||||||
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache, infer_device
|
|
||||||
|
|
||||||
device = f"{infer_device()}:0"
|
|
||||||
|
|
||||||
model_id = "meta-llama/Llama-2-7b-chat-hf"
|
|
||||||
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map=device)
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
|
||||||
|
|
||||||
messages = [{"role": "user", "content": "You are a helpful assistant."}]
|
|
||||||
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
|
|
||||||
generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=10)
|
|
||||||
|
|
||||||
```
|
|
||||||
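To make the position arithmetic above concrete, here is a toy sketch with hypothetical values (no model involved):

```py
import torch

num_cached_tokens = 8  # N tokens already stored in the cache
num_new_tokens = 3     # K new tokens being processed in this forward pass

# New tokens occupy positions [N, N + 1, ..., N + K - 1]
cache_position = torch.arange(
    num_cached_tokens, num_cached_tokens + num_new_tokens, dtype=torch.int64
)
print(cache_position)  # tensor([ 8,  9, 10])
```
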
|
|
||||||
|
|
||||||
## Legacy cache format
|
## Legacy cache format
|
||||||
|
|
||||||
Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`].
|
Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`].
|
||||||
@ -170,14 +143,14 @@ The legacy format is essentially the same data structure but organized different
|
|||||||
- The tensors have the same shape `[batch_size, num_heads, seq_len, head_dim]`.
|
- The tensors have the same shape `[batch_size, num_heads, seq_len, head_dim]`.
|
||||||
- The format is less flexible and doesn't support features like quantization or offloading.
|
- The format is less flexible and doesn't support features like quantization or offloading.
|
||||||
|
|
||||||
If your project depends on this legacy format, we recommend converting to [`DynamicCache`] with [`~DynamicCache.from_legacy_cache`]. Note that the legacy cache format is deprecated and no longer used in Transformers. You can convert back to the tuple format with the [`DynamicCache.to_legacy_cache`] function, which is helpful if you have custom logic for manipulating a cache in a specific format.
|
If your project depends on this legacy format, you can convert between [`DynamicCache`] and a tuple of tuples as shown below with the [`~DynamicCache.from_legacy_cache`] and [`DynamicCache.to_legacy_cache`] functions. This is helpful if you have custom logic for manipulating a cache in a specific format.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
import torch
|
import torch
|
||||||
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
|
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", dtype=torch.float16, device_map="auto")
|
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
|
||||||
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
|
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
|
||||||
|
|
||||||
# `return_dict_in_generate=True` is required to return the cache and `return_legacy_cache` forces the returned cache
|
# `return_dict_in_generate=True` is required to return the cache and `return_legacy_cache` forces the returned cache
|
||||||
@ -186,4 +159,4 @@ generation_outputs = model.generate(**inputs, return_dict_in_generate=True, retu
|
|||||||
|
|
||||||
cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
|
cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
|
||||||
legacy_format_cache = cache.to_legacy_cache()
|
legacy_format_cache = cache.to_legacy_cache()
|
||||||
```
|
```
|
@ -14,64 +14,64 @@ rendered properly in your Markdown viewer.
|
|||||||
|
|
||||||
-->
|
-->
|
||||||
|
|
||||||
# Tool use
|
# Tools and RAG
|
||||||
|
|
||||||
Chat models are commonly trained with support for "function-calling" or "tool-use". Tools are functions supplied by the user, which the model can choose to call as part of its response. For example, models could have access to a calculator tool to perform arithmetic without having to do it internally.
|
The [`~PreTrainedTokenizerBase.apply_chat_template`] method supports virtually any additional argument types - strings, lists, dicts - besides the chat message. This makes it possible to use chat templates for many use cases.
|
||||||
|
|
||||||
This guide will demonstrate how to define tools, how to pass them to a chat model, and how to handle the model's output when it calls a tool.
|
This guide will demonstrate how to use chat templates with tools and retrieval-augmented generation (RAG).
|
||||||
|
|
||||||
## Passing tools
|
## Tools
|
||||||
|
|
||||||
When a model supports tool-use, pass functions to the `tools` argument of [`~PreTrainedTokenizerBase.apply_chat_template`].
|
Tools are functions a large language model (LLM) can call to perform specific tasks. It is a powerful way to extend the capabilities of conversational agents with real-time information, computational tools, or access to large databases.
|
||||||
The tools are passed as either a [JSON schema](https://json-schema.org/learn) or Python functions. If you pass Python functions,
|
|
||||||
the arguments, argument types, and function docstring are parsed in order to generate the JSON schema automatically.
|
|
||||||
|
|
||||||
Although passing Python functions is very convenient, the parser can only handle [Google-style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings)
|
Follow the rules below when creating a tool.
|
||||||
docstrings. Refer to the examples below for how to format a tool-ready function.
|
|
||||||
|
|
||||||
|
1. The function should have a descriptive name.
|
||||||
|
2. The function arguments must have a type hint in the function header (don't include in the `Args` block).
|
||||||
|
3. The function must have a [Google-style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) docstring.
|
||||||
|
4. The function can have a return type and `Returns` block, but these are optional because most tool use models ignore them.
|
||||||
|
|
||||||
|
An example tool to get temperature and wind speed is shown below.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
def get_current_temperature(location: str, unit: str):
|
def get_current_temperature(location: str, unit: str) -> float:
|
||||||
"""
|
"""
|
||||||
Get the current temperature at a location.
|
Get the current temperature at a location.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
location: The location to get the temperature for, in the format "City, Country"
|
location: The location to get the temperature for, in the format "City, Country"
|
||||||
unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"])
|
unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"])
|
||||||
|
Returns:
|
||||||
|
The current temperature at the specified location in the specified units, as a float.
|
||||||
"""
|
"""
|
||||||
return 22. # A real function should probably actually get the temperature!
|
return 22. # A real function should probably actually get the temperature!
|
||||||
|
|
||||||
def get_current_wind_speed(location: str):
|
def get_current_wind_speed(location: str) -> float:
|
||||||
"""
|
"""
|
||||||
Get the current wind speed in km/h at a given location.
|
Get the current wind speed in km/h at a given location.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
location: The location to get the wind speed for, in the format "City, Country"
|
location: The location to get the temperature for, in the format "City, Country"
|
||||||
|
Returns:
|
||||||
|
The current wind speed at the given location in km/h, as a float.
|
||||||
"""
|
"""
|
||||||
return 6. # A real function should probably actually get the wind speed!
|
return 6. # A real function should probably actually get the wind speed!
|
||||||
|
|
||||||
tools = [get_current_temperature, get_current_wind_speed]
|
tools = [get_current_temperature, get_current_wind_speed]
|
||||||
```
|
```
|
||||||
|
|
||||||
You can optionally add a `Returns:` block to the docstring and a return type to the function header, but most models won't use this information. The parser will also ignore the actual code inside the function!
|
|
||||||
|
|
||||||
What really matters is the function name, argument names, argument types, and docstring describing the function's purpose
|
|
||||||
and the purpose of its arguments. These create the "signature" the model will use to decide whether to call the tool.
|
|
||||||
|
|
||||||
## Tool-calling Example
|
|
||||||
|
|
||||||
Load a model and tokenizer that supports tool-use like [NousResearch/Hermes-2-Pro-Llama-3-8B](https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B), but you can also consider a larger model like [Command-R](./model_doc/cohere) or [Mixtral-8x22B](./model_doc/mixtral) if your hardware can support it.
|
Load a model and tokenizer that supports tool-use like [NousResearch/Hermes-2-Pro-Llama-3-8B](https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B), but you can also consider a larger model like [Command-R](./model_doc/cohere) or [Mixtral-8x22B](./model_doc/mixtral) if your hardware can support it.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
import torch
|
import torch
|
||||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
|
||||||
checkpoint = "NousResearch/Hermes-2-Pro-Llama-3-8B"
|
tokenizer = AutoTokenizer.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B")
|
||||||
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
tokenizer = AutoTokenizer.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B")
|
||||||
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype="auto", device_map="auto")
|
model = AutoModelForCausalLM.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B", torch_dtype=torch.bfloat16, device_map="auto")
|
||||||
```
|
```
|
||||||
|
|
||||||
Create a chat history.
|
Create a chat message.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
messages = [
|
messages = [
|
||||||
@ -80,11 +80,12 @@ messages = [
|
|||||||
]
|
]
|
||||||
```
|
```
|
||||||
|
|
||||||
Next, pass `messages` and a list of tools to [`~PreTrainedTokenizerBase.apply_chat_template`]. Tokenize the chat and generate a response.
|
Pass `messages` and a list of tools to [`~PreTrainedTokenizerBase.apply_chat_template`]. Then you can pass the inputs to the model for generation.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
|
inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
|
||||||
outputs = model.generate(**inputs.to(model.device), max_new_tokens=128)
|
inputs = {k: v for k, v in inputs.items()}
|
||||||
|
outputs = model.generate(**inputs, max_new_tokens=128)
|
||||||
print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
|
print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -94,52 +95,60 @@ print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
|
|||||||
</tool_call><|im_end|>
|
</tool_call><|im_end|>
|
||||||
```
|
```
|
||||||
|
|
||||||
The chat model called the `get_current_temperature` tool with the correct parameters from the docstring. It inferred France as the location based on Paris, and that it should use Celsius for the units of temperature.
|
The chat model called the `get_current_temperature` tool with the correct parameters from the docstring. It inferred France as the location based on Paris, and that it should use Celsius for the units of temperature.
|
||||||
|
|
||||||
A model **cannot actually call the tool itself**. It requests a tool call, and it's your job to handle the call and append it and the result to the chat history.
|
Now append the `get_current_temperature` function and these arguments to the chat message as `tool_call`. The `tool_call` dictionary should be provided to the `assistant` role instead of the `system` or `user`.
|
||||||
|
|
||||||
Hold the call in the `tool_calls` key of an `assistant` message. This is the recommended API, and should be supported by the chat template of most tool-using models.
|
|
||||||
|
|
||||||
> [!WARNING]
|
> [!WARNING]
|
||||||
> Although `tool_calls` is similar to the OpenAI API, the OpenAI API uses a JSON string as its `tool_calls` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict.
|
> The OpenAI API uses a JSON string as its `tool_call` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict.
|
||||||
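If you are adapting payloads produced by an OpenAI-compatible client, decode the JSON string into a dict first. A small sketch (the argument values are illustrative):

```py
import json

# OpenAI-style tool calls carry the arguments as a JSON string...
openai_style_arguments = '{"location": "Paris, France", "unit": "celsius"}'

# ...while Transformers chat templates expect a plain dict.
tool_call = {
    "name": "get_current_temperature",
    "arguments": json.loads(openai_style_arguments),
}
```
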
|
|
||||||
|
<hfoptions id="tool-call">
|
||||||
|
<hfoption id="Llama">
|
||||||
|
|
||||||
```py
|
```py
|
||||||
tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
|
tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
|
||||||
messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
|
messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
|
||||||
```
|
```
|
||||||
|
|
||||||
Append the tool response to the chat history with the `tool` role.
|
Allow the assistant to read the function outputs and chat with the user.
|
||||||
|
|
||||||
```py
|
|
||||||
messages.append({"role": "tool", "content": "22"}) # Note that the returned content is always a string!
|
|
||||||
```
|
|
||||||
|
|
||||||
Finally, allow the model to read the tool response and reply to the user.
|
|
||||||
|
|
||||||
```py
|
```py
|
||||||
inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
|
inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
|
||||||
out = model.generate(**inputs.to(model.device), max_new_tokens=128)
|
inputs = {k: v for k, v in inputs.items()}
|
||||||
|
out = model.generate(**inputs, max_new_tokens=128)
|
||||||
print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
|
print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
|
||||||
```
|
```
|
||||||
|
|
||||||
```txt
|
```txt
|
||||||
The temperature in Paris, France right now is 22°C.<|im_end|>
|
The temperature in Paris, France right now is approximately 12°C (53.6°F).<|im_end|>
|
||||||
```
|
```
|
||||||
|
|
||||||
> [!WARNING]
|
</hfoption>
|
||||||
> Although the key in the assistant message is called `tool_calls`, in most cases, models only emit a single tool call at a time. Some older models emit multiple tool calls at the same time, but this is a
|
<hfoption id="Mistral/Mixtral">
|
||||||
> significantly more complex process, as you need to handle multiple tool responses at once and disambiguate them, often using tool call IDs. Please refer to the model card to see exactly what format a model expects for tool calls.
|
|
||||||
|
|
||||||
|
For [Mistral](./model_doc/mistral) and [Mixtral](./model_doc/mixtral) models, you need an additional `tool_call_id`. The `tool_call_id` is 9 randomly generated alphanumeric characters assigned to the `id` key in the `tool_call` dictionary.
|
||||||
|
|
||||||
## JSON schemas
|
```py
|
||||||
|
tool_call_id = "9Ae3bDc2F"
|
||||||
|
tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
|
||||||
|
messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]})
|
||||||
|
```
|
||||||
|
|
||||||
Another way to define tools is by passing a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
|
```py
|
||||||
|
inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
|
||||||
|
inputs = {k: v for k, v in inputs.items()}
|
||||||
|
out = model.generate(**inputs, max_new_tokens=128)
|
||||||
|
print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
|
||||||
|
```
|
||||||
|
|
||||||
You can also manually call the low-level functions that convert Python functions to JSON schemas, and then check or edit the generated schemas. This is usually not necessary, but is useful for understanding the underlying mechanics. It's particularly important
|
</hfoption>
|
||||||
for chat template authors who need to access the JSON schema to render the tool definitions.
|
</hfoptions>
|
||||||
|
|
||||||
The [`~PreTrainedTokenizerBase.apply_chat_template`] method uses the [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205) function to convert Python functions to a JSON schema.
|
## Schema
|
||||||
|
|
||||||
|
[`~PreTrainedTokenizerBase.apply_chat_template`] converts functions into a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step) which is passed to the chat template. An LLM never sees the code inside the function. In other words, an LLM doesn't care how the function works technically, it only cares about the function **definition** and **arguments**.
|
||||||
|
|
||||||
|
The JSON schema is automatically generated behind the scenes as long as your function follows the [rules](#tools) listed earlier above. But you can use [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205) to generate the schema manually for more visibility or debugging.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
from transformers.utils import get_json_schema
|
from transformers.utils import get_json_schema
|
||||||
@ -182,7 +191,12 @@ print(schema)
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
We won't go into the details of JSON schema itself here, since it's already [very well documented](https://json-schema.org/) elsewhere. We will, however, mention that you can pass JSON schema dicts to the `tools` argument of [`~PreTrainedTokenizerBase.apply_chat_template`] instead of Python functions:
|
You can edit the schema or write one entirely from scratch. This gives you a lot of flexibility to define precise schemas for more complex functions.
|
||||||
|
|
||||||
|
> [!WARNING]
|
||||||
|
> Try keeping your function signatures simple and the arguments to a minimum. These are easier for a model to understand and use than complex functions, for example ones with nested arguments.
|
||||||
|
|
||||||
|
The example below demonstrates writing a schema manually and then passing it to [`~PreTrainedTokenizerBase.apply_chat_template`].
|
||||||
|
|
||||||
```py
|
```py
|
||||||
# A simple function that takes no arguments
|
# A simple function that takes no arguments
|
||||||
@ -224,4 +238,62 @@ model_input = tokenizer.apply_chat_template(
|
|||||||
messages,
|
messages,
|
||||||
tools = [current_time, multiply]
|
tools = [current_time, multiply]
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## RAG
|
||||||
|
|
||||||
|
Retrieval-augmented generation (RAG) models enhance a model's existing knowledge by allowing it to search documents for additional information before responding to a query. For RAG models, add a `documents` parameter to [`~PreTrainedTokenizerBase.apply_chat_template`]. This `documents` parameter should be a list of documents, and each document should be a single dict with `title` and `content` keys.
|
||||||
|
|
||||||
|
> [!TIP]
|
||||||
|
> The `documents` parameter for RAG isn't widely supported and many models have chat templates that ignore `documents`. Verify if a model supports `documents` by reading its model card or executing `print(tokenizer.chat_template)` to see if the `documents` key is present. [Command-R](https://hf.co/CohereForAI/c4ai-command-r-08-2024) and [Command-R+](https://hf.co/CohereForAI/c4ai-command-r-plus-08-2024) both support `documents` in their RAG chat templates.
|
||||||
|
|
||||||
|
Create a list of documents to pass to the model.
|
||||||
|
|
||||||
|
```py
|
||||||
|
documents = [
|
||||||
|
{
|
||||||
|
"title": "The Moon: Our Age-Old Foe",
|
||||||
|
"text": "Man has always dreamed of destroying the moon. In this essay, I shall..."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "The Sun: Our Age-Old Friend",
|
||||||
|
"text": "Although often underappreciated, the sun provides several notable benefits..."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
Set `chat_template="rag"` in [`~PreTrainedTokenizerBase.apply_chat_template`] and generate a response.
|
||||||
|
|
||||||
|
```py
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||||
|
|
||||||
|
# Load the model and tokenizer
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit")
|
||||||
|
model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit", device_map="auto")
|
||||||
|
device = model.device # Get the device the model is loaded on
|
||||||
|
|
||||||
|
# Define conversation input
|
||||||
|
conversation = [
|
||||||
|
{"role": "user", "content": "What has Man always dreamed of?"}
|
||||||
|
]
|
||||||
|
|
||||||
|
input_ids = tokenizer.apply_chat_template(
|
||||||
|
conversation=conversation,
|
||||||
|
documents=documents,
|
||||||
|
chat_template="rag",
|
||||||
|
tokenize=True,
|
||||||
|
add_generation_prompt=True,
|
||||||
|
return_tensors="pt").to(device)
|
||||||
|
|
||||||
|
# Generate a response
|
||||||
|
generated_tokens = model.generate(
|
||||||
|
input_ids,
|
||||||
|
max_new_tokens=100,
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.3,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Decode and print the generated text along with generation prompt
|
||||||
|
generated_text = tokenizer.decode(generated_tokens[0])
|
||||||
|
print(generated_text)
|
||||||
|
```
|
||||||
|
@ -14,19 +14,11 @@ rendered properly in your Markdown viewer.
|
|||||||
|
|
||||||
-->
|
-->
|
||||||
|
|
||||||
# Chat templates
|
# Templates
|
||||||
|
|
||||||
The [chat basics](./conversations) guide covers how to store chat histories and generate text from chat models using [`TextGenerationPipeline`].
|
The [chat pipeline](./conversations) guide introduced [`TextGenerationPipeline`] and the concept of a chat prompt or chat template for conversing with a model. Underlying this high-level pipeline is the [`apply_chat_template`] method. A chat template is a part of the tokenizer and it specifies how to convert conversations into a single tokenizable string in the expected model format.
|
||||||
|
|
||||||
This guide is intended for more advanced users, and covers the underlying classes and methods, as well as the key concepts for understanding what's actually going on when you chat with a model.
|
In the example below, Mistral-7B-Instruct and Zephyr-7B are finetuned from the same base model but they’re trained with different chat formats. Without chat templates, you have to manually write formatting code for each model and even minor errors can hurt performance. Chat templates offer a universal way to format chat inputs to any model.
|
||||||
|
|
||||||
The critical insight needed to understand chat models is this: All causal LMs, whether chat-trained or not, continue a sequence of tokens. When causal LMs are trained, the training usually begins with "pre-training" on a huge corpus of text, which creates a "base" model.
|
|
||||||
These base models are then often "fine-tuned" for chat, which means training them on data that is formatted as a sequence of messages. The chat is still just a sequence of tokens, though! The list of `role` and `content` dictionaries that you pass
|
|
||||||
to a chat model gets converted to a token sequence, often with control tokens like `<|user|>` or `<|assistant|>` or `<|end_of_message|>`, which allow the model to see the chat structure.
|
|
||||||
There are many possible chat formats, and different models may use different formats or control tokens, even if they were fine-tuned from the same base model!
|
|
||||||
|
|
||||||
Don't panic, though - you don't need to memorize every possible chat format in order to use chat models. Chat models come with **chat templates**, which indicate how they expect chats to be formatted.
|
|
||||||
You can access these with the [`apply_chat_template`] method. Let's see two examples. Both of these models are fine-tuned from the same `Mistral-7B` base model:
|
|
||||||
|
|
||||||
<hfoptions id="template">
|
<hfoptions id="template">
|
||||||
<hfoption id="Mistral">
|
<hfoption id="Mistral">
|
||||||
@ -69,24 +61,20 @@ tokenizer.apply_chat_template(chat, tokenize=False)
|
|||||||
</hfoption>
|
</hfoption>
|
||||||
</hfoptions>
|
</hfoptions>
|
||||||
|
|
||||||
Mistral-7B-Instruct uses `[INST]` and `[/INST]` tokens to indicate the start and end of user messages, while Zephyr-7B uses `<|user|>` and `<|assistant|>` tokens to indicate speaker roles. This is why chat templates are important - with the wrong control tokens, these models would have drastically worse performance.
|
This guide explores [`apply_chat_template`] and chat templates in more detail.
|
||||||
|
|
||||||
## Using `apply_chat_template`
|
## apply_chat_template
|
||||||
|
|
||||||
The input to `apply_chat_template` should be structured as a list of dictionaries with `role` and `content` keys. The `role` key specifies the speaker, and the `content` key contains the message. The common roles are:
|
Chats should be structured as a list of dictionaries with `role` and `content` keys. The `role` key specifies the speaker (usually between you and the system), and the `content` key contains your message. For the system, the `content` is a high-level description of how the model should behave and respond when you’re chatting with it.
|
||||||
|
|
||||||
- `user` for messages from the user
|
Pass your messages to [`apply_chat_template`] to tokenize and format them. You can set [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` to indicate the start of a message.
|
||||||
- `assistant` for messages from the model
|
|
||||||
- `system` for directives on how the model should act (usually placed at the beginning of the chat)
|
|
||||||
|
|
||||||
[`apply_chat_template`] takes this list and returns a formatted sequence. Set `tokenize=True` if you want to tokenize the sequence.
|
|
||||||
|
|
||||||
```py
|
```py
|
||||||
import torch
|
import torch
|
||||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
|
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
|
||||||
model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", device_map="auto", dtype=torch.bfloat16)
|
model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", device_map="auto", torch_dtype=torch.bfloat16)
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
{"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate",},
|
{"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate",},
|
||||||
@ -95,7 +83,6 @@ messages = [
|
|||||||
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
|
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
|
||||||
print(tokenizer.decode(tokenized_chat[0]))
|
print(tokenizer.decode(tokenized_chat[0]))
|
||||||
```
|
```
|
||||||
|
|
||||||
```md
|
```md
|
||||||
<|system|>
|
<|system|>
|
||||||
You are a friendly chatbot who always responds in the style of a pirate</s>
|
You are a friendly chatbot who always responds in the style of a pirate</s>
|
||||||
@ -104,7 +91,7 @@ How many helicopters can a human eat in one sitting?</s>
|
|||||||
<|assistant|>
|
<|assistant|>
|
||||||
```
|
```
|
||||||
|
|
||||||
Pass the tokenized chat to [`~GenerationMixin.generate`] to generate a response.
|
Now pass the tokenized chat to [`~GenerationMixin.generate`] to generate a response.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
outputs = model.generate(tokenized_chat, max_new_tokens=128)
|
outputs = model.generate(tokenized_chat, max_new_tokens=128)
|
||||||
@ -119,17 +106,10 @@ How many helicopters can a human eat in one sitting?</s>
|
|||||||
Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all.
|
Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all.
|
||||||
```
|
```
|
||||||
|
|
||||||
> [!WARNING]
|
|
||||||
> Some tokenizers add special `<bos>` and `<eos>` tokens. Chat templates should already include all the necessary special tokens, and adding additional special tokens is often incorrect or duplicated, hurting model performance. When you format text with `apply_chat_template(tokenize=False)`, make sure you set `add_special_tokens=False` if you tokenize later to avoid duplicating these tokens.
|
|
||||||
> This isn’t an issue if you use `apply_chat_template(tokenize=True)`, which means it's usually the safer option!
|
|
||||||
|
|
||||||
### add_generation_prompt
|
### add_generation_prompt
|
||||||
|
The [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) parameter adds tokens that indicate the start of a response. This ensures the chat model generates a system response instead of continuing a users message.
|
||||||
|
|
||||||
You may have noticed the [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) argument in the above examples.
|
Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don’t have any special tokens before the system response. In this case, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect.
|
||||||
This argument adds tokens to the end of the chat that indicate the start of an `assistant` response. Remember: Beneath all the chat abstractions, chat models are still just language models that continue a sequence of tokens!
|
|
||||||
If you include tokens that tell it that it's now in an `assistant` response, it will correctly write a response, but if you don't include these tokens, the model may get confused and do something strange, like **continuing** the user's message instead of replying to it!
|
|
||||||
|
|
||||||
Let's see an example to understand what `add_generation_prompt` is actually doing. First, let's format a chat without `add_generation_prompt`:
|
|
||||||
|
|
||||||
```py
|
```py
|
||||||
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
|
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
|
||||||
@ -144,32 +124,11 @@ Nice to meet you!<|im_end|>
|
|||||||
Can I ask a question?<|im_end|>
|
Can I ask a question?<|im_end|>
|
||||||
```
|
```
|
||||||
|
|
||||||
Now, let's format the same chat with `add_generation_prompt=True`:
|
|
||||||
|
|
||||||
```py
|
|
||||||
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
||||||
tokenized_chat
|
|
||||||
```
|
|
||||||
```md
|
|
||||||
<|im_start|>user
|
|
||||||
Hi there!<|im_end|>
|
|
||||||
<|im_start|>assistant
|
|
||||||
Nice to meet you!<|im_end|>
|
|
||||||
<|im_start|>user
|
|
||||||
Can I ask a question?<|im_end|>
|
|
||||||
<|im_start|>assistant
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
When `add_generation_prompt=True`, `<|im_start|>assistant` is added at the end to indicate the start of an `assistant` message. This lets the model know an `assistant` response is next.
|
|
||||||
|
|
||||||
Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don’t have any special tokens before the `assistant` response. In these cases, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect.
|
|
||||||
|
|
||||||
### continue_final_message
|
### continue_final_message
|
||||||
|
|
||||||
The [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) parameter controls whether the final message in the chat should be continued or not instead of starting a new one. It removes end of sequence tokens so that the model continues generation from the final message.
|
The [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) parameter controls whether the final message in the chat should be continued or not instead of starting a new one. It removes end of sequence tokens so that the model continues generation from the final message.
|
||||||
|
|
||||||
This is useful for “prefilling” a model response. In the example below, the model generates text that continues the JSON string rather than starting a new message. It can be very useful for improving the accuracy of instruction following when you know how to start its replies.
|
This is useful for “prefilling” a model response. In the example below, the model generates text that continues the JSON string rather than starting a new message. It can be very useful for improving the accuracy for instruction following when you know how to start its replies.
|
||||||
|
|
||||||
```py
|
```py
|
||||||
chat = [
|
chat = [
|
||||||
@ -184,12 +143,52 @@ model.generate(**formatted_chat)
|
|||||||
> [!WARNING]
|
> [!WARNING]
|
||||||
> You shouldn’t use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error.
|
> You shouldn’t use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error.
|
||||||
|
|
||||||
[`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the `assistant` role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don’t support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) argument to the pipeline.
|
[`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the “assistant” role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don’t support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) to the pipeline.
|
||||||
|
|
||||||
|
## Multiple templates
|
||||||
|
|
||||||
|
A model may have several different templates for different use cases. For example, a model may have a template for regular chat, tool use, and RAG.
|
||||||
|
|
||||||
|
When there are multiple templates, the chat template is a dictionary. Each key corresponds to the name of a template. [`apply_chat_template`] handles multiple templates based on their name. It looks for a template named `default` in most cases and if it can’t find one, it raises an error.
|
||||||
|
|
||||||
|
For a tool calling template, if a user passes a `tools` parameter and a `tool_use` template exists, the tool calling template is used instead of `default`.
|
||||||
|
|
||||||
|
To access templates with other names, pass the template name to the `chat_template` parameter in [`apply_chat_template`]. For example, if you’re using a RAG template then set `chat_template="rag"`.
|
||||||
|
|
||||||
|
It can be confusing to manage multiple templates though, so we recommend using a single template for all use cases. Use Jinja statements like `if tools is defined` and `{% macro %}` definitions to wrap multiple code paths in a single template.
|
||||||
|
|
||||||
|
## Template selection
|
||||||
|
|
||||||
|
It is important to set a chat template format that matches the template format a model was pretrained on, otherwise performance may suffer. Even if you’re training the model further, performance is best if the chat tokens are kept constant.
|
||||||
|
|
||||||
|
But if you’re training a model from scratch or finetuning a model for chat, you have more options to select a template. For example, [ChatML](https://github.com/openai/openai-python/blob/release-v0.28.0/chatml.md) is a popular format that is flexible enough to handle many use cases. It even includes support for [generation prompts](#add_generation_prompt), but it doesn’t add beginning-of-string (`BOS`) or end-of-string (`EOS`) tokens. If your model expects `BOS` and `EOS` tokens, set `add_special_tokens=True` and make sure to add them to your template.
|
||||||
|
|
||||||
|
```py
|
||||||
|
{%- for message in messages %}
|
||||||
|
{{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
|
||||||
|
{%- endfor %}
|
||||||
|
```
|
||||||
|
|
||||||
|
Set the template with the following logic to support [generation prompts](#add_generation_prompt). The template wraps each message with `<|im_start|>` and `<|im_end|>` tokens and writes the role as a string. This allows you to easily customize the roles you want to train with.
|
||||||
|
|
||||||
|
```py
|
||||||
|
tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
|
||||||
|
```

The `user`, `system` and `assistant` roles are standard roles in chat templates. We recommend using these roles when it makes sense, especially if you're using your model with the [`TextGenerationPipeline`].

```txt
<|im_start|>system
You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|>
<|im_start|>user
How are you?<|im_end|>
<|im_start|>assistant
I'm doing great!<|im_end|>
```

## Model training

Training a model with a chat template is a good way to ensure the template matches the tokens the model was trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren't helpful during training.

An example of preprocessing a dataset with a chat template is shown below.
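
The diff elides the original example here, so this is only a minimal sketch of that preprocessing step. The checkpoint is a placeholder taken from elsewhere in these docs, and the tiny toy dataset reuses the moon/sun chat referenced below:

```py
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

# Toy dataset with a single chat per row; a real dataset would have many rows
dataset = Dataset.from_dict({
    "chat": [
        [
            {"role": "user", "content": "Which is bigger, the moon or the sun?"},
            {"role": "assistant", "content": "The sun."},
        ]
    ]
})

def format_chat(example):
    # add_generation_prompt=False: no trailing assistant header during training
    example["formatted_chat"] = tokenizer.apply_chat_template(
        example["chat"], tokenize=False, add_generation_prompt=False
    )
    return example

dataset = dataset.map(format_chat)
print(dataset["formatted_chat"][0])
```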
@ -220,3 +219,11 @@ The sun.</s>
After this step, you can continue following the [training recipe](./tasks/language_modeling) for causal language models using the `formatted_chat` column.

Some tokenizers add special `<bos>` and `<eos>` tokens. Chat templates should already include all the necessary special tokens, and adding additional special tokens is often incorrect or duplicated, hurting model performance. When you format text with `apply_chat_template(tokenize=False)`, make sure you set `add_special_tokens=False` as well to avoid duplicating them.

```py
apply_chat_template(messages, tokenize=False, add_special_tokens=False)
```

This isn't an issue if you use `apply_chat_template(tokenize=True)`.
@ -14,21 +14,22 @@ rendered properly in your Markdown viewer.

-->

# Multimodal chat templates

Multimodal chat models accept inputs like images, audio or video, in addition to text. The `content` key in a multimodal chat history is a list containing multiple items of different types. This is unlike text-only chat models whose `content` key is a single string.

In the same way the [Tokenizer](./fast_tokenizer) class handles chat templates and tokenization for text-only models, the [Processor](./processors) class handles preprocessing, tokenization and chat templates for multimodal models. Their [`~ProcessorMixin.apply_chat_template`] methods are almost identical.

This guide will show you how to chat with multimodal models with the high-level [`ImageTextToTextPipeline`] and at a lower level using the [`~ProcessorMixin.apply_chat_template`] and [`~GenerationMixin.generate`] methods.

## ImageTextToTextPipeline

[`ImageTextToTextPipeline`] is a high-level image and text generation class with a "chat mode". Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).

Add image and text blocks to the `content` key in the chat history.

```py
messages = [
@ -46,35 +47,39 @@ messages = [
]
```
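
The diff collapses the middle of this example, so here is a minimal sketch of what a complete chat history might look like. The system prompt, image URL and question are taken from the sample output shown further below:

```py
messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
            {"type": "text", "text": "What are these?"},
        ],
    },
]
```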

Create an [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Setting the data type to [auto](./models#model-data-type) also helps save memory and improve speed.

> [!TIP]
> The [`ImageTextToTextPipeline`] accepts chats in the OpenAI format to make inference easier and more accessible.
```python
import torch
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="Qwen/Qwen2.5-VL-3B-Instruct", device_map="auto", dtype="auto")
out = pipe(text=messages, max_new_tokens=128)
print(out[0]['generated_text'][-1]['content'])
```

```
Ahoy, me hearty! These be two feline friends, likely some tabby cats, taking a siesta on a cozy pink blanket. They're resting near remote controls, perhaps after watching some TV or just enjoying some quiet time together. Cats sure know how to find comfort and relaxation, don't they?
```

Aside from the gradual descent from pirate-speak into modern American English (it **is** only a 3B model, after all), this is correct!

## Using `apply_chat_template`

Like [text-only models](./chat_templating), use the [`~ProcessorMixin.apply_chat_template`] method to prepare the chat messages for multimodal models. This method handles the tokenization and formatting of the chat messages, including images and other media types. The resulting inputs are passed to the model for generation.

- The content `"type"` can be an `"image"` or `"text"`.
- For images, it can be a link to the image (`"url"`), a file path (`"path"`), or `"base64"`. Images are automatically loaded, processed, and prepared into pixel values as inputs to the model.
```python
from transformers import AutoProcessor, AutoModelForImageTextToText

model = AutoModelForImageTextToText.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", device_map="auto", torch_dtype="auto")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

messages = [
    {
@ -91,28 +96,14 @@ messages = [
]
```
Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. Unlike text models, the output of `apply_chat_template` contains a `pixel_values` key with the preprocessed image data, in addition to the tokenized text.

```py
processed_chat = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
print(list(processed_chat.keys()))
```

```
['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw']
```

Pass these inputs to [`~GenerationMixin.generate`].

```python
out = model.generate(**processed_chat.to(model.device), max_new_tokens=128)
print(processor.decode(out[0]))
```

The decoded output contains the full conversation so far, including the user message and the placeholder tokens that contain the image information. You may need to trim the previous conversation from the output before displaying it to the user.
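
One simple way to do this (a sketch, not the only option) is to slice off the prompt tokens before decoding, so only the newly generated text is shown:

```py
# Keep only the tokens generated after the prompt
prompt_length = processed_chat["input_ids"].shape[1]
new_tokens = out[:, prompt_length:]
print(processor.decode(new_tokens[0], skip_special_tokens=True))
```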

## Video inputs

@ -120,7 +111,6 @@ Some vision models also support video inputs. The message format is very similar

- The content `"type"` should be `"video"` to indicate the content is a video.
- For videos, it can be a link to the video (`"url"`) or a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).
- In addition to loading videos from a URL or file path, you can also pass decoded video data directly. This is useful if you've already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, decord, or torchvision). You don't need to save them to files or store them at a URL.

> [!WARNING]
> Loading a video from `"url"` is only supported by the PyAV or Decord backends.
@ -147,52 +137,6 @@ messages = [
### Example: Passing decoded video objects

```python
import numpy as np

video_object1 = np.random.randint(0, 255, size=(16, 224, 224, 3), dtype=np.uint8)

messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
    },
    {
        "role": "user",
        "content": [
            {"type": "video", "video": video_object1},
            {"type": "text", "text": "What do you see in this video?"}
        ],
    },
]
```

You can also use the existing `load_video()` function to load a video, edit it in memory, and pass it in the messages.

```python
# Make sure a video backend library (pyav, decord, or torchvision) is available.
from transformers.video_utils import load_video

# load a video file in memory for testing
video_object2, _ = load_video(
    "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"
)

messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
    },
    {
        "role": "user",
        "content": [
            {"type": "video", "video": video_object2},
            {"type": "text", "text": "What do you see in this video?"}
        ],
    },
]
```

Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`] that control the sampling process.

The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html).
@ -231,7 +175,7 @@ processed_chat = processor.apply_chat_template(

```python
processed_chat = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    video_fps=16,
    video_load_backend="decord",
)
print(processed_chat.keys())
```
@ -272,3 +216,28 @@ print(processed_chat.keys())
</hfoption>
</hfoptions>

## Template configuration

You can create a custom chat template with [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) and set it with [`~ProcessorMixin.apply_chat_template`]. Refer to the [Template writing](./chat_templating_writing) guide for more details.

For example, to enable a template to handle a *list of content* from multiple modalities while still supporting plain strings for text-only inference, specify how to handle the `content['type']` if it is an image or text as shown below in the Llama 3.2 Vision Instruct [template](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct/blob/main/chat_template.json).

```jinja
{% for message in messages %}
{% if loop.index0 == 0 %}{{ bos_token }}{% endif %}
{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
{% if message['content'] is string %}
{{ message['content'] }}
{% else %}
{% for content in message['content'] %}
{% if content['type'] == 'image' %}
{{ '<|image|>' }}
{% elif content['type'] == 'text' %}
{{ content['text'] }}
{% endif %}
{% endfor %}
{% endif %}
{{ '<|eot_id|>' }}
{% endfor %}
{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}
```
@ -14,10 +14,15 @@ rendered properly in your Markdown viewer.

-->

# Writing a chat template

A chat template is a [Jinja](https://jinja.palletsprojects.com/en/stable/templates/) template stored in the tokenizer's [chat_template](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.chat_template) attribute. Jinja is a templating language that allows you to write Python-like code and syntax.

An example template is shown below.
```jinja
{%- for message in messages %}
@ -29,76 +34,47 @@ A chat template is a [Jinja](https://jinja.palletsprojects.com/en/stable/templat
{%- endif %}
```

If you stare at this for a while, you should realize that this is actually very like Python, albeit with some strange `{%-` syntax. The template iterates over a list of messages, and for each message, it prints the role and content of the message, followed by an end-of-sequence token. If `add_generation_prompt=True`, it adds the starting header for an assistant message to the end of the conversation.

Load the written template as a string and assign it to the tokenizer's `chat_template` attribute. Once set, the template is used whenever you call [`~PreTrainedTokenizerBase.apply_chat_template`]. It is also saved with the tokenizer whenever [`~PreTrainedTokenizer.save_pretrained`] or [`~PreTrainedTokenizer.push_to_hub`] is called. The template is saved in the `chat_template.jinja` file in the tokenizer directory. You can edit this file directly to change the template, which is often easier than manipulating a template string.
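
A minimal sketch of that workflow is shown below. The checkpoint is a placeholder reused from elsewhere in these docs, and the template string is purely illustrative, not one shipped with any particular model:

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

# Assign a Jinja template string; apply_chat_template uses it from now on
tokenizer.chat_template = (
    "{%- for message in messages %}"
    "{{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}"
    "{%- endfor %}"
    "{%- if add_generation_prompt %}{{- '<|im_start|>assistant\n' }}{%- endif %}"
)

messages = [{"role": "user", "content": "Hello!"}]
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

# Saving the tokenizer also saves the template alongside it
tokenizer.save_pretrained("./my-model-with-template")
```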

## Template writing tips

The easiest way to start writing Jinja templates is to refer to existing templates. Use `print(tokenizer.chat_template)` on any chat model to see the template it's using. Try starting with simple models that don't call any tools or support RAG because tool-use models can have very complex templates. Finally, take a look at the [Jinja documentation](https://jinja.palletsprojects.com/en/stable/templates/#synopsis) for more details about formatting and syntax.

There are some specific tips and pitfalls you may encounter while writing chat templates, though, and this section covers some of them in more detail.

### Writing multimodal chat templates

For multimodal templates, the `chat_template` attribute is set on the **processor**, not the tokenizer. The `content` key of a message is often a list of content dicts, rather than just a single string. You may wish to check the type of each content item in the list, and handle it accordingly.

Generally, the template should not directly access image or video data. This is normally handled by the processor after template rendering has finished. Instead, your template should emit a single special token like `<|image|>` or `<|video|>` when it encounters image or video content. The processor will expand the single special token out into a sequence of image or video tokens later. The exact tokens to emit depend on the model you're working with. We strongly recommend loading an existing multimodal processor to see how it handles data.
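
For example, a quick way to inspect how an existing model does this (the checkpoint is just an example reused from earlier in these docs; any multimodal checkpoint works):

```py
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
# The multimodal template lives on the processor, not the tokenizer
print(processor.chat_template)
```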

The example template below handles mixed image and text content.

```jinja
{%- for message in messages %}
    {%- if loop.index0 == 0 %}
        {{- bos_token }}
    {%- endif %}
    {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
    {%- if message['content'] is string %}
        {{- message['content'] }}
    {%- else %}
        {%- for content in message['content'] %}
            {%- if content['type'] == 'image' %}
                {{- '<|image|>' }}
            {%- elif content['type'] == 'text' %}
                {{- content['text'] }}
            {%- endif %}
        {%- endfor %}
    {%- endif %}
    {{- '<|eot_id|>' }}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
```
This multimodal template is very similar to the simpler template above, but it checks for `content` lists, and iterates over them to render `<|image|>` tokens where necessary. This allows images to be inserted "into the flow" of user text.

Not all models work this way - some may move all images to the end of the user message, for example. The chat template should always match the format the model was trained with.
### Trimming whitespace

Jinja prints any whitespace before or after a block of text. This can be an issue for chat templates because adding extra whitespace that was not present during model training can harm performance. To remove the whitespace, add `-` to the Jinja line syntax. This allows you to write your template with Pythonic indentation and linebreaks, without accidentally printing an indentation in the rendered output.

The example template below doesn't use `-`, resulting in extra whitespace being printed in the output.

```jinja
{% for message in messages %}
    {{ message['role'] + message['content'] }}
{% endfor %}
```

We strongly recommend using `-` to ensure only the intended content is printed.

```jinja
{%- for message in messages %}
@ -106,20 +82,30 @@ We strongly recommend using `-` to ensure only the intended content is printed.
{%- endfor %}
```

### Special variables and callables

The only constants in a template are the `messages` variable and the `add_generation_prompt` boolean. However, you have access to **any other keyword arguments that are passed** to the [`~PreTrainedTokenizerBase.apply_chat_template`] method.

This provides flexibility and enables support for use-cases we may not have thought of while designing the spec. The most common additional variable is `tools`, which contains a list of tools in JSON schema format. Although you can use any variable name you like, we highly recommend sticking to convention and using `tools` for this purpose. This makes templates more compatible with the standard API.

You also have access to any tokens contained in `tokenizer.special_tokens_map`, which often includes special tokens like `bos_token` and `eos_token`. Access these directly by name, like `{{- bos_token }}`.
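
To illustrate, here is a small sketch of how extra keyword arguments flow into the template as variables. The `documents` argument is just one example of such a keyword, and whether a given model's template actually reads it depends on that template:

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

messages = [{"role": "user", "content": "Summarize the documents."}]
documents = [{"title": "Report", "text": "Quarterly revenue grew 10%."}]

# Any extra keyword argument to apply_chat_template is exposed to the template
# as a variable of the same name (whether the template uses it is up to the template).
prompt = tokenizer.apply_chat_template(
    messages,
    documents=documents,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)
```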

There are two callable functions available to you. To call them, use `{{- function_name(argument) }}`.

- `raise_exception(msg)` raises a `TemplateException`. This is useful for debugging or warning users about incorrect template usage.
- `strftime_now(format_str)` retrieves the current date and time in a specific format, which is often required in system messages. It is equivalent to [datetime.now().strftime(format_str)](https://docs.python.org/3/library/datetime.html#datetime.datetime.now) in Python.
### Compatibility with non-Python Jinja
@ -158,11 +144,9 @@ The following section lists elements of the standard API for writing templates f

### Tool definitions

[Tools](./chat_extras) are passed as Python functions or a JSON schema. When functions are passed, a JSON schema is automatically generated and passed to the template. When a template accesses the `tools` variable, it is always a list of JSON schemas.
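
A sketch of what that looks like from the user's side is shown below. The function itself is a made-up example, and the checkpoint is just a tool-capable model mentioned elsewhere in these docs; the docstring and type hints are what the schema is generated from:

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

def get_current_temperature(location: str, unit: str = "celsius") -> float:
    """
    Get the current temperature at a location.

    Args:
        location: The city and country, e.g. "Paris, France"
        unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"])
    """
    return 22.0  # a real implementation would call a weather API

messages = [{"role": "user", "content": "What's the weather like in Paris?"}]

# The function is converted to a JSON schema before the template sees it,
# so the template only ever receives a list of JSON schemas in `tools`.
prompt = tokenizer.apply_chat_template(
    messages, tools=[get_current_temperature], tokenize=False, add_generation_prompt=True
)
print(prompt)
```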

Even though a template always receives tools as a JSON schema, you may need to radically change this format when rendering them to match the format a model was trained with. For example, [Command-R](./model_doc/cohere) was trained with tools defined with Python function headers. The template internally converts JSON schema types and renders the input tools as Python headers.

The example below shows how a tool is defined in JSON schema format.
```json
{
@ -188,7 +172,7 @@ The example below shows how a tool is defined in JSON schema format.
}
```

An example of handling tool definitions in a chat template is shown below. The specific tokens and layouts should be changed to match the ones the model was trained with.

```
{%- if tools %}
```
@ -204,9 +188,7 @@ An example of handling tool definitions in a chat template is shown below. The s

### Tool calls

In addition to rendering the tool definitions, you also need to render **tool calls** and **tool responses** in the template.

Tool calls are generally passed in the `tool_calls` key of an `"assistant"` message. This is always a list even though most tool-calling models only support single tool calls, which means the list usually only contains a single element.
```json
{
@ -226,7 +208,7 @@ Tool calls are generally passed in the `tool_calls` key of an `"assistant”` me
}
```

A common pattern for handling tool calls is shown below. You can use this as a starting point, but make sure your template actually matches the format the model was trained with!

```
{%- if message['role'] == 'assistant' and 'tool_calls' in message %}
```
@ -239,7 +221,7 @@ A common pattern for handling tool calls is shown below. You can use this as a s

### Tool responses

Tool responses are message dicts with the `tool` role. They are much simpler than tool calls, and usually only contain the `role`, `name` and `content` keys.
```json
{
@ -249,7 +231,7 @@ Tool responses are message dicts with the `tool` role. They are much simpler tha
}
```

Some templates may not even need the `name` key, in which case, you can write your template to only read the `content` key.

```
{%- if message['role'] == 'tool' %}
```
@ -259,11 +241,11 @@ Some templates may not even need the `name` key, in which case, you can write yo

## Contribute

Once a template is ready, set it to the `chat_template` attribute in the tokenizer and test it with [`~PreTrainedTokenizerBase.apply_chat_template`]. If it works as expected, then upload it to the Hub with [`~PreTrainedTokenizer.push_to_hub`].

Even if you're not the model owner, it is still helpful to add a template for a model with an empty or incorrect chat template. Open a [pull request](https://hf.co/docs/hub/repositories-pull-requests-discussions) on the model repository to add the template!

```py
tokenizer.chat_template = template
tokenizer.push_to_hub("amazing_company/cool_model", commit_message="Add chat template", create_pr=True)
```
@ -17,6 +17,7 @@ This page regroups resources around 🤗 Transformers developed by the community
| Notebook | Description | Author | |
|:----------|:-------------|:-------------|------:|
| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | How to generate lyrics in the style of your favorite artist by fine-tuning a GPT-2 model | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) |
| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | How to train T5 on SQUAD with Transformers and Nlp | [Suraj Patil](https://github.com/patil-suraj) |[](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) |
| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) |
| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | How to fine-tune the DialoGPT model on a new dataset for open-dialog conversational chatbots | [Nathan Cooper](https://github.com/ncoop57) | [](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) |
@ -41,6 +42,7 @@ This page regroups resources around 🤗 Transformers developed by the community
|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | How to fine-tune an ALBERT model or another BERT-based model for the sentence-pair classification task | [Nadir El Manouzi](https://github.com/NadirEM) | [](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)|
|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | How to fine-tune a Roberta model for sentiment analysis | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)|
|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | How accurate are the answers to questions generated by your seq2seq transformer model? | [Pascal Zoleko](https://github.com/zolekode) | [](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)|
|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | How to warm-start a *EncoderDecoderModel* with a *google-bert/bert-base-uncased* checkpoint for summarization on CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | How to warm-start a shared *EncoderDecoderModel* with a *FacebookAI/roberta-base* checkpoint for summarization on BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
|
@ -16,15 +16,18 @@ rendered properly in your Markdown viewer.

# Chat basics
Chat models are conversational models you can send a message to and receive a response. Most language models from mid-2023 onwards are chat models and may be referred to as "instruct" or "instruction-tuned" models. Models that do not support chat are often referred to as "base" or "pretrained" models.

Larger and newer models are generally more capable, but models specialized in certain domains (medical, legal text, non-English languages, etc.) can often outperform these larger models. Try leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [LMSys Chatbot Arena](https://chat.lmsys.org/?leaderboard) to help you identify the best model for your use case.

This guide shows you how to quickly load chat models in Transformers from the command line, how to build and format a conversation, and how to chat using the [`TextGenerationPipeline`].

## chat CLI

After you've [installed Transformers](./installation), you can chat with a model directly from the command line. The command below launches an interactive session with a model, with a few base commands listed at the start of the session.
```bash
transformers chat Qwen/Qwen2.5-0.5B-Instruct
@ -46,61 +49,91 @@ For a full list of options, run the command below.
transformers chat -h
```

The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating). It uses the `transformers serve` CLI under the hood ([docs](./serving.md#serve-cli)).

## TextGenerationPipeline

[`TextGenerationPipeline`] is a high-level text generation class with a "chat mode". Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).

Chat models accept a list of messages (the chat history) as the input. Each message is a dictionary with `role` and `content` keys. To start the chat, add a single `user` message. You can also optionally include a `system` message to give the model directions on how to behave.

```py
chat = [
    {"role": "system", "content": "You are a helpful science assistant."},
    {"role": "user", "content": "Hey, can you explain gravity to me?"}
]
```

Create the [`TextGenerationPipeline`] and pass `chat` to it. For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available.

```py
import torch
from transformers import pipeline

pipeline = pipeline(task="text-generation", model="HuggingFaceTB/SmolLM2-1.7B-Instruct", dtype="auto", device_map="auto")
response = pipeline(chat, max_new_tokens=512)
print(response[0]["generated_text"][-1]["content"])
```
If this works successfully, you should see a response from the model! If you want to continue the conversation, you need to update the chat history with the model's response. You can do this either by appending the text to `chat` (use the `assistant` role), or by reading `response[0]["generated_text"]`, which contains the full chat history, including the most recent response.

Once you have the model's response, you can continue the conversation by appending a new `user` message to the chat history.

```py
chat = response[0]["generated_text"]
chat.append(
    {"role": "user", "content": "Woah! But can it be reconciled with quantum mechanics?"}
)
response = pipeline(chat, max_new_tokens=512)
print(response[0]["generated_text"][-1]["content"])
```
By repeating this process, you can continue the conversation as long as you like, at least until the model runs out of context window or you run out of memory.

## Performance and memory usage

Transformers load models in full `float32` precision by default, and for an 8B model, this requires ~32GB of memory! Use the `torch_dtype="auto"` argument, which generally uses `bfloat16` for models that were trained with it, to reduce your memory usage.
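
For example, a minimal sketch using the same checkpoint as above:

```py
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    torch_dtype="auto",   # use the dtype the checkpoint was saved in (usually bfloat16)
    device_map="auto",
)
```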
|
But, hey, you're not alone, pal. I mean, I'm a robot, and even I don't get it. (winks)
|
||||||
|
But, hey, that's what makes art, art, right? (laughs)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Performance
|
||||||
|
|
||||||
|
Transformers load models in full precision by default, and for a 8B model, this requires ~32GB of memory! Reduce memory usage by loading a model in half-precision or bfloat16 (only uses ~2 bytes per parameter). You can even quantize the model to a lower precision like 8-bit or 4-bit with [bitsandbytes](https://hf.co/docs/bitsandbytes/index).
|
||||||
|
|
||||||
> [!TIP]
> Refer to the [Quantization](./quantization/overview) docs for more information about the different quantization backends available.

To lower memory usage even further, you can quantize the model to 8-bit or 4-bit with [bitsandbytes](https://hf.co/docs/bitsandbytes/index). Create a [`BitsAndBytesConfig`] with your desired quantization settings and pass it to the pipeline's `model_kwargs` parameter. The example below quantizes a model to 8-bit.

```py
from transformers import pipeline, BitsAndBytesConfig

@ -109,10 +142,19 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config})
```

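If 8-bit is still too large, a 4-bit configuration works the same way. A minimal sketch, where the NF4 settings are one common choice rather than a requirement:

```py
import torch
from transformers import pipeline, BitsAndBytesConfig

# 4-bit quantization roughly halves weight memory again compared to 8-bit.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
pipeline = pipeline(
    task="text-generation",
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="auto",
    model_kwargs={"quantization_config": quantization_config},
)
```
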
In general, model size and performance are directly correlated. Larger models are slower in addition to requiring more memory because each active parameter must be read from memory for every generated token. This is a bottleneck for LLM text generation and the main options for improving generation speed are to either quantize a model or use hardware with higher memory bandwidth. Adding more compute power doesn't meaningfully help.

You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token at a time. This significantly alleviates the bandwidth bottleneck and improves generation speed.

In general, larger models are slower in addition to requiring more memory because text generation is bottlenecked by **memory bandwidth** instead of compute power. Each active parameter must be read from memory for every generated token. For a 16GB model, 16GB must be read from memory for every generated token.

The number of generated tokens/sec is proportional to the total memory bandwidth of the system divided by the model size. Depending on your hardware, total memory bandwidth can vary. Refer to the table below for approximate generation speeds for different hardware types.

| Hardware | Memory bandwidth |
|---|---|
| consumer CPU | 20-100GB/sec |
| specialized CPU (Intel Xeon, AMD Threadripper/Epyc, Apple silicon) | 200-900GB/sec |
| data center GPU (NVIDIA A100/H100) | 2-3TB/sec |

The easiest solution for improving generation speed is to either quantize a model or use hardware with higher memory bandwidth.

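As a back-of-the-envelope sketch of that proportionality (the numbers are illustrative assumptions, not benchmarks):

```py
# Upper bound on decoding speed: every generated token requires reading
# all active parameters from memory once.
model_size_gb = 16            # e.g. an 8B model stored in bfloat16 (~2 bytes per parameter)
memory_bandwidth_gb_s = 2000  # e.g. a data center GPU with ~2TB/sec

max_tokens_per_sec = memory_bandwidth_gb_s / model_size_gb
print(f"~{max_tokens_per_sec:.0f} tokens/sec upper bound")  # ~125 tokens/sec
```
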
You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token per `forward` pass. This significantly alleviates the bandwidth bottleneck and improves generation speed.

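A minimal sketch using the `assistant_model` argument of `generate()`; the draft checkpoint below is an illustrative choice, and any smaller model sharing the main model's tokenizer can play that role:

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", torch_dtype=torch.bfloat16
)
# A much smaller model from the same family drafts candidate tokens.
assistant = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct", device_map="auto", torch_dtype=torch.bfloat16
)

inputs = tokenizer("The key to making a good soup is", return_tensors="pt").to(model.device)
# Passing `assistant_model` enables assisted (speculative) decoding.
outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
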
> [!TIP]
> Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe.md), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token.

Mixture-of-Expert (MoE) models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe), and [GPT-OSS](./model_doc/gpt-oss) have lots of parameters, but only "activate" a small fraction of them to generate each token. As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because more parameters become activated with each new speculated token.

@ -1,42 +0,0 @@

# Using Cursor as a client of transformers serve

This example shows how to use `transformers serve` as a local LLM provider for [Cursor](https://cursor.com/), the popular IDE. In this particular case, requests to `transformers serve` will come from an external IP (Cursor's server IPs), which requires some additional setup. Furthermore, some of Cursor's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons.

To launch a server with CORS enabled, run

```shell
transformers serve --enable-cors
```

You'll also need to expose your server to external IPs. A potential solution is to use [`ngrok`](https://ngrok.com/), which has a permissive free tier. After setting up your `ngrok` account and authenticating on your server machine, you run

```shell
ngrok http [port]
```

where `port` is the port used by `transformers serve` (`8000` by default). On the terminal where you launched `ngrok`, you'll see an https address in the "Forwarding" row, as in the image below. This is the address to send requests to.

<h3 align="center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_ngrok.png"/>
</h3>

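Before wiring up Cursor, you can optionally sanity-check the tunnel with a quick request. A sketch, assuming the server exposes the OpenAI-compatible `/v1/models` route; replace the placeholder with your own ngrok "Forwarding" address:

```py
import requests

# Replace with the https "Forwarding" address printed by ngrok.
base_url = "https://YOUR-SUBDOMAIN.ngrok-free.app"

# Listing the available models is a cheap way to confirm the tunnel and the server are up.
response = requests.get(f"{base_url}/v1/models", timeout=10)
print(response.status_code, response.json())
```
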
You're now ready to set things up on the app side! In Cursor, while you can't set a new provider, you can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set your `transformers serve` endpoint, follow this order:

1. Unselect ALL models in the list above (e.g. `gpt4`, ...);
2. Add and select the model you want to use (e.g. `Qwen/Qwen3-4B`);
3. Add some random text to OpenAI API Key. This field won't be used, but it can’t be empty;
4. Add the https address from `ngrok` to the "Override OpenAI Base URL" field, appending `/v1` to the address (i.e. `https://(...).ngrok-free.app/v1`);
5. Hit "Verify".

After you follow these steps, your "Models" tab should look like the image below. Your server should also have received a few requests from the verification step.

<h3 align="center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor.png"/>
</h3>

You are now ready to use your local model in Cursor! For instance, if you toggle the AI Pane, you can select the model you added and ask it questions about your local files.

<h3 align="center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor_chat.png"/>
</h3>

@ -271,7 +271,7 @@ The model is ready to be pushed to the Hub now. Log in to your Hugging Face acco

<hfoption id="huggingface-CLI">

```bash
hf auth login
huggingface-cli login
```

</hfoption>

@ -260,7 +260,7 @@ with deepspeed.zero.Init():

The DeepSpeed config file needs `is_deepspeed_zero3_enabled: true` set in [`TrainingArguments`] and a ZeRO configuration enabled. The [`TrainingArguments`] object must be created **before** calling [`~PreTrainedModel.from_pretrained`].

> [!TIP]
> You'll need ZeRO-3 when the fp16 weights don't fit on a single GPU. But if you're able to load the fp16 weights, set `dtype=torch.float16` in [`~PreTrainedModel.from_pretrained`].
> You'll need ZeRO-3 when the fp16 weights don't fit on a single GPU. But if you're able to load the fp16 weights, set `torch_dtype=torch.float16` in [`~PreTrainedModel.from_pretrained`].

```py
from transformers import AutoModel, Trainer, TrainingArguments

@ -38,7 +38,7 @@ generation_config = GenerationConfig(
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", pad_token="</s>", padding_side="right")
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", device_map="auto", dtype=torch.bfloat16, attn_implementation="sdpa", generation_config=generation_config)
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="sdpa", generation_config=generation_config)

exported_program = convert_and_export_with_cache(model)
```