Mirror of https://github.com/huggingface/transformers.git (synced 2025-11-12 01:04:36 +08:00)

Compare commits: v4.36.0 ... refactor-a (105 commits)
| SHA1 | Author | Date | |
|---|---|---|---|
| a446a09318 | |||
| 5d6d031e9f | |||
| 00917b20fb | |||
| 5580e29a77 | |||
| 5d51dc01b5 | |||
| 1e323f80a0 | |||
| 53888f8461 | |||
| 1dc571d695 | |||
| 5bb4662054 | |||
| 23f8e4db77 | |||
| 4a04b4ccca | |||
| 98c90805e2 | |||
| d276285237 | |||
| e63bf49f5d | |||
| 24e07d4833 | |||
| ac974199c8 | |||
| bd7a356135 | |||
| e126711e01 | |||
| 3dc777a696 | |||
| 5aec50ecaf | |||
| d5ace03056 | |||
| db35c83e14 | |||
| a9734d7ae9 | |||
| 7bf59622db | |||
| 251887604e | |||
| 4edffda636 | |||
| c52b515e94 | |||
| a52e180a0f | |||
| 08a6e7a702 | |||
| 71d47f0ad4 | |||
| 0695b2421a | |||
| 7c5408dade | |||
| a0522de497 | |||
| e6cb8e052a | |||
| 7f2a8f92e4 | |||
| b8378b658e | |||
| e6dcf8abd6 | |||
| f85a1e82c1 | |||
| 238d2e3c44 | |||
| ebfdb9ca62 | |||
| 0d63d17765 | |||
| 1faeff85ce | |||
| ffa04def0e | |||
| 29a1c1b472 | |||
| 26ea725bc0 | |||
| 1c286be508 | |||
| dec84b3211 | |||
| 74cae670ce | |||
| e2b6df7971 | |||
| deb72cb6d9 | |||
| d269c4b2d7 | |||
| 70a127a37a | |||
| c817c17dbe | |||
| 6af3ce7757 | |||
| 7e876dca54 | |||
| e737446ee6 | |||
| 1e20931765 | |||
| 1a585c1222 | |||
| 3060899be5 | |||
| 050e0b44f6 | |||
| 52c37882fb | |||
| 388fd314d8 | |||
| 0ede762636 | |||
| bb1d0d0d9e | |||
| e2b16485f3 | |||
| 9e5c28c573 | |||
| dde6c427a1 | |||
| 73de5108e1 | |||
| 2788f8d8d5 | |||
| 131a528be0 | |||
| 17506d1256 | |||
| fe44b1f1a9 | |||
| 3ed3e3190c | |||
| 815ea8e8a2 | |||
| 93766251cb | |||
| ec43d6870a | |||
| 749f94e460 | |||
| c7f076a00e | |||
| 371fb0b7dc | |||
| 230ac352d8 | |||
| f4db565b69 | |||
| 9936143014 | |||
| 78172dcdb7 | |||
| 5e4ef0a0f6 | |||
| a49f4acab3 | |||
| 680c610f97 | |||
| 4b759da8be | |||
| e660424717 | |||
| e5079b0b2a | |||
| 35478182ce | |||
| 67b1335cb9 | |||
| 54d0b1c278 | |||
| 4850aaba6f | |||
| 4b4b864224 | |||
| c0a354d8d7 | |||
| 7e35f37071 | |||
| 39acfe84ba | |||
| 0f59d2f173 | |||
| 417bb91484 | |||
| 5cec306cdc | |||
| 921a6bf26e | |||
| 44127ec667 | |||
| b911c1f10f | |||
| e49c385266 | |||
| 6ff109227b |
.github/workflows/build-docker-images.yml (vendored, 36 changed lines)
@@ -271,3 +271,39 @@ jobs:
REF=main
push: true
tags: huggingface/transformers-tensorflow-gpu

# latest-pytorch-deepspeed-amd:
# name: "PyTorch + DeepSpeed (AMD) [dev]"

# runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
# steps:
# - name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# - name: Check out code
# uses: actions/checkout@v3
# - name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# - name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-deepspeed-amd-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
# # Push CI images still need to be re-built daily
# -
# name: Build and push (for Push CI) on a daily basis
# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
# if: inputs.image_postfix != '-push-ci'
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-deepspeed-amd-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci

.github/workflows/self-nightly-scheduled.yml (vendored, 4 changed lines)
@@ -212,7 +212,7 @@ jobs:
python3 -m pip uninstall -y deepspeed
rm -rf DeepSpeed
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

- name: NVIDIA-SMI
run: |
@@ -286,4 +286,4 @@ jobs:
with:
name: |
single-*
multi-*
multi-*

.github/workflows/self-past.yml (vendored, 4 changed lines)
@@ -267,7 +267,7 @@ jobs:
python3 -m pip uninstall -y deepspeed
rm -rf DeepSpeed
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

- name: NVIDIA-SMI
run: |
@@ -353,4 +353,4 @@ jobs:
with:
name: |
single-*
multi-*
multi-*

@@ -18,7 +18,7 @@ on:
jobs:
run_amd_ci:
name: AMD mi210
if: (cancelled() != true) && ((github.event_name == 'push') && (github.ref_name == 'main' || startsWith(github.ref_name, 'run_amd_push_ci_caller')))
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi210

@@ -18,7 +18,7 @@ on:
jobs:
run_amd_ci:
name: AMD mi250
if: (cancelled() != true) && ((github.event_name == 'push') && (github.ref_name == 'main' || startsWith(github.ref_name, 'run_amd_push_ci_caller')))
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi250

.github/workflows/self-push.yml (vendored, 4 changed lines)
@@ -366,7 +366,7 @@ jobs:
working-directory: /workspace
run: |
python3 -m pip uninstall -y deepspeed
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

- name: NVIDIA-SMI
run: |
@@ -456,7 +456,7 @@ jobs:
working-directory: /workspace
run: |
python3 -m pip uninstall -y deepspeed
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

- name: NVIDIA-SMI
run: |

@@ -12,7 +12,7 @@ on:
jobs:
run_amd_ci:
name: AMD mi210
if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_scheduled_ci_caller')))
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_scheduled_ci_caller')))
uses: ./.github/workflows/self-scheduled-amd.yml
with:
gpu_flavor: mi210

@@ -12,7 +12,7 @@ on:
jobs:
run_amd_ci:
name: AMD mi250
if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_scheduled_ci_caller')))
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_scheduled_ci_caller')))
uses: ./.github/workflows/self-scheduled-amd.yml
with:
gpu_flavor: mi250

.github/workflows/self-scheduled-amd.yml (vendored, 61 changed lines)
@@ -356,6 +356,63 @@ jobs:
name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu

run_tests_torch_deepspeed_gpu:
name: Torch ROCm deepspeed tests
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]

runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
needs: setup
container:
image: huggingface/transformers-pytorch-deepspeed-amd-gpu
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

- name: ROCM-SMI
run: |
rocm-smi
- name: ROCM-INFO
run: |
rocminfo | grep "Agent" -A 14

- name: Show ROCR environment
run: |
echo "ROCR: $ROCR_VISIBLE_DEVICES"

- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py

- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze

- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_deepspeed_gpu tests/deepspeed tests/extended

- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu/failures_short.txt

- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: ${{ matrix.machine_type }}_run_tests_torch_deepspeed_gpu_test_reports
path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu

run_extract_warnings:
name: Extract warnings in CI artifacts
runs-on: ubuntu-22.04
@@ -368,7 +425,7 @@ jobs:
run_tests_multi_gpu,
run_examples_gpu,
run_pipelines_torch_gpu,
# run_all_tests_torch_cuda_extensions_gpu
run_tests_torch_deepspeed_gpu
]
steps:
- name: Checkout transformers
@@ -417,7 +474,7 @@ jobs:
run_tests_multi_gpu,
run_examples_gpu,
run_pipelines_torch_gpu,
# run_all_tests_torch_cuda_extensions_gpu,
run_tests_torch_deepspeed_gpu,
run_extract_warnings
]
steps:

.github/workflows/self-scheduled.yml (vendored, 2 changed lines)
@@ -366,7 +366,7 @@ jobs:
working-directory: /workspace
run: |
python3 -m pip uninstall -y deepspeed
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

- name: NVIDIA-SMI
run: |

@@ -505,6 +505,7 @@ Current number of checkpoints:
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.

@@ -480,6 +480,7 @@ Número actual de puntos de control:
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.

@@ -454,6 +454,7 @@ conda install -c huggingface transformers
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.

@@ -514,6 +514,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.

@@ -429,6 +429,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.

@@ -453,6 +453,7 @@ conda install -c huggingface transformers
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.

@@ -465,6 +465,7 @@ conda install -c huggingface transformers
1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.

@@ -22,7 +22,11 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+htt

ARG REF=main
WORKDIR /

# Invalidate docker cache from here if new commit is available.
ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]

RUN python3 -m pip uninstall -y tensorflow flax

docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile (new file, 45 lines)
@@ -0,0 +1,45 @@
FROM rocm/dev-ubuntu-22.04:5.6
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive
ARG PYTORCH='2.1.1'
ARG TORCH_VISION='0.16.1'
ARG TORCH_AUDIO='2.1.1'
ARG ROCM='5.6'

RUN apt update && \
apt install -y --no-install-recommends \
libaio-dev \
git \
# These are required to build deepspeed.
python3-dev \
python-is-python3 \
rocrand-dev \
rocthrust-dev \
hipsparse-dev \
hipblas-dev \
rocblas-dev && \
apt clean && \
rm -rf /var/lib/apt/lists/*

RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic<2"
RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir

# Pre-build DeepSpeed, so it's ready for testing (to avoid timeout)
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1

ARG REF=main
WORKDIR /

# Invalidate docker cache from here if new commit is available.
ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece,sklearn]

# When installing in editable mode, `transformers` is not recognized as a package.
# This line must be added in order for Python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

RUN python3 -c "from deepspeed.launcher.runner import main"
@@ -34,7 +34,7 @@ RUN python3 -m pip uninstall -y torch-tensorrt

# recompile apex
RUN python3 -m pip uninstall -y apex
RUN git clone https://github.com/NVIDIA/apex
# RUN git clone https://github.com/NVIDIA/apex
# `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
# TODO: check if there is alternative way to install latest apex
# RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
@@ -44,7 +44,7 @@ RUN python3 -m pip uninstall -y deepspeed
# This has to be run (again) inside the GPU VMs running the tests.
# The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
# TODO: Find out why tests fail.
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

# When installing in editable mode, `transformers` is not recognized as a package.
# This line must be added in order for Python to be aware of transformers.

@@ -135,6 +135,8 @@
title: Overview
- local: quantization
title: Quantization
- local: trainer
title: Trainer
- sections:
- local: perf_train_gpu_one
title: Methods and tools for efficient training on a single GPU
@@ -144,12 +146,10 @@
title: Efficient training on CPU
- local: perf_train_cpu_many
title: Distributed CPU training
- local: perf_train_tpu
title: Training on TPUs
- local: perf_train_tpu_tf
title: Training on TPU with TensorFlow
- local: perf_train_special
title: Training on Specialized Hardware
title: PyTorch training on Apple silicon
- local: perf_hardware
title: Custom hardware for training
- local: hpo_train
@@ -218,6 +218,8 @@
title: Agents and Tools
- local: model_doc/auto
title: Auto Classes
- local: main_classes/backbones
title: Backbones
- local: main_classes/callback
title: Callbacks
- local: main_classes/configuration
@@ -739,6 +741,8 @@
title: TVP
- local: model_doc/vilt
title: ViLT
- local: model_doc/vipllava
title: VipLlava
- local: model_doc/vision-encoder-decoder
title: Vision Encoder Decoder Models
- local: model_doc/vision-text-dual-encoder

@@ -31,6 +31,7 @@ In this tutorial, learn to:
* Load a pretrained feature extractor.
* Load a pretrained processor.
* Load a pretrained model.
* Load a model as a backbone.

## AutoTokenizer

@@ -95,7 +96,7 @@ Load a processor with [`AutoProcessor.from_pretrained`]:

<frameworkcontent>
<pt>
Finally, the `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]:
The `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]:

```py
>>> from transformers import AutoModelForSequenceClassification
@@ -141,3 +142,24 @@ Easily reuse the same checkpoint to load an architecture for a different task:
Generally, we recommend using the `AutoTokenizer` class and the `TFAutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning.
</tf>
</frameworkcontent>

## AutoBackbone

`AutoBackbone` lets you use pretrained models as backbones and get feature maps as outputs from different stages of the models. Below you can see how to get feature maps from a [Swin](model_doc/swin) checkpoint.

```py
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
>>> model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(0,))

>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 96, 56, 56]
```

@@ -100,7 +100,7 @@ reading the whole sentence but using a mask inside the model to hide the future

### channel

Color images are made up of some combination of values in three channels - red, green, and blue (RGB) - and grayscale images only have one channel. In 🤗 Transformers, the channel can be the first or last dimension of an image's tensor: [`n_channels`, `height`, `width`] or [`height`, `width`, `n_channels`].
Color images are made up of some combination of values in three channels: red, green, and blue (RGB) and grayscale images only have one channel. In 🤗 Transformers, the channel can be the first or last dimension of an image's tensor: [`n_channels`, `height`, `width`] or [`height`, `width`, `n_channels`].
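As a quick illustration of the two layouts (a minimal sketch using PyTorch tensors; the sizes are arbitrary):

```python
>>> import torch

>>> image_hwc = torch.rand(224, 224, 3)  # [`height`, `width`, `n_channels`]
>>> image_chw = image_hwc.permute(2, 0, 1)  # [`n_channels`, `height`, `width`]
>>> list(image_chw.shape)
[3, 224, 224]
```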

### connectionist temporal classification (CTC)

@@ -116,6 +116,7 @@ A type of layer in a neural network where the input matrix is multiplied element

Parallelism technique for training on multiple GPUs where the same setup is replicated multiple times, with each instance
receiving a distinct data slice. The processing is done in parallel and all setups are synchronized at the end of each training step.

Learn more about how DataParallel works [here](perf_train_gpu_many#dataparallel-vs-distributeddataparallel).
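A minimal sketch of the idea in plain PyTorch (using `torch.nn.DataParallel`; the module and batch are only illustrative):

```python
>>> import torch
>>> from torch import nn

>>> model = nn.Linear(10, 2)  # any model
>>> if torch.cuda.device_count() > 1:
...     model = nn.DataParallel(model)  # replicate the module and split each batch across the visible GPUs
>>> outputs = model(torch.rand(8, 10))  # the batch dimension is sliced across the replicas
```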

### decoder input IDs
@@ -165,8 +166,7 @@ embeddings `[batch_size, sequence_length, config.intermediate_size]` can account
use. The authors of [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) noticed that since the
computation is independent of the `sequence_length` dimension, it is mathematically equivalent to compute the output
embeddings of both feed forward layers `[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n`
individually and concat them afterward to `[batch_size, sequence_length, config.hidden_size]` with `n =
sequence_length`, which trades increased computation time against reduced memory use, but yields a mathematically
individually and concat them afterward to `[batch_size, sequence_length, config.hidden_size]` with `n = sequence_length`, which trades increased computation time against reduced memory use, but yields a mathematically
**equivalent** result.

For models employing the function [`apply_chunking_to_forward`], the `chunk_size` defines the number of output
@@ -187,7 +187,7 @@ The model head refers to the last layer of a neural network that accepts the raw

* [`GPT2ForSequenceClassification`] is a sequence classification head - a linear layer - on top of the base [`GPT2Model`].
* [`ViTForImageClassification`] is an image classification head - a linear layer on top of the final hidden state of the `CLS` token - on top of the base [`ViTModel`].
* [`Wav2Vec2ForCTC`] ia a language modeling head with [CTC](#connectionist-temporal-classification-(CTC)) on top of the base [`Wav2Vec2Model`].
* [`Wav2Vec2ForCTC`] is a language modeling head with [CTC](#connectionist-temporal-classification-(CTC)) on top of the base [`Wav2Vec2Model`].
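For example, the same checkpoint can be loaded without a head or with a task head on top (a small sketch using the Auto classes; the checkpoint and number of labels are only illustrative):

```python
>>> from transformers import AutoModel, AutoModelForSequenceClassification

>>> base_model = AutoModel.from_pretrained("bert-base-uncased")  # base model only, returns hidden states
>>> classifier = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)  # adds a classification head
```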

## I

@@ -232,9 +232,7 @@ is added for "RA" and "M":
['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
```

These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
the sentence to the tokenizer, which leverages the Rust implementation of [🤗
Tokenizers](https://github.com/huggingface/tokenizers) for peak performance.
These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding the sentence to the tokenizer, which leverages the Rust implementation of [🤗 Tokenizers](https://github.com/huggingface/tokenizers) for peak performance.

```python
>>> inputs = tokenizer(sequence)
@@ -383,7 +381,7 @@ self-supervised objective, which can be reading the text and trying to predict t
modeling](#causal-language-modeling)) or masking some words and trying to predict them (see [masked language
modeling](#masked-language-modeling-mlm)).

Speech and vision models have their own pretraining objectives. For example, Wav2Vec2 is a speech model pretrained on a contrastive task which requires the model to identify the "true" speech representation from a set of "false" speech representations. On the other hand, BEiT is a vision model pretrained on a masked image modeling task which masks some of the image patches and requires the model to predict the masked patches (similar to the masked language modeling objective).
Speech and vision models have their own pretraining objectives. For example, Wav2Vec2 is a speech model pretrained on a contrastive task which requires the model to identify the "true" speech representation from a set of "false" speech representations. On the other hand, BEiT is a vision model pretrained on a masked image modeling task which masks some of the image patches and requires the model to predict the masked patches (similar to the masked language modeling objective).

## R

@@ -518,7 +516,7 @@ A form of model training in which data provided to the model is not labeled. Uns

### Zero Redundancy Optimizer (ZeRO)

Parallelism technique which performs sharding of the tensors somewhat similar to [TensorParallel](#tensorparallel--tp-),
Parallelism technique which performs sharding of the tensors somewhat similar to [TensorParallel](#tensor-parallelism-tp),
except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need
to be modified. This method also supports various offloading techniques to compensate for limited GPU memory.
Learn more about ZeRO [here](perf_train_gpu_many#zero-data-parallelism).
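As a minimal sketch of how ZeRO is commonly enabled from 🤗 Transformers (assuming DeepSpeed is installed; the stage and offload values below are only illustrative), a DeepSpeed configuration can be passed to [`TrainingArguments`] either as a file path or as a dict:

```python
from transformers import TrainingArguments

# Illustrative DeepSpeed config enabling ZeRO stage 2 with optimizer-state offload to CPU.
ds_config = {
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu"},
    },
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
}

training_args = TrainingArguments(output_dir="output_dir", deepspeed=ds_config)
```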

@@ -280,6 +280,7 @@ Flax), PyTorch, and/or TensorFlow.
| [VAN](model_doc/van) | ✅ | ❌ | ❌ |
| [VideoMAE](model_doc/videomae) | ✅ | ❌ | ❌ |
| [ViLT](model_doc/vilt) | ✅ | ❌ | ❌ |
| [VipLlava](model_doc/vipllava) | ✅ | ❌ | ❌ |
| [Vision Encoder decoder](model_doc/vision-encoder-decoder) | ✅ | ✅ | ✅ |
| [VisionTextDualEncoder](model_doc/vision-text-dual-encoder) | ✅ | ✅ | ✅ |
| [VisualBERT](model_doc/visual_bert) | ✅ | ❌ | ❌ |

docs/source/en/main_classes/backbones.md (new file, 93 lines)
@@ -0,0 +1,93 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Backbones

Backbones are models used for feature extraction for computer vision tasks. One can use a model as a backbone in two ways:

* initializing the `AutoBackbone` class with a pretrained model,
* initializing a supported backbone configuration and passing it to the model architecture.

## Using AutoBackbone

You can use the `AutoBackbone` class to initialize a model as a backbone and get the feature maps for any stage. You can define `out_indices` to indicate the indices of the layers you would like to get feature maps from. You can also use `out_features` if you know the names of the layers. You can use them interchangeably. If you are using both `out_indices` and `out_features`, ensure they are consistent. Not passing any of the feature map arguments will make the backbone yield the feature maps of the last layer.
To visualize how the stages look, let's take the Swin model. Each stage is responsible for feature extraction, outputting feature maps.
<div style="text-align: center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stages.png">
</div>

The feature maps of the first stage are illustrated below.
<div style="text-align: center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stage%201.png">
</div>

Let's see this with an example. Note that `out_indices=(0,)` yields the stem of the model. The stem is the stage before the first feature extraction stage; in the above diagram, it refers to the patch partition. We would like to have the feature maps from the stem and from the first and second stages of the model.
```py
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests

>>> processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
>>> model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(0,1,2))
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
```
The `feature_maps` object now has three feature maps, each of which can be accessed as shown below. Say we would like to get the feature map of the stem.
```python
>>> list(feature_maps[0].shape)
[1, 96, 56, 56]
```

We can get the feature maps of the first and second stages as shown below.
```python
>>> list(feature_maps[1].shape)
[1, 96, 56, 56]
>>> list(feature_maps[2].shape)
[1, 192, 28, 28]
```

## Initializing Backbone Configuration

In computer vision, models consist of a backbone, a neck, and a head. The backbone extracts the features, the neck transforms the output of the backbone, and the head is used for the main task (e.g. object detection). You can initialize the neck and head with a model backbone by passing a model configuration to `backbone_config`. For example, below you can see how to initialize the [MaskFormer](../model_doc/maskformer) model with an instance segmentation head and a [ResNet](../model_doc/resnet) backbone.

```py
from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig

backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50")
config = MaskFormerConfig(backbone_config=backbone_config)
model = MaskFormerForInstanceSegmentation(config)
```
You can also initialize a backbone with random weights to initialize the model neck with it.

```py
backbone_config = ResNetConfig()
config = MaskFormerConfig(backbone_config=backbone_config)
model = MaskFormerForInstanceSegmentation(config)
```

`timm` models are also supported in transformers through `TimmBackbone` and `TimmBackboneConfig`.

```python
from transformers import TimmBackboneConfig, TimmBackbone

backbone_config = TimmBackboneConfig("resnet50")
model = TimmBackbone(config=backbone_config)
```

@@ -400,12 +400,6 @@ Pipelines available for natural language processing tasks include the following.
- __call__
- all

### NerPipeline

[[autodoc]] NerPipeline

See [`TokenClassificationPipeline`] for all details.

### QuestionAnsweringPipeline

[[autodoc]] QuestionAnsweringPipeline

@@ -16,70 +16,23 @@ rendered properly in your Markdown viewer.

# Trainer

The [`Trainer`] class provides an API for feature-complete training in PyTorch for most standard use cases. It's used in most of the [example scripts](https://github.com/huggingface/transformers/tree/main/examples).
The [`Trainer`] class provides an API for feature-complete training in PyTorch, and it supports distributed training on multiple GPUs/TPUs, mixed precision for [NVIDIA GPUs](https://nvidia.github.io/apex/), [AMD GPUs](https://rocm.docs.amd.com/en/latest/rocm.html), and [`torch.amp`](https://pytorch.org/docs/stable/amp.html) for PyTorch. [`Trainer`] goes hand-in-hand with the [`TrainingArguments`] class, which offers a wide range of options to customize how a model is trained. Together, these two classes provide a complete training API.

<Tip>

If you're looking to fine-tune a language model like Llama-2 or Mistral on a text dataset using autoregressive techniques, consider using [`trl`](https://github.com/huggingface/trl)'s [`~trl.SFTTrainer`]. The [`~trl.SFTTrainer`] wraps the [`Trainer`] and is specially optimized for this particular task and supports sequence packing, LoRA, quantization, and DeepSpeed for efficient scaling to any model size. On the other hand, the [`Trainer`] is a more versatile option, suitable for a broader spectrum of tasks.

</Tip>

Before instantiating your [`Trainer`], create a [`TrainingArguments`] to access all the points of customization during training.

The API supports distributed training on multiple GPUs/TPUs, mixed precision through [NVIDIA Apex](https://github.com/NVIDIA/apex) for NVIDIA GPUs, [ROCm APEX](https://github.com/ROCmSoftwarePlatform/apex) for AMD GPUs, and Native AMP for PyTorch.

The [`Trainer`] contains the basic training loop which supports the above features. To inject custom behavior you can subclass it and override the following methods:

- **get_train_dataloader** -- Creates the training DataLoader.
- **get_eval_dataloader** -- Creates the evaluation DataLoader.
- **get_test_dataloader** -- Creates the test DataLoader.
- **log** -- Logs information on the various objects watching training.
- **create_optimizer_and_scheduler** -- Sets up the optimizer and learning rate scheduler if they were not passed at
  init. Note that you can also subclass or override the `create_optimizer` and `create_scheduler` methods
  separately.
- **create_optimizer** -- Sets up the optimizer if it wasn't passed at init.
- **create_scheduler** -- Sets up the learning rate scheduler if it wasn't passed at init.
- **compute_loss** -- Computes the loss on a batch of training inputs.
- **training_step** -- Performs a training step.
- **prediction_step** -- Performs an evaluation/test step.
- **evaluate** -- Runs an evaluation loop and returns metrics.
- **predict** -- Returns predictions (with metrics if labels are available) on a test set.

[`Seq2SeqTrainer`] and [`Seq2SeqTrainingArguments`] inherit from the [`Trainer`] and [`TrainingArguments`] classes and they're adapted for training models for sequence-to-sequence tasks such as summarization or translation.

<Tip warning={true}>

The [`Trainer`] class is optimized for 🤗 Transformers models and can have surprising behaviors
when you use it on other models. When using it on your own model, make sure:
when used with other models. When using it with your own model, make sure:

- your model always returns tuples or subclasses of [`~utils.ModelOutput`].
- your model always returns tuples or subclasses of [`~utils.ModelOutput`]
- your model can compute the loss if a `labels` argument is provided and that loss is returned as the first
  element of the tuple (if your model returns tuples)
- your model can accept multiple label arguments (use the `label_names` in your [`TrainingArguments`] to indicate their name to the [`Trainer`]) but none of them should be named `"label"`.
- your model can accept multiple label arguments (use `label_names` in [`TrainingArguments`] to indicate their name to the [`Trainer`]) but none of them should be named `"label"`

</Tip>

Here is an example of how to customize [`Trainer`] to use a weighted loss (useful when you have an unbalanced training set):

```python
import torch
from torch import nn
from transformers import Trainer


class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
```

Another way to customize the training loop behavior for the PyTorch [`Trainer`] is to use [callbacks](callback) that can inspect the training loop state (for progress reporting, logging on TensorBoard or other ML platforms...) and make decisions (like early stopping).
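For instance, here is a minimal callback sketch (assuming `model`, `training_args`, and `train_dataset` are already defined; the callback itself is only illustrative):

```python
from transformers import Trainer, TrainerCallback


class PrintLossCallback(TrainerCallback):
    # `on_log` is called every time the Trainer logs; `logs` holds the metrics reported at that step.
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None and "loss" in logs:
            print(f"step {state.global_step}: loss = {logs['loss']}")


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    callbacks=[PrintLossCallback()],
)
```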

## Trainer
## Trainer[[api-reference]]

[[autodoc]] Trainer
- all
@@ -100,105 +53,6 @@ Another way to customize the training loop behavior for the PyTorch [`Trainer`]
[[autodoc]] Seq2SeqTrainingArguments
- all

## Checkpoints

By default, [`Trainer`] will save all checkpoints in the `output_dir` you set in the
[`TrainingArguments`] you are using. Those will go in a subfolder named `checkpoint-xxx`, with xxx
being the step at which the training was.

Resuming training from a checkpoint can be done when calling [`Trainer.train`] with either of the following (see the sketch after this list):

- `resume_from_checkpoint=True` which will resume training from the latest checkpoint
- `resume_from_checkpoint=checkpoint_dir` which will resume training from the specific checkpoint in the directory
  passed.
|
||||
|
||||
In addition, you can easily save your checkpoints on the Model Hub when using `push_to_hub=True`. By default, all
the models saved in intermediate checkpoints are saved in different commits, but not the optimizer state. You can adapt
the `hub_strategy` value of your [`TrainingArguments`] to either of the following (a short example follows the list below):
|
||||
|
||||
- `"checkpoint"`: the latest checkpoint is also pushed in a subfolder named `last-checkpoint`, allowing you to
  resume training easily with `trainer.train(resume_from_checkpoint="output_dir/last-checkpoint")`.
- `"all_checkpoints"`: all checkpoints are pushed as they appear in the output folder (so you will end up with one
  checkpoint folder per folder in your final repository)
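As a short sketch of how the Hub checkpointing behaviour and checkpoint resuming can be combined (`output_dir` is a placeholder):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="output_dir",
    push_to_hub=True,
    hub_strategy="checkpoint",  # also push the latest checkpoint to a `last-checkpoint` subfolder
)
# later, resume from the checkpoint that was pushed alongside the model:
# trainer.train(resume_from_checkpoint="output_dir/last-checkpoint")
```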
|
||||
|
||||
|
||||
## Logging
|
||||
|
||||
By default, [`Trainer`] uses `logging.INFO` for the main process and `logging.WARNING` for the replicas, if any.
|
||||
|
||||
These defaults can be overridden to use any of the 5 `logging` levels with [`TrainingArguments`]'s
|
||||
arguments:
|
||||
|
||||
- `log_level` - for the main process
|
||||
- `log_level_replica` - for the replicas
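For example, both levels can be set directly when constructing [`TrainingArguments`] (a minimal sketch; `output_dir` is a placeholder):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    log_level="warning",        # main process
    log_level_replica="error",  # replicas
)
```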
|
||||
|
||||
Further, if [`TrainingArguments`]'s `log_on_each_node` is set to `False`, only the main node will
use the log level settings for its main process; all other nodes will use the log level settings for replicas.
|
||||
|
||||
Note that [`Trainer`] is going to set `transformers`'s log level separately for each node in its
|
||||
[`Trainer.__init__`]. So you may want to set this sooner (see the next example) if you tap into other
|
||||
`transformers` functionality before creating the [`Trainer`] object.
|
||||
|
||||
Here is an example of how this can be used in an application:
|
||||
|
||||
```python
import logging
import sys

import datasets
import transformers

[...]  # rest of your application imports and setup

logger = logging.getLogger(__name__)

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

# set the main code and the modules it uses to the same log-level according to the node
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)

trainer = Trainer(...)
```
|
||||
|
||||
Then, if you only want to see warnings on the main node and want all other nodes to suppress their (most likely duplicated)
warnings, you could run it as:
|
||||
|
||||
```bash
|
||||
my_app.py ... --log_level warning --log_level_replica error
|
||||
```
|
||||
|
||||
In a multi-node environment, if you also don't want the logs to repeat for each node's main process, change the
above to:
|
||||
|
||||
```bash
|
||||
my_app.py ... --log_level warning --log_level_replica error --log_on_each_node 0
|
||||
```
|
||||
|
||||
and then only the main process of the first node will log at the "warning" level, and all other processes on the main
|
||||
node and all processes on other nodes will log at the "error" level.
|
||||
|
||||
If you need your application to be as quiet as possible you could do:
|
||||
|
||||
```bash
|
||||
my_app.py ... --log_level error --log_level_replica error --log_on_each_node 0
|
||||
```
|
||||
|
||||
(the `--log_on_each_node 0` flag is only needed in a multi-node environment)
|
||||
|
||||
|
||||
## Randomness
|
||||
|
||||
When resuming from a checkpoint generated by [`Trainer`] all efforts are made to restore the
|
||||
_python_, _numpy_ and _pytorch_ RNG states to the same states as they were at the moment of saving that checkpoint,
|
||||
which should make the "stop and resume" style of training as close as possible to non-stop training.
|
||||
|
||||
However, due to various non-deterministic PyTorch defaults, this might not fully work. If you want full
determinism, please refer to [Controlling sources of randomness](https://pytorch.org/docs/stable/notes/randomness). As explained in that document, some of the settings
that make things deterministic (e.g., `torch.backends.cudnn.deterministic`) may slow things down, so this
can't be done by default, but you can enable those yourself if needed.
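As a minimal sketch of what opting into determinism looks like on the PyTorch side (this trades speed for reproducibility and assumes the operations you use have deterministic implementations):

```python
import torch

# trade speed for reproducibility
torch.use_deterministic_algorithms(True)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
```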
|
||||
|
||||
|
||||
## Specific GPUs Selection
|
||||
|
||||
Let's discuss how you can tell your program which GPUs are to be used and in what order.
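For example, restricting a run to the first and third GPU (in that order) is typically done through the `CUDA_VISIBLE_DEVICES` environment variable (the script name below is a placeholder):

```bash
CUDA_VISIBLE_DEVICES=0,2 torchrun --nproc_per_node=2 trainer-program.py ...
```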
|
||||
@ -295,9 +149,6 @@ In this example we are working with just 2 GPUs, but of course the same would ap
|
||||
|
||||
Also, if you do set this environment variable, it's best to set it in your `~/.bashrc` file or some other startup config file and forget about it.
|
||||
|
||||
|
||||
|
||||
|
||||
## Trainer Integrations
|
||||
|
||||
The [`Trainer`] has been extended to support libraries that may dramatically improve your training
|
||||
@ -518,217 +369,6 @@ Pass `--fsdp "full shard"` along with following changes to be made in `--fsdp_co
|
||||
- For the size-based auto wrap policy, please add `min_num_params` to the config file.
It specifies FSDP's minimum number of parameters for auto wrapping.
|
||||
|
||||
|
||||
### Using Trainer for accelerated PyTorch Training on Mac
|
||||
|
||||
With the PyTorch v1.12 release, developers and researchers can take advantage of Apple silicon GPUs for significantly faster model training.
This unlocks the ability to perform machine learning workflows like prototyping and fine-tuning locally, right on a Mac.
Apple's Metal Performance Shaders (MPS) backend for PyTorch enables this and can be used via the new `"mps"` device.
This maps computational graphs and primitives onto the MPS Graph framework and the tuned kernels provided by MPS.
For more information, please refer to the official documentation: [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/)
and the [MPS backend](https://pytorch.org/docs/stable/notes/mps.html).
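A quick way to check that the MPS device is actually available in your environment (a minimal sketch):

```python
import torch

# True only on Apple silicon with a recent enough macOS and PyTorch build
print(torch.backends.mps.is_available())
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
```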
|
||||
|
||||
<Tip warning={false}>
|
||||
|
||||
We strongly recommend installing PyTorch >= 1.13 (the nightly version at the time of writing) on your macOS machine.
It has major fixes related to model correctness and performance improvements for transformer-based models.
Please refer to https://github.com/pytorch/pytorch/issues/82707 for more details.
|
||||
|
||||
</Tip>
|
||||
|
||||
**Benefits of Training and Inference using Apple Silicon Chips**
|
||||
|
||||
1. Enables users to train larger networks or batch sizes locally.
2. Reduces data retrieval latency and gives the GPU direct access to the full memory store thanks to the unified memory architecture, which improves end-to-end performance.
3. Reduces costs associated with cloud-based development or the need for additional local GPUs.
|
||||
|
||||
**Prerequisites**: To install torch with MPS support,
please follow this nice Medium article: [GPU-Acceleration Comes to PyTorch on M1 Macs](https://medium.com/towards-data-science/gpu-acceleration-comes-to-pytorch-on-m1-macs-195c399efcc1).
|
||||
|
||||
**Usage**:
|
||||
The `mps` device will be used by default if available, similar to the way the `cuda` device is used,
so no action from the user is required.
For example, you can run the official GLUE text classification task (from the root folder) on an Apple Silicon GPU with the command below:
|
||||
|
||||
```bash
|
||||
export TASK_NAME=mrpc
|
||||
|
||||
python examples/pytorch/text-classification/run_glue.py \
|
||||
--model_name_or_path bert-base-cased \
|
||||
--task_name $TASK_NAME \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--max_seq_length 128 \
|
||||
--per_device_train_batch_size 32 \
|
||||
--learning_rate 2e-5 \
|
||||
--num_train_epochs 3 \
|
||||
--output_dir /tmp/$TASK_NAME/ \
|
||||
--overwrite_output_dir
|
||||
```
|
||||
|
||||
**A few caveats to be aware of**
|
||||
|
||||
1. Some PyTorch operations have not been implemented in MPS and will throw an error.
One way to get around that is to set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1`,
which will fall back to the CPU for these operations (it still emits a `UserWarning`, however); see the short example after this list.
|
||||
2. Distributed setups `gloo` and `nccl` do not work with the `mps` device.
This means that currently only a single GPU of the `mps` device type can be used.
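For example, the CPU fallback from the first caveat can be enabled before launching your script:

```bash
export PYTORCH_ENABLE_MPS_FALLBACK=1
python examples/pytorch/text-classification/run_glue.py ...
```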
|
||||
|
||||
Finally, please remember that the 🤗 [`Trainer`] only integrates the MPS backend, so if you
have any problems or questions with regards to MPS backend usage, please
file an issue on [PyTorch GitHub](https://github.com/pytorch/pytorch/issues).
|
||||
|
||||
|
||||
## Using Accelerate Launcher with Trainer
|
||||
|
||||
Accelerate now powers Trainer. In terms of what users should expect:
- They can keep using the Trainer integrations such as FSDP and DeepSpeed via trainer arguments without any changes on their part.
- They can now use Accelerate Launcher with Trainer (recommended).
|
||||
|
||||
Steps to use Accelerate Launcher with Trainer:
1. Make sure 🤗 Accelerate is installed (you can't use the `Trainer` without it anyway). If not, run `pip install accelerate`. You may also need to update your version of Accelerate: `pip install accelerate --upgrade`.
2. Run `accelerate config` and fill out the questionnaire. Below are example accelerate configs:
|
||||
a. DDP Multi-node Multi-GPU config:
|
||||
```yaml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
distributed_type: MULTI_GPU
|
||||
downcast_bf16: 'no'
|
||||
gpu_ids: all
|
||||
machine_rank: 0 #change rank as per the node
|
||||
main_process_ip: 192.168.20.1
|
||||
main_process_port: 9898
|
||||
main_training_function: main
|
||||
mixed_precision: fp16
|
||||
num_machines: 2
|
||||
num_processes: 8
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
b. FSDP config:
|
||||
```yaml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
distributed_type: FSDP
|
||||
downcast_bf16: 'no'
|
||||
fsdp_config:
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_backward_prefetch_policy: BACKWARD_PRE
|
||||
fsdp_forward_prefetch: true
|
||||
fsdp_offload_params: false
|
||||
fsdp_sharding_strategy: 1
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_transformer_layer_cls_to_wrap: BertLayer
|
||||
fsdp_use_orig_params: true
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: bf16
|
||||
num_machines: 1
|
||||
num_processes: 2
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
c. DeepSpeed config pointing to a file:
|
||||
```yaml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
deepspeed_config:
|
||||
deepspeed_config_file: /home/user/configs/ds_zero3_config.json
|
||||
zero3_init_flag: true
|
||||
distributed_type: DEEPSPEED
|
||||
downcast_bf16: 'no'
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
num_machines: 1
|
||||
num_processes: 4
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
d. DeepSpeed config using accelerate plugin:
|
||||
```yaml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
deepspeed_config:
|
||||
gradient_accumulation_steps: 1
|
||||
gradient_clipping: 0.7
|
||||
offload_optimizer_device: cpu
|
||||
offload_param_device: cpu
|
||||
zero3_init_flag: true
|
||||
zero_stage: 2
|
||||
distributed_type: DEEPSPEED
|
||||
downcast_bf16: 'no'
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: bf16
|
||||
num_machines: 1
|
||||
num_processes: 4
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
3. Run the Trainer script with args other than the ones handled above by the accelerate config or launcher args.
Below is an example of running `run_glue.py` using `accelerate launch` with the FSDP config from above.
|
||||
|
||||
```bash
|
||||
cd transformers
|
||||
|
||||
accelerate launch \
|
||||
./examples/pytorch/text-classification/run_glue.py \
|
||||
--model_name_or_path bert-base-cased \
|
||||
--task_name $TASK_NAME \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--max_seq_length 128 \
|
||||
--per_device_train_batch_size 16 \
|
||||
--learning_rate 5e-5 \
|
||||
--num_train_epochs 3 \
|
||||
--output_dir /tmp/$TASK_NAME/ \
|
||||
--overwrite_output_dir
|
||||
```
|
||||
|
||||
4. You can also directly use the command-line args for `accelerate launch`. The above example would map to:
|
||||
|
||||
```bash
|
||||
cd transformers
|
||||
|
||||
accelerate launch --num_processes=2 \
|
||||
--use_fsdp \
|
||||
--mixed_precision=bf16 \
|
||||
--fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \
|
||||
--fsdp_transformer_layer_cls_to_wrap="BertLayer" \
|
||||
--fsdp_sharding_strategy=1 \
|
||||
--fsdp_state_dict_type=FULL_STATE_DICT \
|
||||
./examples/pytorch/text-classification/run_glue.py \
|
||||
--model_name_or_path bert-base-cased \
|
||||
--task_name $TASK_NAME \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--max_seq_length 128 \
|
||||
--per_device_train_batch_size 16 \
|
||||
--learning_rate 5e-5 \
|
||||
--num_train_epochs 3 \
|
||||
--output_dir /tmp/$TASK_NAME/ \
|
||||
--overwrite_output_dir
|
||||
```
|
||||
|
||||
For more information, please refer to the 🤗 Accelerate CLI guide: [Launching your 🤗 Accelerate scripts](https://huggingface.co/docs/accelerate/basic_tutorials/launch).
|
||||
|
||||
Sections that were moved:
|
||||
|
||||
[ <a href="./deepspeed#deepspeed-trainer-integration">DeepSpeed</a><a id="deepspeed"></a>
|
||||
@ -755,27 +395,3 @@ Sections that were moved:
|
||||
| <a href="./deepspeed#deepspeed-grad-clip">Gradient Clipping</a><a id="gradient-clipping"></a>
|
||||
| <a href="./deepspeed#deepspeed-weight-extraction">Getting The Model Weights Out</a><a id="getting-the-model-weights-out"></a>
|
||||
]
|
||||
|
||||
## Boost your fine-tuning performances using NEFTune
|
||||
|
||||
|
||||
NEFTune is a technique to boost the performance of chat models, introduced in the paper “NEFTune: Noisy Embeddings Improve Instruction Finetuning” by Jain et al. It consists of adding noise to the embedding vectors during training. According to the abstract of the paper:
|
||||
|
||||
> Standard finetuning of LLaMA-2-7B using Alpaca achieves 29.79% on AlpacaEval, which rises to 64.69% using noisy embeddings. NEFTune also improves over strong baselines on modern instruction datasets. Models trained with Evol-Instruct see a 10% improvement, with ShareGPT an 8% improvement, and with OpenPlatypus an 8% improvement. Even powerful models further refined with RLHF such as LLaMA-2-Chat benefit from additional training with NEFTune.
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/neft-screenshot.png">
|
||||
</div>
|
||||
|
||||
To use it in [`Trainer`], simply pass `neftune_noise_alpha` when creating your [`TrainingArguments`] instance. Note that to avoid any surprising behaviour, NEFTune is disabled after training to restore the original behaviour of the embedding layer.
|
||||
|
||||
```python
|
||||
from transformers import Trainer, TrainingArguments
|
||||
|
||||
args = TrainingArguments(..., neftune_noise_alpha=0.1)
|
||||
trainer = Trainer(..., args=args)
|
||||
|
||||
...
|
||||
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
@ -146,7 +146,7 @@ As a summary, consider the following table:
|
||||
| **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] |
|
||||
| **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic |
|
||||
| **Format of annotations to provide to** [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `List[Dict]`} each Dict being a COCO object annotation | {'image_id': `int`, 'annotations': `List[Dict]`} (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
|
||||
| **Postprocessing** (i.e. converting the output of the model to COCO API) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
|
||||
| **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
|
||||
| **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"`, `PanopticEvaluator` |
|
||||
|
||||
In short, one should prepare the data either in COCO detection or COCO panoptic format, then use
|
||||
|
||||
@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
Mixtral-8x7B is Mistral AI's second Large Language Model (LLM).
|
||||
|
||||
The Mixtral model was proposed in the by the [Mistral AI](https://mistral.ai/) team.
|
||||
The Mixtral model was proposed by the [Mistral AI](https://mistral.ai/) team.
|
||||
|
||||
It was introduced in the [Mixtral of Experts blogpost](https://mistral.ai/news/mixtral-of-experts/) with the following introduction:
|
||||
|
||||
@ -30,7 +30,7 @@ Tips:
|
||||
|
||||
|
||||
- The model needs to be converted using the [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py).
|
||||
- If the model is quantized to 4bits, a single A100 is enough to fit the entire 84B model.
|
||||
- If the model is quantized to 4bits, a single A100 is enough to fit the entire 45B model.
|
||||
|
||||
This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArthurZ) .
|
||||
The original code can be found [here](https://github.com/mistralai/mistral-src).
|
||||
@ -38,9 +38,9 @@ The original code can be found [here](https://github.com/mistralai/mistral-src).
|
||||
|
||||
### Model Details
|
||||
|
||||
Mixtral-84B is a decoder-based LM with the following architectural choices:
|
||||
Mixtral-45B is a decoder-based LM with the following architectural choices:
|
||||
|
||||
* Mixtral is a Mixture of Expert (MOE) model with 8 experts per MLP, with a total of 85B paramateres but the compute required is the same as a 14B model. This is because even though each experts have to be loaded in RAM (70B like ram requirement) each token from the hidden states are dipatched twice (top 2 routing) and thus the compute (the operation required at each foward computation) is just 2 X sequence_length.
|
||||
* Mixtral is a Mixture of Experts (MoE) model with 8 experts per MLP, with a total of 45B parameters, but the compute required is the same as for a 14B model. This is because even though each expert has to be loaded in RAM (a 70B-like RAM requirement), each token from the hidden states is dispatched twice (top-2 routing) and thus the compute (the operation required at each forward computation) is just 2 x sequence_length.
|
||||
|
||||
The following implementation details are shared with Mistral AI's first model [mistral](~models/doc/mistral):
|
||||
* Sliding Window Attention - Trained with 8k context length and fixed cache size, with a theoretical attention span of 128K tokens
|
||||
|
||||
@ -56,7 +56,7 @@ OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditio
|
||||
|
||||
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
|
||||
>>> target_sizes = torch.Tensor([image.size[::-1]])
|
||||
>>> # Convert outputs (bounding boxes and class logits) to COCO API
|
||||
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax)
|
||||
>>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
|
||||
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries
|
||||
>>> text = texts[i]
|
||||
|
||||
@ -55,7 +55,7 @@ OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CL
|
||||
|
||||
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
|
||||
>>> target_sizes = torch.Tensor([image.size[::-1]])
|
||||
>>> # Convert outputs (bounding boxes and class logits) to COCO API
|
||||
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
|
||||
>>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
|
||||
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries
|
||||
>>> text = texts[i]
|
||||
|
||||
@ -15,7 +15,8 @@ specific language governing permissions and limitations under the License.
|
||||
## Overview
|
||||
|
||||
The SeamlessM4T model was proposed in [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team from Meta AI.
|
||||
This is the version 1 release of the model. For the updated version 2 release, refer to the [Seamless M4T v2 docs](./seamless_m4t_v2.md).
|
||||
|
||||
This is the **version 1** release of the model. For the updated **version 2** release, refer to the [Seamless M4T v2 docs](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t_v2).
|
||||
|
||||
SeamlessM4T is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
|
||||
|
||||
|
||||
@ -16,7 +16,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
The SeamlessM4T-v2 model was proposed in [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team from Meta AI.
|
||||
|
||||
SeamlessM4T-v2 is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text. It is an improvement on the [previous version](./seamless_m4t.md). For more details on the differences between v1 and v2, refer to section [Difference with SeamlessM4T-v1](#difference-with-seamlessm4t-v1).
|
||||
SeamlessM4T-v2 is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text. It is an improvement on the [previous version](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t). For more details on the differences between v1 and v2, refer to section [Difference with SeamlessM4T-v1](#difference-with-seamlessm4t-v1).
|
||||
|
||||
SeamlessM4T-v2 enables multiple tasks without relying on separate models:
|
||||
|
||||
|
||||
61
docs/source/en/model_doc/vipllava.md
Normal file
@ -0,0 +1,61 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# VipLlava
|
||||
|
||||
## Overview
|
||||
|
||||
The VipLlava model was proposed in [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
|
||||
|
||||
VipLlava enhances the training protocol of Llava by marking images and interacting with the model using natural cues like a "red bounding box" or "pointed arrow" during training.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*While existing large vision-language multimodal models focus on whole image understanding, there is a prominent gap in achieving region-specific comprehension. Current approaches that use textual coordinates or spatial encodings often fail to provide a user-friendly interface for visual prompting. To address this challenge, we introduce a novel multimodal model capable of decoding arbitrary visual prompts. This allows users to intuitively mark images and interact with the model using natural cues like a "red bounding box" or "pointed arrow". Our simple design directly overlays visual markers onto the RGB image, eliminating the need for complex region encodings, yet achieves state-of-the-art performance on region-understanding tasks like Visual7W, PointQA, and Visual Commonsense Reasoning benchmark. Furthermore, we present ViP-Bench, a comprehensive benchmark to assess the capability of models in understanding visual prompts across multiple dimensions, enabling future research in this domain. Code, data, and model are publicly available.*
|
||||
|
||||
Tips:
|
||||
|
||||
- The architecture is similar to the Llava architecture, except that the multi-modal projector takes a set of concatenated vision hidden states and has an additional layernorm layer on that module.
|
||||
|
||||
- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating.
|
||||
|
||||
- Note that the model has not been explicitly trained to process multiple images in the same prompt; although this is technically possible, you may experience inaccurate results.
|
||||
|
||||
- For better results, we recommend prompting the model with the correct prompt format:
|
||||
|
||||
```bash
|
||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n<prompt>###Assistant:
|
||||
```
|
||||
|
||||
For multiple turns conversation:
|
||||
|
||||
```bash
|
||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n<prompt1>###Assistant: <answer1>###Human: <prompt2>###Assistant:
|
||||
```
|
||||
|
||||
The original code can be found [here](https://github.com/mu-cai/ViP-LLaVA).
|
||||
|
||||
This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada).
|
||||
|
||||
|
||||
## VipLlavaConfig
|
||||
|
||||
[[autodoc]] VipLlavaConfig
|
||||
|
||||
## VipLlavaForConditionalGeneration
|
||||
|
||||
[[autodoc]] VipLlavaForConditionalGeneration
|
||||
- forward
|
||||
@ -46,6 +46,7 @@ FlashAttention-2 is currently supported for the following architectures:
|
||||
* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
|
||||
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
|
||||
* [Llava](https://huggingface.co/docs/transformers/model_doc/llava)
|
||||
* [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)
|
||||
* [MBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel)
|
||||
* [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
|
||||
* [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
|
||||
@ -55,9 +56,24 @@ FlashAttention-2 is currently supported for the following architectures:
|
||||
|
||||
You can request to add FlashAttention-2 support for another model by opening a GitHub Issue or Pull Request.
|
||||
|
||||
Before you begin, make sure you have FlashAttention-2 installed. For NVIDIA GPUs, the library is installable through pip: `pip install flash-attn --no-build-isolation`. We strongly suggest to refer to the [detailed installation instructions](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features).
|
||||
Before you begin, make sure you have FlashAttention-2 installed.
|
||||
|
||||
FlashAttention-2 is also supported on AMD GPUs, with the current support limited to **Instinct MI210 and Instinct MI250**. We strongly suggest to use the following [Dockerfile](https://github.com/huggingface/optimum-amd/tree/main/docker/transformers-pytorch-amd-gpu-flash/Dockerfile) to use FlashAttention-2 on AMD GPUs.
|
||||
<hfoptions id="install">
|
||||
<hfoption id="NVIDIA">
|
||||
|
||||
```bash
|
||||
pip install flash-attn --no-build-isolation
|
||||
```
|
||||
|
||||
We strongly suggest referring to the detailed [installation instructions](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features) to learn more about supported hardware and data types!
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AMD">
|
||||
|
||||
FlashAttention-2 is also supported on AMD GPUs and current support is limited to **Instinct MI210** and **Instinct MI250**. We strongly suggest using this [Dockerfile](https://github.com/huggingface/optimum-amd/tree/main/docker/transformers-pytorch-amd-gpu-flash/Dockerfile) to use FlashAttention-2 on AMD GPUs.
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
To enable FlashAttention-2, pass the argument `attn_implementation="flash_attention_2"` to [`~AutoModelForCausalLM.from_pretrained`]:
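For instance, a minimal sketch (the checkpoint id is an arbitrary example of an architecture from the supported list above):

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b",                       # any FlashAttention-2 supported checkpoint
    torch_dtype=torch.bfloat16,               # FlashAttention-2 requires fp16 or bf16
    attn_implementation="flash_attention_2",
)
```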
|
||||
|
||||
@ -79,7 +95,9 @@ model = AutoModelForCausalLM.from_pretrained(
|
||||
|
||||
FlashAttention-2 can only be used when the model's dtype is `fp16` or `bf16`. Make sure to cast your model to the appropriate dtype and load them on a supported device before using FlashAttention-2.
|
||||
|
||||
Note that `use_flash_attention_2=True` can also be used to enable Flash Attention 2, but is deprecated in favor of `attn_implementation="flash_attention_2"`.
|
||||
<br>
|
||||
|
||||
You can also set `use_flash_attention_2=True` to enable FlashAttention-2 but it is deprecated in favor of `attn_implementation="flash_attention_2"`.
|
||||
|
||||
</Tip>
|
||||
|
||||
@ -143,11 +161,11 @@ FlashAttention is more memory efficient, meaning you can train on much larger se
|
||||
<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/llama-2-large-seqlen-padding.png">
|
||||
</div>
|
||||
|
||||
## FlashAttention and memory-efficient attention through PyTorch's scaled_dot_product_attention
|
||||
## PyTorch scaled dot product attention
|
||||
|
||||
PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA) can also call FlashAttention and memory-efficient attention kernels under the hood. SDPA support is currently being added natively in Transformers, and is used by default for `torch>=2.1.1` when an implementation is available.
|
||||
PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA) can also call FlashAttention and memory-efficient attention kernels under the hood. SDPA support is currently being added natively in Transformers and is used by default for `torch>=2.1.1` when an implementation is available.
|
||||
|
||||
For now, Transformers supports inference and training through SDPA for the following architectures:
|
||||
For now, Transformers supports SDPA inference and training for the following architectures:
|
||||
* [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel)
|
||||
* [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel)
|
||||
* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
|
||||
@ -155,9 +173,13 @@ For now, Transformers supports inference and training through SDPA for the follo
|
||||
* [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel)
|
||||
* [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel)
|
||||
|
||||
Note that FlashAttention can only be used for models with the `fp16` or `bf16` torch type, so make sure to cast your model to the appropriate type before using it.
|
||||
<Tip>
|
||||
|
||||
By default, `torch.nn.functional.scaled_dot_product_attention` selects the most performant kernel available, but to check whether a backend is available in a given setting (hardware, problem size), you can use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager:
|
||||
FlashAttention can only be used for models with the `fp16` or `bf16` torch type, so make sure to cast your model to the appropriate type first.
|
||||
|
||||
</Tip>
|
||||
|
||||
By default, SDPA selects the most performant kernel available but you can check whether a backend is available in a given setting (hardware, problem size) with [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager:
|
||||
|
||||
```diff
|
||||
import torch
|
||||
@ -177,7 +199,7 @@ inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
If you see a bug with the traceback below, try using nightly version of PyTorch which may have broader coverage for FlashAttention:
|
||||
If you see a bug with the traceback below, try using the nightly version of PyTorch which may have broader coverage for FlashAttention:
|
||||
|
||||
```bash
|
||||
RuntimeError: No available kernel. Aborting execution.
|
||||
@ -190,11 +212,10 @@ pip3 install -U --pre torch torchvision torchaudio --index-url https://download.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Part of BetterTransformer features are being upstreamed in Transformers, with native `torch.nn.scaled_dot_product_attention` default support. BetterTransformer still has a wider coverage than the Transformers SDPA integration, but you can expect more and more architectures to support natively SDPA in Transformers.
|
||||
Some BetterTransformer features are being upstreamed to Transformers with default support for native `torch.nn.scaled_dot_product_attention`. BetterTransformer still has a wider coverage than the Transformers SDPA integration, but you can expect more and more architectures to natively support SDPA in Transformers.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
<Tip>
|
||||
|
||||
Check out our benchmarks with BetterTransformer and scaled dot product attention in the [Out of the box acceleration and memory savings of 🤗 decoder models with PyTorch 2.0](https://pytorch.org/blog/out-of-the-box-acceleration/) and learn more about the fastpath execution in the [BetterTransformer](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2) blog post.
|
||||
@ -227,7 +248,7 @@ model.save_pretrained("saved_model")
|
||||
|
||||
bitsandbytes is a quantization library that includes support for 4-bit and 8-bit quantization. Quantization reduces your model size compared to its native full precision version, making it easier to fit large models onto GPUs with limited memory.
|
||||
|
||||
Make sure you have bitsnbytes and 🤗 Accelerate installed:
|
||||
Make sure you have bitsandbytes and 🤗 Accelerate installed:
|
||||
|
||||
```bash
|
||||
# these versions support 8-bit and 4-bit
|
||||
|
||||
@ -13,12 +13,51 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Training on Specialized Hardware
|
||||
# PyTorch training on Apple silicon
|
||||
|
||||
<Tip>
|
||||
Previously, training models on a Mac was limited to the CPU only. With the release of PyTorch v1.12, you can take advantage of training models with Apple's silicon GPUs for significantly faster performance and training. This is powered in PyTorch by integrating Apple's Metal Performance Shaders (MPS) as a backend. The [MPS backend](https://pytorch.org/docs/stable/notes/mps.html) implements PyTorch operations as custom Metal shaders and places these modules on a `mps` device.
|
||||
|
||||
Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at it before diving into this section.
|
||||
<Tip warning={true}>
|
||||
|
||||
Some PyTorch operations are not implemented in MPS yet and will throw an error. To avoid this, you should set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU kernels instead (you'll still see a `UserWarning`).
|
||||
|
||||
<br>
|
||||
|
||||
If you run into any other errors, please open an issue in the [PyTorch](https://github.com/pytorch/pytorch/issues) repository because the [`Trainer`] only integrates the MPS backend.
|
||||
|
||||
</Tip>
|
||||
|
||||
This document will be completed soon with information on how to train on specialized hardware.
|
||||
With the `mps` device set, you can:
|
||||
|
||||
* train larger networks or batch sizes locally
|
||||
* reduce data retrieval latency because the GPU's unified memory architecture allows direct access to the full memory store
|
||||
* reduce costs because you don't need to train on cloud-based GPUs or add additional local GPUs
|
||||
|
||||
Get started by making sure you have PyTorch installed. MPS acceleration is supported on macOS 12.3+.
|
||||
|
||||
```bash
|
||||
pip install torch torchvision torchaudio
|
||||
```
|
||||
|
||||
[`TrainingArguments`] uses the `mps` device by default if it's available which means you don't need to explicitly set the device. For example, you can run the [run_glue.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py) script with the MPS backend automatically enabled without making any changes.
|
||||
|
||||
```diff
|
||||
export TASK_NAME=mrpc
|
||||
|
||||
python examples/pytorch/text-classification/run_glue.py \
|
||||
--model_name_or_path bert-base-cased \
|
||||
--task_name $TASK_NAME \
|
||||
- --use_mps_device \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--max_seq_length 128 \
|
||||
--per_device_train_batch_size 32 \
|
||||
--learning_rate 2e-5 \
|
||||
--num_train_epochs 3 \
|
||||
--output_dir /tmp/$TASK_NAME/ \
|
||||
--overwrite_output_dir
|
||||
```
|
||||
|
||||
Backends for [distributed setups](https://pytorch.org/docs/stable/distributed.html#backends) like `gloo` and `nccl` are not supported by the `mps` device which means you can only train on a single GPU with the MPS backend.
|
||||
|
||||
You can learn more about the MPS backend in the [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/) blog post.
|
||||
|
||||
@ -1,24 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Training on TPUs
|
||||
|
||||
<Tip>
|
||||
|
||||
Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) and [multi-GPU section](perf_train_gpu_many) are generic and apply to training models in general so make sure to have a look at it before diving into this section.
|
||||
|
||||
</Tip>
|
||||
|
||||
This document will be completed soon with information on how to train on TPUs.
|
||||
@ -85,49 +85,22 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
model = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-alpha-AWQ", attn_implementation="flash_attention_2", device_map="cuda:0")
|
||||
```
|
||||
|
||||
### Fused modules
|
||||
|
||||
### Benchmarks
|
||||
Fused modules offer improved accuracy and performance, and they are supported out-of-the-box for AWQ modules for the [Llama](https://huggingface.co/meta-llama) and [Mistral](https://huggingface.co/mistralai/Mistral-7B-v0.1) architectures, but you can also fuse AWQ modules for unsupported architectures.
|
||||
|
||||
We performed some speed, throughput and latency benchmarks using [`optimum-benchmark`](https://github.com/huggingface/optimum-benchmark) library.
|
||||
<Tip warning={true}>
|
||||
|
||||
Note at that time of writing this documentation section, the available quantization methods were: `awq`, `gptq` and `bitsandbytes`.
|
||||
Fused modules cannot be combined with other optimization techniques such as FlashAttention-2.
|
||||
|
||||
The benchmark was run on a NVIDIA-A100 instance and the model used was [`TheBloke/Mistral-7B-v0.1-AWQ`](https://huggingface.co/TheBloke/Mistral-7B-v0.1-AWQ) for the AWQ model, [`TheBloke/Mistral-7B-v0.1-GPTQ`](https://huggingface.co/TheBloke/Mistral-7B-v0.1-GPTQ) for the GPTQ model. We also benchmarked it against `bitsandbytes` quantization methods and native `float16` model. Some results are shown below:
|
||||
</Tip>
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/forward_memory_plot.png">
|
||||
</div>
|
||||
<hfoptions id="fuse">
|
||||
<hfoption id="supported architectures">
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/generate_memory_plot.png">
|
||||
</div>
|
||||
To enable fused modules for supported architectures, create an [`AwqConfig`] and set the parameters `fuse_max_seq_len` and `do_fuse=True`. The `fuse_max_seq_len` parameter is the total sequence length and it should include the context length and the expected generation length. You can set it to a larger value to be safe.
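A minimal sketch of that configuration (the `fuse_max_seq_len` value is an arbitrary example; the checkpoint id is the one discussed in this section):

```python
from transformers import AutoModelForCausalLM, AwqConfig

quantization_config = AwqConfig(
    bits=4,
    fuse_max_seq_len=512,  # context length + expected generated tokens
    do_fuse=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-OpenOrca-AWQ",
    quantization_config=quantization_config,
).to(0)
```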
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/generate_throughput_plot.png">
|
||||
</div>
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/forward_latency_plot.png">
|
||||
</div>
|
||||
|
||||
You can find the full results together with packages versions in [this link](https://github.com/huggingface/optimum-benchmark/tree/main/examples/running-mistrals).
|
||||
|
||||
From the results it appears that AWQ quantization method is the fastest quantization method for inference, text generation and among the lowest peak memory for text generation. However, AWQ seems to have the largest forward latency per batch size.
|
||||
|
||||
|
||||
### Make use of fused modules
|
||||
|
||||
You can benefit from fused modules by passing an `AwqConfig` with `fuse_modules=True` and your expected maximum sequence length for generation to `fuse_max_seq_len`. For architectures that do not support `do_fuse=True`, you can still fuse the modules, however you need to pass a custom `fusing_mapping` to `AwqConfig()`. Let's dive into these specific usecases.
|
||||
|
||||
Note that you cannot combine fusing modules and other optimization techniques such as Flash Attention 2.
|
||||
|
||||
#### Fusing modules for supported architectures
|
||||
|
||||
Currently we support out of the box AWQ module fusing for `llama` and `mistral`.
|
||||
|
||||
To enable this feature for supported architectures simply create an `AwqConfig` and pass the arguments `fuse_max_seq_len` and `do_fuse=True`.
|
||||
|
||||
For example to enable module fusing for the model `TheBloke/Mistral-7B-OpenOrca-AWQ`, run:
|
||||
For example, to fuse the AWQ modules of the [TheBloke/Mistral-7B-OpenOrca-AWQ](https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-AWQ) model.
|
||||
|
||||
```python
|
||||
import torch
|
||||
@ -144,14 +117,10 @@ quantization_config = AwqConfig(
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config).to(0)
|
||||
```
|
||||
|
||||
Note that you need to define `fuse_max_seq_len` to `AwqConfig`. That total sequence length should include the context length and the expected generation length. You can set it to a large value to be on the safe zone.
|
||||
|
||||
You can also apply module fusing for other architectures that are not supported.
|
||||
|
||||
#### Fusing modules for unsupported architectures
|
||||
|
||||
For architectures that do not support out of the box module fusing, you can pass a custom fusing mapping; simply pass a dictionnary `modules_to_fuse` to `AwqConfig`, let's take an example with the Yi model:
|
||||
</hfoption>
|
||||
<hfoption id="unsupported architectures">
|
||||
|
||||
For architectures that don't support fused modules yet, you need to create a custom fusing mapping to define which modules need to be fused with the `modules_to_fuse` parameter. For example, to fuse the AWQ modules of the [TheBloke/Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ) model.
|
||||
|
||||
```python
|
||||
import torch
|
||||
@ -176,55 +145,18 @@ quantization_config = AwqConfig(
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config).to(0)
|
||||
```
|
||||
|
||||
The parameter `modules_to_fuse` needs to have the following respective fields:
|
||||
The parameter `modules_to_fuse` should include:
|
||||
|
||||
- `"attention"`: The names of the attention layers to fuse - in the order: query, key, value and output projection layer. In case you don't want to fuse the attention layers you can pass an empty list.
|
||||
- `"layernorm"`: The names of all the layernorm layers you want to replace with a custom fused layer norm. In case you don't want to fuse these layers you can also pass an empty list.
|
||||
- `"mlp"`: The names of the MLP layers you want to fuse into a single MLP layer in the order: (gate (dense layer post-attention) / up / down layers).
|
||||
- `"use_alibi"`: If you model uses alibi positional embedding
|
||||
- `"num_attention_heads"`: The number of attention heads
|
||||
- `"num_key_value_heads"`: This is the number of key value heads that should be used to implement Grouped Query Attention. If num_key_value_heads=num_attention_heads, the model will use Multi Head Attention (MHA), if num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used.
|
||||
- `"hidden_size"`: Dimension of the hidden representations.
|
||||
|
||||
|
||||
#### Benchmarks
|
||||
|
||||
We benchmarked the model with and without fused modules first using only `batch_size=1` on the `TheBloke/Mistral-7B-OpenOrca-AWQ` model and below are the results:
|
||||
|
||||
*unfused case*
|
||||
|
||||
| Batch Size | Prefill Length | Decode Length | Prefill tokens/s | Decode tokens/s | Memory (VRAM) |
|
||||
|-------------:|-----------------:|----------------:|-------------------:|------------------:|:----------------|
|
||||
| 1 | 32 | 32 | 60.0984 | 38.4537 | 4.50 GB (5.68%) |
|
||||
| 1 | 64 | 64 | 1333.67 | 31.6604 | 4.50 GB (5.68%) |
|
||||
| 1 | 128 | 128 | 2434.06 | 31.6272 | 4.50 GB (5.68%) |
|
||||
| 1 | 256 | 256 | 3072.26 | 38.1731 | 4.50 GB (5.68%) |
|
||||
| 1 | 512 | 512 | 3184.74 | 31.6819 | 4.59 GB (5.80%) |
|
||||
| 1 | 1024 | 1024 | 3148.18 | 36.8031 | 4.81 GB (6.07%) |
|
||||
| 1 | 2048 | 2048 | 2927.33 | 35.2676 | 5.73 GB (7.23%) |
|
||||
|
||||
*fused case*
|
||||
|
||||
| Batch Size | Prefill Length | Decode Length | Prefill tokens/s | Decode tokens/s | Memory (VRAM) |
|
||||
|-------------:|-----------------:|----------------:|-------------------:|------------------:|:----------------|
|
||||
| 1 | 32 | 32 | 81.4899 | 80.2569 | 4.00 GB (5.05%) |
|
||||
| 1 | 64 | 64 | 1756.1 | 106.26 | 4.00 GB (5.05%) |
|
||||
| 1 | 128 | 128 | 2479.32 | 105.631 | 4.00 GB (5.06%) |
|
||||
| 1 | 256 | 256 | 1813.6 | 85.7485 | 4.01 GB (5.06%) |
|
||||
| 1 | 512 | 512 | 2848.9 | 97.701 | 4.11 GB (5.19%) |
|
||||
| 1 | 1024 | 1024 | 3044.35 | 87.7323 | 4.41 GB (5.57%) |
|
||||
| 1 | 2048 | 2048 | 2715.11 | 89.4709 | 5.57 GB (7.04%) |
|
||||
|
||||
We also performed benchmarks with [`optimum-benchmark`](https://github.com/huggingface/optimum-benchmark) library. And below are the results:
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/fused_forward_memory_plot.png">
|
||||
</div>
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/fused_generate_throughput_plot.png">
|
||||
</div>
|
||||
- `"attention"`: The names of the attention layers to fuse in the following order: query, key, value and output projection layer. If you don't want to fuse these layers, pass an empty list.
|
||||
- `"layernorm"`: The names of all the LayerNorm layers you want to replace with a custom fused LayerNorm. If you don't want to fuse these layers, pass an empty list.
|
||||
- `"mlp"`: The names of the MLP layers you want to fuse into a single MLP layer in the order: (gate (dense layer post-attention) / up / down layers).
|
||||
- `"use_alibi"`: If your model uses ALiBi positional embedding.
|
||||
- `"num_attention_heads"`: The number of attention heads.
|
||||
- `"num_key_value_heads"`: The number of key value heads that should be used to implement Grouped Query Attention (GQA). If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used.
|
||||
- `"hidden_size"`: The dimension of the hidden representations.
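As an illustration of the expected shape of that mapping (all module names and sizes below are hypothetical and must be replaced with the target architecture's actual values):

```python
from transformers import AwqConfig

modules_to_fuse = {
    "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],  # query/key/value/output projections
    "layernorm": ["ln1", "ln2"],
    "mlp": ["gate_proj", "up_proj", "down_proj"],
    "use_alibi": False,
    "num_attention_heads": 32,
    "num_key_value_heads": 8,
    "hidden_size": 4096,
}
quantization_config = AwqConfig(bits=4, fuse_max_seq_len=512, modules_to_fuse=modules_to_fuse)
```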
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## AutoGPTQ
|
||||
|
||||
@ -610,3 +542,44 @@ To compare the speed, throughput, and latency of each quantization scheme, check
|
||||
</div>
|
||||
|
||||
The benchmarks indicate AWQ quantization is the fastest for inference, text generation, and has the lowest peak memory for text generation. However, AWQ has the largest forward latency per batch size. For a more detailed discussion about the pros and cons of each quantization method, read the [Overview of natively supported quantization schemes in 🤗 Transformers](https://huggingface.co/blog/overview-quantization-transformers) blog post.
|
||||
|
||||
### Fused AWQ modules
|
||||
|
||||
The [TheBloke/Mistral-7B-OpenOrca-AWQ](https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-AWQ) model was benchmarked with `batch_size=1` with and without fused modules.
|
||||
|
||||
<figcaption class="text-center text-gray-500 text-lg">Unfused module</figcaption>
|
||||
|
||||
| Batch Size | Prefill Length | Decode Length | Prefill tokens/s | Decode tokens/s | Memory (VRAM) |
|
||||
|-------------:|-----------------:|----------------:|-------------------:|------------------:|:----------------|
|
||||
| 1 | 32 | 32 | 60.0984 | 38.4537 | 4.50 GB (5.68%) |
|
||||
| 1 | 64 | 64 | 1333.67 | 31.6604 | 4.50 GB (5.68%) |
|
||||
| 1 | 128 | 128 | 2434.06 | 31.6272 | 4.50 GB (5.68%) |
|
||||
| 1 | 256 | 256 | 3072.26 | 38.1731 | 4.50 GB (5.68%) |
|
||||
| 1 | 512 | 512 | 3184.74 | 31.6819 | 4.59 GB (5.80%) |
|
||||
| 1 | 1024 | 1024 | 3148.18 | 36.8031 | 4.81 GB (6.07%) |
|
||||
| 1 | 2048 | 2048 | 2927.33 | 35.2676 | 5.73 GB (7.23%) |
|
||||
|
||||
<figcaption class="text-center text-gray-500 text-lg">Fused module</figcaption>
|
||||
|
||||
| Batch Size | Prefill Length | Decode Length | Prefill tokens/s | Decode tokens/s | Memory (VRAM) |
|
||||
|-------------:|-----------------:|----------------:|-------------------:|------------------:|:----------------|
|
||||
| 1 | 32 | 32 | 81.4899 | 80.2569 | 4.00 GB (5.05%) |
|
||||
| 1 | 64 | 64 | 1756.1 | 106.26 | 4.00 GB (5.05%) |
|
||||
| 1 | 128 | 128 | 2479.32 | 105.631 | 4.00 GB (5.06%) |
|
||||
| 1 | 256 | 256 | 1813.6 | 85.7485 | 4.01 GB (5.06%) |
|
||||
| 1 | 512 | 512 | 2848.9 | 97.701 | 4.11 GB (5.19%) |
|
||||
| 1 | 1024 | 1024 | 3044.35 | 87.7323 | 4.41 GB (5.57%) |
|
||||
| 1 | 2048 | 2048 | 2715.11 | 89.4709 | 5.57 GB (7.04%) |
|
||||
|
||||
The speed and throughput of fused and unfused modules were also tested with the [optimum-benchmark](https://github.com/huggingface/optimum-benchmark) library.
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/fused_forward_memory_plot.png" alt="forward peak memory per batch size" />
<figcaption class="mt-2 text-center text-sm text-gray-500">forward peak memory/batch size</figcaption>
</div>
<div>
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/fused_generate_throughput_plot.png" alt="generate throughput per batch size" />
<figcaption class="mt-2 text-center text-sm text-gray-500">generate throughput/batch size</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@ -182,7 +182,7 @@ Like classification tasks in any modality, text classification labels a sequence
|
||||
|
||||
### Token classification
|
||||
|
||||
In any NLP task, text is preprocessed by separating the sequence of text into individual words or subwords. These are known as [tokens](/glossary#token). Token classification assigns each token a label from a predefined set of classes.
|
||||
In any NLP task, text is preprocessed by separating the sequence of text into individual words or subwords. These are known as [tokens](glossary#token). Token classification assigns each token a label from a predefined set of classes.
|
||||
|
||||
Two common types of token classification are:
|
||||
|
||||
|
||||
@ -61,8 +61,8 @@ import torch.nn.functional as F
|
||||
|
||||
|
||||
class ImageDistilTrainer(Trainer):
|
||||
def __init__(self, *args, teacher_model=None, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
def __init__(self, teacher_model=None, student_model=None, temperature=None, lambda_param=None, *args, **kwargs):
|
||||
super().__init__(model=student_model, *args, **kwargs)
|
||||
self.teacher = teacher_model
|
||||
self.student = student_model
|
||||
self.loss_function = nn.KLDivLoss(reduction="batchmean")
|
||||
@ -164,7 +164,7 @@ trainer = ImageDistilTrainer(
|
||||
train_dataset=processed_datasets["train"],
|
||||
eval_dataset=processed_datasets["validation"],
|
||||
data_collator=data_collator,
|
||||
tokenizer=teacher_extractor,
|
||||
tokenizer=teacher_processor,
|
||||
compute_metrics=compute_metrics,
|
||||
temperature=5,
|
||||
lambda_param=0.5
|
||||
|
||||
@@ -512,7 +512,7 @@ Finally, load the metrics and run the evaluation.
...         outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)

...         orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
...         results = im_processor.post_process(outputs, orig_target_sizes)  # convert outputs of model to COCO api
...         results = im_processor.post_process(outputs, orig_target_sizes)  # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax)

...         module.add(prediction=results, reference=labels)
...         del batch

@@ -276,8 +276,7 @@ You could also create and use your own dataset if you prefer to train with the [
                             "label": sorted(label_paths)})
    dataset = dataset.cast_column("image", Image())
    dataset = dataset.cast_column("label", Image())

        return dataset
    return dataset

# step 1: create Dataset objects
train_dataset = create_dataset(image_paths_train, label_paths_train)

docs/source/en/trainer.md (new file, 408 lines)
@@ -0,0 +1,408 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Trainer
|
||||
|
||||
The [`Trainer`] is a complete training and evaluation loop for PyTorch models implemented in the Transformers library. You only need to pass it the necessary pieces for training (model, tokenizer, dataset, evaluation function, training hyperparameters, etc.), and the [`Trainer`] class takes care of the rest. This makes it easier to start training without manually writing your own training loop. At the same time, [`Trainer`] is very customizable and offers a ton of training options so you can tailor it to your exact training needs.
|
||||
|
||||
<Tip>
|
||||
|
||||
In addition to the [`Trainer`] class, Transformers also provides a [`Seq2SeqTrainer`] class for sequence-to-sequence tasks like translation or summarization. There is also the [`~trl.SFTTrainer`] class from the [TRL](https://hf.co/docs/trl) library which wraps the [`Trainer`] class and is optimized for training language models like Llama-2 and Mistral with autoregressive techniques. [`~trl.SFTTrainer`] also supports features like sequence packing, LoRA, quantization, and DeepSpeed for efficiently scaling to any model size.
|
||||
|
||||
<br>
|
||||
|
||||
Feel free to check out the [API reference](./main_classes/trainer) for these other [`Trainer`]-type classes to learn more about when to use which one. In general, [`Trainer`] is the most versatile option and is appropriate for a broad spectrum of tasks. [`Seq2SeqTrainer`] is designed for sequence-to-sequence tasks and [`~trl.SFTTrainer`] is designed for training language models.
|
||||
|
||||
</Tip>
|
||||
|
||||
Before you start, make sure [Accelerate](https://hf.co/docs/accelerate) - a library for enabling and running PyTorch training across distributed environments - is installed.
|
||||
|
||||
```bash
|
||||
pip install accelerate
|
||||
|
||||
# upgrade
|
||||
pip install accelerate --upgrade
|
||||
```
|
||||
|
||||
This guide provides an overview of the [`Trainer`] class.
|
||||
|
||||
## Basic usage
|
||||
|
||||
[`Trainer`] includes all the code you'll find in a basic training loop:
|
||||
|
||||
1. perform a training step to calculate the loss
|
||||
2. calculate the gradients with the [`~accelerate.Accelerator.backward`] method
|
||||
3. update the weights based on the gradients
|
||||
4. repeat this process until you've reached a predetermined number of epochs
|
||||
|
||||
The [`Trainer`] class abstracts all of this code away so you don't have to worry about manually writing a training loop every time or if you're just getting started with PyTorch and training. You only need to provide the essential components required for training, such as a model and a dataset, and the [`Trainer`] class handles everything else.
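For reference, here is a minimal sketch of the kind of loop that [`Trainer`] runs for you (assuming you've already created a `model`, an `optimizer`, and a `train_dataloader` whose batches include `labels`):

```py
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

for epoch in range(2):  # 4. repeat for a predetermined number of epochs
    for batch in train_dataloader:
        optimizer.zero_grad()
        outputs = model(**batch)            # 1. training step that computes the loss
        accelerator.backward(outputs.loss)  # 2. calculate gradients with the backward method
        optimizer.step()                    # 3. update the weights based on the gradients
```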
|
||||
|
||||
If you want to specify any training options or hyperparameters, you can find them in the [`TrainingArguments`] class. For example, let's define where to save the model in `output_dir` and push the model to the Hub after training with `push_to_hub=True`.
|
||||
|
||||
```py
|
||||
from transformers import TrainingArguments
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir="your-model",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=16,
|
||||
per_device_eval_batch_size=16,
|
||||
num_train_epochs=2,
|
||||
weight_decay=0.01,
|
||||
evaluation_strategy="epoch",
|
||||
save_strategy="epoch",
|
||||
load_best_model_at_end=True,
|
||||
push_to_hub=True,
|
||||
)
|
||||
```
|
||||
|
||||
Pass `training_args` to the [`Trainer`] along with a model, dataset, something to preprocess the dataset with (depending on your data type it could be a tokenizer, feature extractor or image processor), a data collator, and a function to compute the metrics you want to track during training.
|
||||
|
||||
Finally, call [`~Trainer.train`] to start training!
|
||||
|
||||
```py
|
||||
from transformers import Trainer
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=dataset["train"],
|
||||
eval_dataset=dataset["test"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
### Checkpoints
|
||||
|
||||
The [`Trainer`] class saves your model checkpoints to the directory specified in the `output_dir` parameter of [`TrainingArguments`]. You'll find the checkpoints saved in a `checkpoint-000` subfolder where the numbers at the end correspond to the training step. Saving checkpoints is useful for resuming training later.
|
||||
|
||||
```py
|
||||
# resume from latest checkpoint
|
||||
trainer.train(resume_from_checkpoint=True)
|
||||
|
||||
# resume from specific checkpoint saved in output directory
|
||||
trainer.train(resume_from_checkpoint="your-model/checkpoint-1000")
|
||||
```
|
||||
|
||||
You can save your checkpoints (the optimizer state is not saved by default) to the Hub by setting `push_to_hub=True` in [`TrainingArguments`] to commit and push them. Other options for deciding how your checkpoints are saved are set up in the [`hub_strategy`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.hub_strategy) parameter:
|
||||
|
||||
* `hub_strategy="checkpoint"` pushes the latest checkpoint to a subfolder named "last-checkpoint" from which you can resume training
|
||||
* `hub_strategy="all_checkpoints"` pushes all checkpoints to the directory defined in `output_dir` (you'll see one checkpoint per folder in your model repository), as shown in the sketch below
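As a minimal sketch (reusing the `output_dir` from above), the strategy is just another [`TrainingArguments`] field:

```py
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="your-model",
    push_to_hub=True,
    hub_strategy="all_checkpoints",  # or "checkpoint" to push only the latest checkpoint
)
```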
|
||||
|
||||
When you resume training from a checkpoint, the [`Trainer`] tries to keep the Python, NumPy, and PyTorch RNG states the same as they were when the checkpoint was saved. But because PyTorch has various non-deterministic default settings, the RNG states aren't guaranteed to be the same. If you want to enable full determinism, take a look at the [Controlling sources of randomness](https://pytorch.org/docs/stable/notes/randomness#controlling-sources-of-randomness) guide to learn what you can enable to make your training fully deterministic. Keep in mind though that by making certain settings deterministic, training may be slower.
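For example, a minimal sketch of opting into full determinism through the `full_determinism` flag in [`TrainingArguments`], which enforces deterministic algorithms for the given `seed`:

```py
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="your-model",
    seed=42,
    full_determinism=True,  # fully deterministic training; expect it to be slower
)
```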
|
||||
|
||||
## Customize the Trainer
|
||||
|
||||
While the [`Trainer`] class is designed to be accessible and easy-to-use, it also offers a lot of customizability for more adventurous users. Many of the [`Trainer`]'s methods can be subclassed and overridden to support the functionality you want, without having to rewrite the entire training loop from scratch to accommodate it. These methods include:
|
||||
|
||||
* [`~Trainer.get_train_dataloader`] creates a training DataLoader
|
||||
* [`~Trainer.get_eval_dataloader`] creates an evaluation DataLoader
|
||||
* [`~Trainer.get_test_dataloader`] creates a test DataLoader
|
||||
* [`~Trainer.log`] logs information on the various objects that watch training
|
||||
* [`~Trainer.create_optimizer_and_scheduler`] creates an optimizer and learning rate scheduler if they weren't passed in the `__init__`; these can also be separately customized with [`~Trainer.create_optimizer`] and [`~Trainer.create_scheduler`] respectively
|
||||
* [`~Trainer.compute_loss`] computes the loss on a batch of training inputs
|
||||
* [`~Trainer.training_step`] performs the training step
|
||||
* [`~Trainer.prediction_step`] performs the prediction and test step
|
||||
* [`~Trainer.evaluate`] evaluates the model and returns the evaluation metrics
|
||||
* [`~Trainer.predict`] makes predictions (with metrics if labels are available) on the test set
|
||||
|
||||
For example, here is how to customize the [`~Trainer.compute_loss`] method to use a weighted loss instead:
|
||||
|
||||
```py
|
||||
import torch
from torch import nn
|
||||
from transformers import Trainer
|
||||
|
||||
class CustomTrainer(Trainer):
|
||||
def compute_loss(self, model, inputs, return_outputs=False):
|
||||
labels = inputs.pop("labels")
|
||||
# forward pass
|
||||
outputs = model(**inputs)
|
||||
logits = outputs.get("logits")
|
||||
# compute custom loss for 3 labels with different weights
|
||||
loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device))
|
||||
loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
|
||||
return (loss, outputs) if return_outputs else loss
|
||||
```
|
||||
|
||||
### Callbacks
|
||||
|
||||
Another option for customizing the [`Trainer`] is to use [callbacks](callbacks). Callbacks *don't change* anything in the training loop. They inspect the training loop state and then execute some action (early stopping, logging results, etc.) depending on the state. In other words, a callback can't be used to implement something like a custom loss function and you'll need to subclass and override the [`~Trainer.compute_loss`] method for that.
|
||||
|
||||
For example, here is how to add an early stopping callback that stops training after 10 steps:
|
||||
|
||||
```py
|
||||
from transformers import TrainerCallback
|
||||
|
||||
class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, num_steps=10):
        self.num_steps = num_steps

    def on_step_end(self, args, state, control, **kwargs):
        # signal the Trainer to stop once the configured number of steps is reached
        if state.global_step >= self.num_steps:
            control.should_training_stop = True
        return control
|
||||
```
|
||||
|
||||
Then pass it to the [`Trainer`]'s `callbacks` parameter.
|
||||
|
||||
```py
|
||||
from transformers import Trainer
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=dataset["train"],
|
||||
eval_dataset=dataset["test"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
callbacks=[EarlyStoppingCallback()],
|
||||
)
|
||||
```
|
||||
|
||||
## Logging
|
||||
|
||||
<Tip>
|
||||
|
||||
Check out the [logging](./main_classes/logging) API reference for more information about the different logging levels.
|
||||
|
||||
</Tip>
|
||||
|
||||
The [`Trainer`] is set to `logging.INFO` by default which reports errors, warnings, and other basic information. A [`Trainer`] replica - in distributed environments - is set to `logging.WARNING` which only reports errors and warnings. You can change the logging level with the [`log_level`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.log_level) and [`log_level_replica`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.log_level_replica) parameters in [`TrainingArguments`].
|
||||
|
||||
To configure the log level setting for each node, use the [`log_on_each_node`](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.log_on_each_node) parameter to determine whether to use the log level on each node or only on the main node.
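Both log levels (and `log_on_each_node`) can also be set directly in [`TrainingArguments`]; a minimal sketch:

```py
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="your-model",
    log_level="warning",        # main process
    log_level_replica="error",  # replicas in distributed training
    log_on_each_node=False,     # only apply the log level on the main node
)
```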
|
||||
|
||||
<Tip>
|
||||
|
||||
[`Trainer`] sets the log level separately for each node in the [`Trainer.__init__`] method, so you may want to consider setting this sooner if you're using other Transformers functionalities before creating the [`Trainer`] object.
|
||||
|
||||
</Tip>
|
||||
|
||||
For example, to set your main code and modules to use the same log level according to each node:
|
||||
|
||||
```py
|
||||
import logging
import sys

import datasets
import transformers

logger = logging.getLogger(__name__)
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
handlers=[logging.StreamHandler(sys.stdout)],
|
||||
)
|
||||
|
||||
log_level = training_args.get_process_log_level()
|
||||
logger.setLevel(log_level)
|
||||
datasets.utils.logging.set_verbosity(log_level)
|
||||
transformers.utils.logging.set_verbosity(log_level)
|
||||
|
||||
trainer = Trainer(...)
|
||||
```
|
||||
|
||||
Use different combinations of `log_level` and `log_level_replica` to configure what gets logged on each of the nodes.
|
||||
|
||||
<hfoptions id="logging">
|
||||
<hfoption id="single node">
|
||||
|
||||
```bash
|
||||
my_app.py ... --log_level warning --log_level_replica error
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="multi-node">
|
||||
|
||||
Add the `--log_on_each_node 0` flag for multi-node environments.
|
||||
|
||||
```bash
|
||||
my_app.py ... --log_level warning --log_level_replica error --log_on_each_node 0
|
||||
|
||||
# set to only report errors
|
||||
my_app.py ... --log_level error --log_level_replica error --log_on_each_node 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## NEFTune
|
||||
|
||||
[NEFTune](https://hf.co/papers/2310.05914) is a technique that can improve performance by adding noise to the embedding vectors during training. To enable it in [`Trainer`], set the `neftune_noise_alpha` parameter in [`TrainingArguments`] to control how much noise is added.
|
||||
|
||||
```py
|
||||
from transformers import TrainingArguments, Trainer
|
||||
|
||||
training_args = TrainingArguments(..., neftune_noise_alpha=0.1)
|
||||
trainer = Trainer(..., args=training_args)
|
||||
```
|
||||
|
||||
NEFTune is disabled after training to restore the original embedding layer to avoid any unexpected behavior.
|
||||
|
||||
## Accelerate and Trainer
|
||||
|
||||
The [`Trainer`] class is powered by [Accelerate](https://hf.co/docs/accelerate), a library for easily training PyTorch models in distributed environments with support for integrations such as [FullyShardedDataParallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) and [DeepSpeed](https://www.deepspeed.ai/).
|
||||
|
||||
To use Accelerate with [`Trainer`], run the [`accelerate config`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-config) command to set up training for your training environment. This command creates a `config_file.yaml` that'll be used when you launch your training script. For example, some of the configurations you can set up are:
|
||||
|
||||
<hfoptions id="config">
|
||||
<hfoption id="DistributedDataParallel">
|
||||
|
||||
```yml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
distributed_type: MULTI_GPU
|
||||
downcast_bf16: 'no'
|
||||
gpu_ids: all
|
||||
machine_rank: 0 #change rank as per the node
|
||||
main_process_ip: 192.168.20.1
|
||||
main_process_port: 9898
|
||||
main_training_function: main
|
||||
mixed_precision: fp16
|
||||
num_machines: 2
|
||||
num_processes: 8
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="FSDP">
|
||||
|
||||
```yml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
distributed_type: FSDP
|
||||
downcast_bf16: 'no'
|
||||
fsdp_config:
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_backward_prefetch_policy: BACKWARD_PRE
|
||||
fsdp_forward_prefetch: true
|
||||
fsdp_offload_params: false
|
||||
fsdp_sharding_strategy: 1
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_transformer_layer_cls_to_wrap: BertLayer
|
||||
fsdp_use_orig_params: true
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: bf16
|
||||
num_machines: 1
|
||||
num_processes: 2
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="DeepSpeed">
|
||||
|
||||
```yml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
deepspeed_config:
|
||||
deepspeed_config_file: /home/user/configs/ds_zero3_config.json
|
||||
zero3_init_flag: true
|
||||
distributed_type: DEEPSPEED
|
||||
downcast_bf16: 'no'
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
num_machines: 1
|
||||
num_processes: 4
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="DeepSpeed with Accelerate plugin">
|
||||
|
||||
```yml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
deepspeed_config:
|
||||
gradient_accumulation_steps: 1
|
||||
gradient_clipping: 0.7
|
||||
offload_optimizer_device: cpu
|
||||
offload_param_device: cpu
|
||||
zero3_init_flag: true
|
||||
zero_stage: 2
|
||||
distributed_type: DEEPSPEED
|
||||
downcast_bf16: 'no'
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: bf16
|
||||
num_machines: 1
|
||||
num_processes: 4
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
The [`accelerate launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) command is the recommended way to launch your training script on a distributed system with Accelerate and [`Trainer`] with the parameters specified in `config_file.yaml`. This file is saved to the Accelerate cache folder and automatically loaded when you run `accelerate launch`.
|
||||
|
||||
For example, to run the [run_glue.py](https://github.com/huggingface/transformers/blob/f4db565b695582891e43a5e042e5d318e28f20b8/examples/pytorch/text-classification/run_glue.py#L4) training script with the FSDP configuration:
|
||||
|
||||
```bash
|
||||
accelerate launch \
|
||||
./examples/pytorch/text-classification/run_glue.py \
|
||||
--model_name_or_path bert-base-cased \
|
||||
--task_name $TASK_NAME \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--max_seq_length 128 \
|
||||
--per_device_train_batch_size 16 \
|
||||
--learning_rate 5e-5 \
|
||||
--num_train_epochs 3 \
|
||||
--output_dir /tmp/$TASK_NAME/ \
|
||||
--overwrite_output_dir
|
||||
```
|
||||
|
||||
You could also specify the parameters from the `config_file.yaml` file directly in the command line:
|
||||
|
||||
```bash
|
||||
accelerate launch --num_processes=2 \
|
||||
--use_fsdp \
|
||||
--mixed_precision=bf16 \
|
||||
--fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \
|
||||
--fsdp_transformer_layer_cls_to_wrap="BertLayer" \
|
||||
--fsdp_sharding_strategy=1 \
|
||||
--fsdp_state_dict_type=FULL_STATE_DICT \
|
||||
./examples/pytorch/text-classification/run_glue.py \
|
||||
--model_name_or_path bert-base-cased \
|
||||
--task_name $TASK_NAME \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--max_seq_length 128 \
|
||||
--per_device_train_batch_size 16 \
|
||||
--learning_rate 5e-5 \
|
||||
--num_train_epochs 3 \
|
||||
--output_dir /tmp/$TASK_NAME/ \
|
||||
--overwrite_output_dir
|
||||
```
|
||||
|
||||
Check out the [Launching your Accelerate scripts](https://huggingface.co/docs/accelerate/basic_tutorials/launch) tutorial to learn more about `accelerate launch` and custom configurations.
|
||||
@@ -75,6 +75,8 @@
|
||||
- sections:
|
||||
- local: philosophy
|
||||
title: Filosofía
|
||||
- local: glossary
|
||||
title: Glosario
|
||||
- local: pad_truncation
|
||||
title: Relleno y truncamiento
|
||||
- local: bertology
|
||||
|
||||
docs/source/es/glossary.md (new file, 464 lines)
@@ -0,0 +1,464 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Glosario
|
||||
|
||||
Este glosario define términos generales de aprendizaje automático y términos relacionados con 🤗 Transformers para ayudarte a comprender mejor la documentación.
|
||||
|
||||
## A
|
||||
|
||||
### attention mask
|
||||
|
||||
La máscara de atención es un argumento opcional utilizado al agrupar secuencias.
|
||||
|
||||
<Youtube id="M6adb1j2jPI"/>
|
||||
|
||||
Este argumento indica al modelo qué tokens deben recibir atención y cuáles no.
|
||||
|
||||
Por ejemplo, considera estas dos secuencias:
|
||||
|
||||
```python
|
||||
>>> from transformers import BertTokenizer
|
||||
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
|
||||
|
||||
>>> sequence_a = "This is a short sequence."
|
||||
>>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
|
||||
|
||||
>>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
|
||||
>>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
|
||||
```
|
||||
|
||||
Las versiones codificadas tienen longitudes diferentes:
|
||||
|
||||
```python
|
||||
>>> len(encoded_sequence_a), len(encoded_sequence_b)
|
||||
(8, 19)
|
||||
```
|
||||
|
||||
Por lo tanto, no podemos colocarlas juntas en el mismo tensor tal cual. La primera secuencia necesita ser rellenada hasta la longitud de la segunda, o la segunda necesita ser truncada hasta la longitud de la primera.
|
||||
|
||||
En el primer caso, la lista de IDs se extenderá con los índices de relleno. Podemos pasar una lista al tokenizador y pedirle que realice el relleno de esta manera:
|
||||
|
||||
```python
|
||||
>>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
|
||||
```
|
||||
|
||||
Podemos ver que se han agregado ceros a la derecha de la primera oración para que tenga la misma longitud que la segunda:
|
||||
|
||||
```python
|
||||
>>> padded_sequences["input_ids"]
|
||||
[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
|
||||
```
|
||||
|
||||
Esto luego se puede convertir en un tensor en PyTorch o TensorFlow. La máscara de atención es un tensor binario que indica la posición de los índices de relleno para que el modelo no los tenga en cuenta. Para el [`BertTokenizer`], `1` indica un valor al que se debe prestar atención, mientras que `0` indica un valor de relleno. Esta máscara de atención está en el diccionario devuelto por el tokenizador bajo la clave "attention_mask":
|
||||
|
||||
```python
|
||||
>>> padded_sequences["attention_mask"]
|
||||
[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
|
||||
```
|
||||
|
||||
### autoencoding models
|
||||
|
||||
Consulta [modelos de codificación](#encoder-models) y [modelado de lenguaje enmascarado](#masked-language-modeling-mlm)
|
||||
|
||||
### autoregressive models
|
||||
|
||||
Consulta [modelado de lenguaje causal](#causal-language-modeling) y [modelos de decodificación](#decoder-models)
|
||||
|
||||
## B
|
||||
|
||||
### backbone
|
||||
|
||||
La columna vertebral, backbone en inglés, es la red (embeddings y layers) que produce los estados ocultos o características crudas. Normalmente, está conectada a una [cabecera](#head), que acepta las características como entrada para hacer una predicción. Por ejemplo, [`ViTModel`] es una columna vertebral sin una cabecera específica encima. Otros modelos también pueden usar [`ViTModel`] como columna vertebral, como por ejemplo [DPT](model_doc/dpt).
|
||||
|
||||
## C
|
||||
|
||||
### causal language modeling
|
||||
|
||||
Una tarea de preentrenamiento donde el modelo lee los textos en orden y tiene que predecir la siguiente palabra. Generalmente, se realiza leyendo toda la oración, pero utilizando una máscara dentro del modelo para ocultar los tokens futuros en un cierto paso de tiempo.
|
||||
|
||||
### channel
|
||||
|
||||
Las imágenes a color están compuestas por alguna combinación de valores en tres canales: rojo, verde y azul (RGB), y las imágenes en escala de grises solo tienen un canal. En 🤗 Transformers, el canal puede ser la primera o última dimensión del tensor de una imagen: [`n_channels`, `height`, `width`] o [`height`, `width`, `n_channels`].
|
||||
|
||||
### connectionist temporal classification (CTC)
|
||||
|
||||
Un algoritmo que permite que un modelo aprenda sin saber exactamente cómo están alineadas la entrada y la salida; CTC calcula la distribución de todas las salidas posibles para una entrada dada y elige la salida más probable de ella. CTC se utiliza comúnmente en tareas de reconocimiento de voz porque el habla no siempre se alinea perfectamente con la transcripción debido a diversas razones, como las diferentes velocidades de habla de los oradores.
|
||||
|
||||
### convolution
|
||||
|
||||
Un tipo de capa en una red neuronal donde la matriz de entrada se multiplica elemento por elemento por una matriz más pequeña (núcleo o filtro) y los valores se suman en una nueva matriz. Esto se conoce como una operación de convolución que se repite sobre toda la matriz de entrada. Cada operación se aplica a un segmento diferente de la matriz de entrada. Las redes neuronales convolucionales (CNN) se utilizan comúnmente en visión por computadora.
|
||||
|
||||
## D
|
||||
|
||||
### DataParallel (DP)
|
||||
|
||||
Técnica de paralelismo para entrenamiento en múltiples GPUs donde se replica la misma configuración varias veces, con cada instancia recibiendo una porción de datos única. El procesamiento se realiza en paralelo y todas las configuraciones se sincronizan al final de cada paso de entrenamiento.
|
||||
|
||||
Obtén más información sobre cómo funciona el DataParallel [aquí](perf_train_gpu_many#dataparallel-vs-distributeddataparallel).
|
||||
|
||||
### decoder input IDs
|
||||
|
||||
Esta entrada es específica para modelos codificador-decodificador y contiene los IDs de entrada que se enviarán al decodificador. Estas entradas deben usarse para tareas de secuencia a secuencia, como traducción o resumen, y generalmente se construyen de una manera específica para cada modelo.
|
||||
|
||||
La mayoría de los modelos codificador-decodificador (BART, T5) crean sus `decoder_input_ids` por sí mismos a partir de las `labels`. En tales modelos, pasar las `labels` es la forma preferida de manejar el entrenamiento.
|
||||
|
||||
Consulta la documentación de cada modelo para ver cómo manejan estos IDs de entrada para el entrenamiento de secuencia a secuencia.
|
||||
|
||||
### decoder models
|
||||
|
||||
También conocidos como modelos autorregresivos, los modelos decodificadores involucran una tarea de preentrenamiento (llamada modelado de lenguaje causal) donde el modelo lee los textos en orden y tiene que predecir la siguiente palabra. Generalmente, se realiza leyendo la oración completa con una máscara para ocultar los tokens futuros en un cierto paso de tiempo.
|
||||
|
||||
<Youtube id="d_ixlCubqQw"/>
|
||||
|
||||
### deep learning (DL)
|
||||
|
||||
Algoritmos de aprendizaje automático que utilizan redes neuronales con varias capas.
|
||||
|
||||
## E
|
||||
|
||||
### encoder models
|
||||
|
||||
También conocidos como modelos de codificación automática (autoencoding models), los modelos codificadores toman una entrada (como texto o imágenes) y las transforman en una representación numérica condensada llamada embedding. A menudo, los modelos codificadores se entrenan previamente utilizando técnicas como el [modelado de lenguaje enmascarado](#masked-language-modeling-mlm), que enmascara partes de la secuencia de entrada y obliga al modelo a crear representaciones más significativas.
|
||||
|
||||
<Youtube id="H39Z_720T5s"/>
|
||||
|
||||
## F
|
||||
|
||||
### feature extraction
|
||||
|
||||
El proceso de seleccionar y transformar datos crudos en un conjunto de características más informativas y útiles para algoritmos de aprendizaje automático. Algunos ejemplos de extracción de características incluyen transformar texto crudo en embeddings de palabras y extraer características importantes como bordes o formas de datos de imágenes/videos.
|
||||
|
||||
### feed forward chunking
|
||||
|
||||
En cada bloque de atención residual en los transformadores, la capa de autoatención suele ir seguida de 2 capas de avance. El tamaño de embedding intermedio de las capas de avance suele ser mayor que el tamaño oculto del modelo (por ejemplo, para `bert-base-uncased`).
|
||||
|
||||
Para una entrada de tamaño `[batch_size, sequence_length]`, la memoria requerida para almacenar los embeddings intermedios de avance `[batch_size, sequence_length, config.intermediate_size]` puede representar una gran fracción del uso de memoria. Los autores de [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) observaron que, dado que el cálculo es independiente de la dimensión `sequence_length`, es matemáticamente equivalente calcular los embeddings de salida de ambas capas de avance `[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n` individualmente y concatenarlos después a `[batch_size, sequence_length, config.hidden_size]` con `n = sequence_length`, lo que intercambia el aumento del tiempo de cálculo por una reducción en el uso de memoria, pero produce un resultado matemáticamente **equivalente**.
|
||||
|
||||
Para modelos que utilizan la función [`apply_chunking_to_forward`], el `chunk_size` define el número de embeddings de salida que se calculan en paralelo y, por lo tanto, define el equilibrio entre la complejidad de memoria y tiempo. Si `chunk_size` se establece en 0, no se realiza ninguna fragmentación de avance.
|
||||
|
||||
### finetuned models
|
||||
|
||||
El ajuste fino es una forma de transferencia de aprendizaje que implica tomar un modelo entrenado previamente, congelar sus pesos y reemplazar la capa de salida con una nueva [cabecera de modelo](#head) recién añadida. La cabecera del modelo se entrena en tu conjunto de datos objetivo.
|
||||
|
||||
Consulta el tutorial [Ajustar finamente un modelo pre-entrenado](https://huggingface.co/docs/transformers/training) para obtener más detalles y aprende cómo ajustar finamente modelos con 🤗 Transformers.
|
||||
|
||||
## H
|
||||
|
||||
### head
|
||||
|
||||
La cabecera del modelo se refiere a la última capa de una red neuronal que acepta los estados ocultos crudos y los proyecta en una dimensión diferente. Hay una cabecera de modelo diferente para cada tarea. Por ejemplo:
|
||||
|
||||
* [`GPT2ForSequenceClassification`] es una cabecera de clasificación de secuencias, es decir, una capa lineal, encima del modelo base [`GPT2Model`].
|
||||
* [`ViTForImageClassification`] es una cabecera de clasificación de imágenes, es decir, una capa lineal encima del estado oculto final del token `CLS`, encima del modelo base [`ViTModel`].
|
||||
* [`Wav2Vec2ForCTC`] es una cabecera de modelado de lenguaje con [CTC](#connectionist-temporal-classification-(CTC)) encima del modelo base [`Wav2Vec2Model`].
|
||||
|
||||
## I
|
||||
|
||||
### image patch
|
||||
|
||||
Los modelos de Transformers basados en visión dividen una imagen en parches más pequeños que se incorporan linealmente y luego se pasan como una secuencia al modelo. Puedes encontrar el `patch_size` (o resolución del modelo) en su configuración.
|
||||
|
||||
### inference
|
||||
|
||||
La inferencia es el proceso de evaluar un modelo en nuevos datos después de completar el entrenamiento. Consulta el tutorial [Pipeline for inference](https://huggingface.co/docs/transformers/pipeline_tutorial) para aprender cómo realizar inferencias con 🤗 Transformers.
|
||||
|
||||
### input IDs
|
||||
|
||||
Los IDs de entrada a menudo son los únicos parámetros necesarios que se deben pasar al modelo como entrada. Son índices de tokens, representaciones numéricas de tokens que construyen las secuencias que se utilizarán como entrada por el modelo.
|
||||
|
||||
<Youtube id="VFp38yj8h3A"/>
|
||||
|
||||
Cada tokenizador funciona de manera diferente, pero el mecanismo subyacente sigue siendo el mismo. Aquí tienes un ejemplo utilizando el tokenizador BERT, que es un tokenizador [WordPiece](https://arxiv.org/pdf/1609.08144.pdf):
|
||||
|
||||
```python
|
||||
>>> from transformers import BertTokenizer
|
||||
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
|
||||
|
||||
>>> sequence = "A Titan RTX has 24GB of VRAM"
|
||||
```
|
||||
|
||||
El tokenizador se encarga de dividir la secuencia en tokens disponibles en el vocabulario del tokenizador.
|
||||
|
||||
```python
|
||||
>>> tokenized_sequence = tokenizer.tokenize(sequence)
|
||||
```
|
||||
|
||||
Los tokens son palabras o sub palabras. Por ejemplo, "VRAM" no estaba en el vocabulario del modelo, así que se dividió
|
||||
en "V", "RA" y "M". Para indicar que estos tokens no son palabras separadas sino partes de la misma palabra, se añade un prefijo de doble almohadilla para "RA" y "M":
|
||||
|
||||
```python
|
||||
>>> print(tokenized_sequence)
|
||||
['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
|
||||
```
|
||||
|
||||
Estos tokens luego se pueden convertir en IDs que son comprensibles por el modelo. Esto se puede hacer alimentando directamente la oración al tokenizador, que aprovecha la implementación en Rust de [🤗 Tokenizers](https://github.com/huggingface/tokenizers) para obtener un rendimiento óptimo.
|
||||
|
||||
```python
|
||||
>>> inputs = tokenizer(sequence)
|
||||
```
|
||||
|
||||
El tokenizador devuelve un diccionario con todos los argumentos necesarios para que su modelo correspondiente funcione correctamente. Los índices de los tokens están bajo la clave `input_ids`:
|
||||
|
||||
```python
|
||||
>>> encoded_sequence = inputs["input_ids"]
|
||||
>>> print(encoded_sequence)
|
||||
[101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
|
||||
```
|
||||
|
||||
Ten en cuenta que el tokenizador añade automáticamente "tokens especiales" (si el modelo asociado depende de ellos), que son IDs especiales que el modelo utiliza en ocasiones.
|
||||
|
||||
Si descodificamos la secuencia anterior de IDs,
|
||||
|
||||
```python
|
||||
>>> decoded_sequence = tokenizer.decode(encoded_sequence)
|
||||
```
|
||||
|
||||
Veremos
|
||||
|
||||
```python
|
||||
>>> print(decoded_sequence)
|
||||
[CLS] A Titan RTX has 24GB of VRAM [SEP]
|
||||
```
|
||||
|
||||
Porque esta es la forma en que un [`BertModel`] espera sus entradas.
|
||||
|
||||
## L
|
||||
|
||||
### labels
|
||||
|
||||
Las etiquetas son un argumento opcional que se puede pasar para que el modelo calcule la pérdida por sí mismo. Estas etiquetas deberían ser la predicción esperada del modelo: usará la pérdida estándar para calcular la pérdida entre sus
|
||||
predicciones y el valor esperado (la etiqueta).
|
||||
|
||||
Estas etiquetas son diferentes según la cabecera del modelo, por ejemplo (ver el boceto después de esta lista):
|
||||
|
||||
- Para modelos de clasificación de secuencias ([`BertForSequenceClassification`]), el modelo espera un tensor de dimensión
|
||||
`(batch_size)` con cada valor del lote correspondiente a la etiqueta esperada de toda la secuencia.
|
||||
- Para modelos de clasificación de tokens ([`BertForTokenClassification`]), el modelo espera un tensor de dimensión
|
||||
`(batch_size, seq_length)` con cada valor correspondiente a la etiqueta esperada de cada token individual.
|
||||
- Para el modelado de lenguaje enmascarado ([`BertForMaskedLM`]), el modelo espera un tensor de dimensión `(batch_size, seq_length)` con cada valor correspondiente a la etiqueta esperada de cada token individual: las etiquetas son el ID del token enmascarado y los valores deben ignorarse para el resto (generalmente -100).
|
||||
- Para tareas de secuencia a secuencia ([`BartForConditionalGeneration`], [`MBartForConditionalGeneration`]), el modelo
|
||||
espera un tensor de dimensión `(batch_size, tgt_seq_length)` con cada valor correspondiente a las secuencias objetivo asociadas con cada secuencia de entrada. Durante el entrenamiento, tanto BART como T5 generarán internamente los `decoder_input_ids` y las máscaras de atención del decodificador. Por lo general, no es necesario suministrarlos. Esto no se aplica a los modelos que aprovechan el marco codificador-decodificador.
|
||||
- Para modelos de clasificación de imágenes ([`ViTForImageClassification`]), el modelo espera un tensor de dimensión
|
||||
`(batch_size)` con cada valor del lote correspondiente a la etiqueta esperada de cada imagen individual.
|
||||
- Para modelos de segmentación semántica ([`SegformerForSemanticSegmentation`]), el modelo espera un tensor de dimensión
|
||||
`(batch_size, height, width)` con cada valor del lote correspondiente a la etiqueta esperada de cada píxel individual.
|
||||
- Para modelos de detección de objetos ([`DetrForObjectDetection`]), el modelo espera una lista de diccionarios con claves `class_labels` y `boxes` donde cada valor del lote corresponde a la etiqueta esperada y el número de cajas delimitadoras de cada imagen individual.
|
||||
- Para modelos de reconocimiento automático de voz ([`Wav2Vec2ForCTC`]), el modelo espera un tensor de dimensión `(batch_size, target_length)` con cada valor correspondiente a la etiqueta esperada de cada token individual.
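Por ejemplo, un boceto mínimo (asumiendo un modelo de clasificación de secuencias con dos etiquetas) de cómo pasar `labels` para que el modelo calcule la pérdida por sí mismo:

```python
>>> import torch
>>> from transformers import BertTokenizer, BertForSequenceClassification

>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

>>> inputs = tokenizer("This is a short sequence.", return_tensors="pt")
>>> outputs = model(**inputs, labels=torch.tensor([1]))
>>> loss = outputs.loss  # la pérdida se calcula internamente a partir de las etiquetas
```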
|
||||
|
||||
<Tip>
|
||||
|
||||
Las etiquetas de cada modelo pueden ser diferentes, así que asegúrate siempre de revisar la documentación de cada modelo para obtener más información sobre sus etiquetas específicas.
|
||||
|
||||
</Tip>
|
||||
|
||||
Los modelos base ([`BertModel`]) no aceptan etiquetas, ya que estos son los modelos base de transformadores, que simplemente generan características.
|
||||
|
||||
### large language models (LLM)
|
||||
|
||||
Un término genérico que se refiere a modelos de lenguaje de transformadores (GPT-3, BLOOM, OPT) que fueron entrenados con una gran cantidad de datos. Estos modelos también tienden a tener un gran número de parámetros que se pueden aprender (por ejemplo, 175 mil millones para GPT-3).
|
||||
|
||||
## M
|
||||
|
||||
### masked language modeling (MLM)
|
||||
|
||||
Una tarea de preentrenamiento en la que el modelo ve una versión corrupta de los textos, generalmente hecha
|
||||
al enmascarar algunos tokens al azar, y tiene que predecir el texto original.
|
||||
|
||||
### multimodal
|
||||
|
||||
Una tarea que combina textos con otro tipo de entradas (por ejemplo: imágenes).
|
||||
|
||||
## N
|
||||
|
||||
### Natural language generation (NLG)
|
||||
|
||||
Todas las tareas relacionadas con la generación de texto (por ejemplo: [Escribe con Transformers](https://transformer.huggingface.co/) o traducción).
|
||||
|
||||
### Natural language processing (NLP)
|
||||
|
||||
Una forma genérica de decir "trabajar con textos".
|
||||
|
||||
### Natural language understanding (NLU)
|
||||
|
||||
Todas las tareas relacionadas con entender lo que hay en un texto (por ejemplo: clasificar el
|
||||
texto completo o palabras individuales).
|
||||
|
||||
## P
|
||||
|
||||
### Pipeline
|
||||
|
||||
Un pipeline en 🤗 Transformers es una abstracción que se refiere a una serie de pasos que se ejecutan en un orden específico para preprocesar y transformar datos y devolver una predicción de un modelo. Algunas etapas de ejemplo que se encuentran en un pipeline pueden ser el preprocesamiento de datos, la extracción de características y la normalización.
|
||||
|
||||
Para obtener más detalles, consulta [Pipelines para inferencia](https://huggingface.co/docs/transformers/pipeline_tutorial).
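Un boceto mínimo (asumiendo el modelo de análisis de sentimiento que el pipeline carga por defecto) que muestra cómo un pipeline encapsula estos pasos:

```python
>>> from transformers import pipeline

>>> clasificador = pipeline("sentiment-analysis")
>>> resultado = clasificador("I love using 🤗 Transformers!")  # preprocesamiento, inferencia y postprocesamiento en un solo paso
```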
|
||||
|
||||
### PipelineParallel (PP)
|
||||
|
||||
Técnica de paralelismo en la que el modelo se divide verticalmente (a nivel de capa) en varios GPU, de modo que solo una o varias capas del modelo se colocan en un solo GPU. Cada GPU procesa en paralelo diferentes etapas del pipeline y trabaja en un pequeño fragmento del lote. Obtén más información sobre cómo funciona PipelineParallel [aquí](perf_train_gpu_many#from-naive-model-parallelism-to-pipeline-parallelism).
|
||||
|
||||
### pixel values
|
||||
|
||||
Un tensor de las representaciones numéricas de una imagen que se pasa a un modelo. Los valores de píxeles tienen una forma de [`batch_size`, `num_channels`, `height`, `width`], y se generan a partir de un procesador de imágenes.
|
||||
|
||||
### pooling
|
||||
|
||||
Una operación que reduce una matriz a una matriz más pequeña, ya sea tomando el máximo o el promedio de la dimensión (o dimensiones) agrupada(s). Las capas de agrupación se encuentran comúnmente entre capas convolucionales para reducir la representación de características.
|
||||
|
||||
### position IDs
|
||||
|
||||
A diferencia de las RNN que tienen la posición de cada token incrustada en ellas, los transformers no son conscientes de la posición de cada token. Por lo tanto, se utilizan los IDs de posición (`position_ids`) para que el modelo identifique la posición de cada token en la lista de tokens.
|
||||
|
||||
Son un parámetro opcional. Si no se pasan `position_ids` al modelo, los IDs se crean automáticamente como embeddings de posición absolutas.
|
||||
|
||||
Los embeddings de posición absolutas se seleccionan en el rango `[0, config.max_position_embeddings - 1]`. Algunos modelos utilizan otros tipos de embeddings de posición, como embeddings de posición sinusoidales o embeddings de posición relativas.
|
||||
|
||||
### preprocessing
|
||||
|
||||
La tarea de preparar datos crudos en un formato que pueda ser fácilmente consumido por modelos de aprendizaje automático. Por ejemplo, el texto se preprocesa típicamente mediante la tokenización. Para tener una mejor idea de cómo es el preprocesamiento para otros tipos de entrada, consulta el tutorial [Pre-procesar](https://huggingface.co/docs/transformers/preprocessing).
|
||||
|
||||
### pretrained model
|
||||
|
||||
Un modelo que ha sido pre-entrenado en algunos datos (por ejemplo, toda Wikipedia). Los métodos de preentrenamiento involucran un objetivo auto-supervisado, que puede ser leer el texto e intentar predecir la siguiente palabra (ver [modelado de lenguaje causal](#causal-language-modeling)) o enmascarar algunas palabras e intentar predecirlas (ver [modelado de lenguaje enmascarado](#masked-language-modeling-mlm)).
|
||||
|
||||
Los modelos de habla y visión tienen sus propios objetivos de pre-entrenamiento. Por ejemplo, Wav2Vec2 es un modelo de habla pre-entrenado en una tarea contrastiva que requiere que el modelo identifique la representación de habla "verdadera" de un conjunto de representaciones de habla "falsas". Por otro lado, BEiT es un modelo de visión pre-entrenado en una tarea de modelado de imágenes enmascaradas que enmascara algunos de los parches de la imagen y requiere que el modelo prediga los parches enmascarados (similar al objetivo de modelado de lenguaje enmascarado).
|
||||
|
||||
## R
|
||||
|
||||
### recurrent neural network (RNN)
|
||||
|
||||
Un tipo de modelo que utiliza un bucle sobre una capa para procesar textos.
|
||||
|
||||
### representation learning
|
||||
|
||||
Un subcampo del aprendizaje automático que se centra en aprender representaciones significativas de datos en bruto. Algunos ejemplos de técnicas de aprendizaje de representaciones incluyen embeddings de palabras, auto-encoders y Redes Generativas Adversarias (Generative Adversarial Networks, GANs).
|
||||
|
||||
## S
|
||||
|
||||
### sampling rate
|
||||
|
||||
Una medida en hercios del número de muestras (la señal de audio) tomadas por segundo. La tasa de muestreo es el resultado de aproximar una señal continua como el habla.
|
||||
|
||||
### self-attention
|
||||
|
||||
Cada elemento de la entrada averigua a cuáles otros elementos de la entrada debe prestar atención.
|
||||
|
||||
### self-supervised learning
|
||||
|
||||
Una categoría de técnicas de aprendizaje automático en la que un modelo crea su propio objetivo de aprendizaje a partir de datos no etiquetados. Difiere del [aprendizaje no supervisado](#unsupervised-learning) y del [aprendizaje supervisado](#supervised-learning) en que el proceso de aprendizaje está supervisado, pero no explícitamente por el usuario.
|
||||
|
||||
Un ejemplo de aprendizaje auto-supervisado es el [modelado de lenguaje enmascarado](#masked-language-modeling-mlm), donde un modelo recibe oraciones con una proporción de sus tokens eliminados y aprende a predecir los tokens faltantes.
|
||||
|
||||
### semi-supervised learning
|
||||
|
||||
Una amplia categoría de técnicas de entrenamiento de aprendizaje automático que aprovecha una pequeña cantidad de datos etiquetados con una mayor cantidad de datos no etiquetados para mejorar la precisión de un modelo, a diferencia del [aprendizaje supervisado](#supervised-learning) y del [aprendizaje no supervisado](#unsupervised-learning).
|
||||
|
||||
Un ejemplo de un enfoque de aprendizaje semi-supervisado es "auto-entrenamiento", en el que un modelo se entrena con datos etiquetados y luego se utiliza para hacer predicciones sobre los datos no etiquetados. La porción de datos no etiquetados que el modelo predice con mayor confianza se agrega al conjunto de datos etiquetados y se utiliza para volver a entrenar el modelo.
|
||||
|
||||
### sequence-to-sequence (seq2seq)
|
||||
|
||||
Modelos que generan una nueva secuencia a partir de una entrada, como modelos de traducción o modelos de resumen (como
|
||||
[Bart](model_doc/bart) o [T5](model_doc/t5)).
|
||||
|
||||
### Sharded DDP
|
||||
|
||||
Otro nombre para el concepto fundamental de [ZeRO](#zero-redundancy-optimizer--zero-) utilizado por varias otras implementaciones de ZeRO.
|
||||
|
||||
### stride
|
||||
|
||||
En [convolución](#convolution) o [agrupación](#pooling), el paso (stride) se refiere a la distancia que recorre el núcleo sobre una matriz. Un paso de 1 significa que el núcleo se mueve un píxel a la vez, y un paso de 2 significa que el núcleo se mueve dos píxeles a la vez.
|
||||
|
||||
### supervised learning
|
||||
|
||||
Una forma de entrenamiento de modelos que utiliza directamente datos etiquetados para corregir y dirigir el rendimiento del modelo. Los datos se introducen en el modelo en entrenamiento, y sus predicciones se comparan con las etiquetas conocidas. El modelo actualiza sus pesos en función de cuán incorrectas fueron sus predicciones, y el proceso se repite para optimizar el rendimiento del modelo.
|
||||
|
||||
## T
|
||||
|
||||
### Tensor Parallelism (TP)
|
||||
|
||||
Técnica de paralelismo para entrenamiento en múltiples GPU en la que cada tensor se divide en múltiples fragmentos, de modo que en lugar de tener todo el tensor en una sola GPU, cada fragmento del tensor reside en su GPU designada. Los fragmentos se procesan por separado y en paralelo en diferentes GPU y los resultados se sincronizan al final del paso de procesamiento. Esto es lo que a veces se llama paralelismo horizontal, ya que la división ocurre a nivel horizontal.
|
||||
Obtén más información sobre el Paralelismo de Tensores [aquí](perf_train_gpu_many#tensor-parallelism).
|
||||
|
||||
### token
|
||||
|
||||
Parte de una oración, generalmente una palabra, pero también puede ser una sub-palabra (las palabras no comunes a menudo se dividen en sub-palabras) o un símbolo de puntuación.
|
||||
|
||||
### token Type IDs
|
||||
|
||||
Algunos modelos tienen como objetivo realizar clasificación en pares de oraciones o responder preguntas.
|
||||
|
||||
<Youtube id="0u3ioSwev3s"/>
|
||||
|
||||
Estos requieren que dos secuencias diferentes se unan en una única entrada "input_ids", lo cual generalmente se realiza con
|
||||
la ayuda de tokens especiales, como el token de clasificación (`[CLS]`) y el token separador (`[SEP]`). Por ejemplo, el modelo BERT construye sus dos secuencias de entrada de la siguiente manera:
|
||||
|
||||
```python
|
||||
>>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
|
||||
```
|
||||
|
||||
Podemos utilizar nuestro tokenizador para generar automáticamente una oración de este tipo al pasar las dos secuencias a `tokenizer` como dos argumentos (y no como una lista, como antes) de la siguiente manera:
|
||||
|
||||
```python
|
||||
>>> from transformers import BertTokenizer
|
||||
|
||||
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
|
||||
>>> sequence_a = "HuggingFace is based in NYC"
|
||||
>>> sequence_b = "Where is HuggingFace based?"
|
||||
|
||||
>>> encoded_dict = tokenizer(sequence_a, sequence_b)
|
||||
>>> decoded = tokenizer.decode(encoded_dict["input_ids"])
|
||||
```
|
||||
|
||||
Que devolverá:
|
||||
|
||||
```python
|
||||
>>> print(decoded)
|
||||
[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
|
||||
```
|
||||
|
||||
Esto es suficiente para que algunos modelos comprendan dónde termina una secuencia y comienza otra. Sin embargo, otros modelos, como BERT, también utilizan identificadores de tipo de token (también llamados identificadores de segmento). Se representan como una máscara binaria que identifica los dos tipos de secuencia en el modelo.
|
||||
|
||||
El tokenizador devuelve esta máscara como la entrada "token_type_ids":
|
||||
|
||||
```python
|
||||
>>> encoded_dict["token_type_ids"]
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
```
|
||||
|
||||
La primera secuencia, el "contexto" utilizado para la pregunta, tiene todos sus tokens representados por un `0`, mientras que la segunda secuencia, correspondiente a la "pregunta", tiene todos sus tokens representados por un `1`.
|
||||
|
||||
Algunos modelos, como [`XLNetModel`], utilizan un token adicional representado por un `2`.
|
||||
|
||||
### transfer learning
|
||||
|
||||
Una técnica que implica tomar un modelo pre-entrenado y adaptarlo a un conjunto de datos específico para tu tarea. En lugar de entrenar un modelo desde cero, puedes aprovechar el conocimiento obtenido de un modelo existente como punto de partida. Esto acelera el proceso de aprendizaje y reduce la cantidad de datos de entrenamiento necesarios.
|
||||
|
||||
### transformer
|
||||
|
||||
Arquitectura de modelo de aprendizaje profundo basada en auto-atención (Self-attention).
|
||||
|
||||
## U
|
||||
|
||||
### unsupervised learning
|
||||
|
||||
Una forma de entrenamiento de modelos en la que los datos proporcionados al modelo no están etiquetados. Las técnicas de aprendizaje no supervisado aprovechan la información estadística de la distribución de datos para encontrar patrones útiles para la tarea en cuestión.
|
||||
|
||||
## Z
|
||||
|
||||
### Zero Redundancy Optimizer (ZeRO)
|
||||
|
||||
Técnica de paralelismo que realiza la fragmentación de los tensores de manera algo similar a [TensorParallel](#tensor-parallelism-tp), excepto que todo el tensor se reconstruye a tiempo para una computación hacia adelante o hacia atrás, por lo tanto, el modelo no necesita ser modificado. Este método también admite diversas técnicas de descarga para compensar la memoria limitada de la GPU. Obtén más información sobre ZeRO [aquí](perf_train_gpu_many#zero-data-parallelism).
|
||||
@@ -292,6 +292,14 @@
|
||||
title: ConvBERT
|
||||
- local: model_doc/cpm
|
||||
title: CPM
|
||||
- local: model_doc/cpmant
|
||||
title: CPMANT
|
||||
- local: model_doc/ctrl
|
||||
title: CTRL
|
||||
- local: model_doc/deberta
|
||||
title: DeBERTa
|
||||
- local: model_doc/deberta-v2
|
||||
title: DeBERTa-v2
|
||||
title: 文章モデル
|
||||
- isExpanded: false
|
||||
sections:
|
||||
@@ -305,6 +313,10 @@
|
||||
title: ConvNeXT
|
||||
- local: model_doc/convnextv2
|
||||
title: ConvNeXTV2
|
||||
- local: model_doc/cvt
|
||||
title: CvT
|
||||
- local: model_doc/deformable_detr
|
||||
title: Deformable DETR
|
||||
title: ビジョンモデル
|
||||
- isExpanded: false
|
||||
sections:
|
||||
@@ -337,7 +349,14 @@
|
||||
title: CLIPSeg
|
||||
- local: model_doc/clvp
|
||||
title: CLVP
|
||||
- local: model_doc/data2vec
|
||||
title: Data2Vec
|
||||
title: マルチモーダルモデル
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: model_doc/decision_transformer
|
||||
title: Decision Transformer
|
||||
title: 強化学習モデル
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: model_doc/autoformer
|
||||
|
||||
docs/source/ja/model_doc/cpmant.md (new file, 47 lines)
@@ -0,0 +1,47 @@
|
||||
<!--Copyright 2022 The HuggingFace Team and The OpenBMB Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# CPMAnt
|
||||
|
||||
## Overview
|
||||
|
||||
CPM-Ant は、10B パラメータを備えたオープンソースの中国語の事前トレーニング済み言語モデル (PLM) です。これは、CPM-Live のライブ トレーニング プロセスの最初のマイルストーンでもあります。トレーニングプロセスは費用対効果が高く、環境に優しいものです。 CPM-Ant は、CUGE ベンチマークでのデルタ チューニングでも有望な結果を達成しています。フル モデルに加えて、さまざまなハードウェア構成の要件を満たすさまざまな圧縮バージョンも提供しています。 [詳細を見る](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live)
|
||||
|
||||
このモデルは [OpenBMB](https://huggingface.co/openbmb) によって提供されました。元のコードは [ここ](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live) にあります。
|
||||
|
||||
## Resources
|
||||
|
||||
- [CPM-Live](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live) に関するチュートリアル。
|
||||
|
||||
## CpmAntConfig
|
||||
|
||||
[[autodoc]] CpmAntConfig
|
||||
- all
|
||||
|
||||
## CpmAntTokenizer
|
||||
|
||||
[[autodoc]] CpmAntTokenizer
|
||||
- all
|
||||
|
||||
## CpmAntModel
|
||||
|
||||
[[autodoc]] CpmAntModel
|
||||
- all
|
||||
|
||||
## CpmAntForCausalLM
|
||||
|
||||
[[autodoc]] CpmAntForCausalLM
|
||||
- all
|
||||
docs/source/ja/model_doc/ctrl.md (new file, 113 lines)
@@ -0,0 +1,113 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# CTRL
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<a href="https://huggingface.co/models?filter=ctrl">
|
||||
<img alt="Models" src="https://img.shields.io/badge/All_model_pages-ctrl-blueviolet">
|
||||
</a>
|
||||
<a href="https://huggingface.co/spaces/docs-demos/tiny-ctrl">
|
||||
<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
## Overview
|
||||
|
||||
CTRL モデルは、Nitish Shirish Keskar、Bryan McCann、Lav R. Varshney、Caiming Xiong、Richard Socher によって [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) で提案されました。これは、最初のトークンが制御コード (リンク、書籍、Wikipedia など) として予約された約 140 GB のテキスト データという非常に大規模なコーパス上で、言語モデリングによって事前トレーニングされた因果的 (一方向) トランスフォーマーです。
|
||||
|
||||
論文の要約は次のとおりです。
|
||||
|
||||
*大規模な言語モデルは有望なテキスト生成機能を示していますが、ユーザーは特定の言語モデルを簡単に制御できません
|
||||
生成されたテキストの側面。 16 億 3,000 万パラメータの条件付きトランスフォーマー言語モデルである CTRL をリリースします。
|
||||
スタイル、コンテンツ、タスク固有の動作を制御する制御コードを条件付けるように訓練されています。制御コードは
|
||||
生のテキストと自然に共生する構造から派生し、教師なし学習の利点を維持しながら、
|
||||
テキスト生成をより明示的に制御できるようになります。これらのコードを使用すると、CTRL でどの部分が予測されるのかを予測することもできます。
|
||||
トレーニング データにはシーケンスが与えられる可能性が最も高くなります。これにより、大量のデータを分析するための潜在的な方法が提供されます。
|
||||
モデルベースのソース帰属を介して。*
|
||||
|
||||
このモデルは、[keskarnitishr](https://huggingface.co/keskarnitishr) によって提供されました。元のコードは [こちら](https://github.com/salesforce/ctrl) にあります。
|
||||
|
||||
## Usage tips
|
||||
|
||||
- CTRL は制御コードを利用してテキストを生成します。一貫したテキストを生成するには、特定の単語、文、またはリンクで生成を開始する必要があります。詳細については、[元の実装](https://github.com/salesforce/ctrl) を参照してください。
- CTRL は絶対位置埋め込みを備えたモデルであるため、通常は入力を左側ではなく右側にパディングすることをお勧めします。
- CTRL は因果言語モデリング (CLM) の目的でトレーニングされているため、シーケンス内の次のトークンの予測に強力です。この機能を利用すると、*run_generation.py* サンプル スクリプトで確認できるように、CTRL は構文的に一貫したテキストを生成できます。
|
||||
- PyTorch モデルは、事前に計算されたキーと値のアテンション ペアである `past_key_values` を入力として受け取ることができます。TensorFlow モデルは `past` を入力として受け入れます。`past_key_values` を使用すると、テキスト生成時に事前計算済みの値をモデルが再計算せずに済みます。この引数の使用法の詳細については、[`forward`](model_doc/ctrl#transformers.CTRLModel.forward) メソッドを参照してください。以下に生成のスケッチも示します。
|
||||
|
||||
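以下は、制御コードを先頭に付けてテキストを生成し、`use_cache=True` によって `past_key_values` を内部的に再利用する最小限のスケッチです (チェックポイント名 `Salesforce/ctrl` は一例であり、実行環境に合わせて読み替えてください)。

```python
# 最小限のスケッチ: 制御コード "Links" を先頭に付けて条件付き生成を行います
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel

tokenizer = CTRLTokenizer.from_pretrained("Salesforce/ctrl")  # チェックポイント名は一例
model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

# プロンプトの最初のトークンを制御コードにします
inputs = tokenizer("Links Hello, my dog is cute", return_tensors="pt")

# use_cache=True により past_key_values が再利用され、生成時の再計算が回避されます
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=20, use_cache=True)

print(tokenizer.decode(output_ids[0]))
```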
## Resources
|
||||
|
||||
- [テキスト分類タスクガイド](../tasks/sequence_classification)
|
||||
- [因果言語モデリング タスク ガイド](../tasks/language_modeling)
|
||||
|
||||
## CTRLConfig
|
||||
|
||||
[[autodoc]] CTRLConfig
|
||||
|
||||
## CTRLTokenizer
|
||||
|
||||
[[autodoc]] CTRLTokenizer
|
||||
- save_vocabulary
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
## CTRLModel
|
||||
|
||||
[[autodoc]] CTRLModel
|
||||
- forward
|
||||
|
||||
## CTRLLMHeadModel
|
||||
|
||||
[[autodoc]] CTRLLMHeadModel
|
||||
- forward
|
||||
|
||||
## CTRLForSequenceClassification
|
||||
|
||||
[[autodoc]] CTRLForSequenceClassification
|
||||
- forward
|
||||
|
||||
</pt>
|
||||
<tf>
|
||||
|
||||
## TFCTRLModel
|
||||
|
||||
[[autodoc]] TFCTRLModel
|
||||
- call
|
||||
|
||||
## TFCTRLLMHeadModel
|
||||
|
||||
[[autodoc]] TFCTRLLMHeadModel
|
||||
- call
|
||||
|
||||
## TFCTRLForSequenceClassification
|
||||
|
||||
[[autodoc]] TFCTRLForSequenceClassification
|
||||
- call
|
||||
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
docs/source/ja/model_doc/cvt.md (new file, 88 lines)
@ -0,0 +1,88 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Convolutional Vision Transformer (CvT)
|
||||
|
||||
## Overview
|
||||
|
||||
CvT モデルは、Haping Wu、Bin Xiao、Noel Codella、Mengchen Liu、Xiyang Dai、Lu Yuan、Lei Zhang によって [CvT: Introduction Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) で提案されました。畳み込みビジョン トランスフォーマー (CvT) は、ViT に畳み込みを導入して両方の設計の長所を引き出すことにより、[ビジョン トランスフォーマー (ViT)](vit) のパフォーマンスと効率を向上させます。
|
||||
|
||||
論文の要約は次のとおりです。
|
||||
|
||||
*この論文では、ビジョン トランスフォーマー (ViT) を改善する、畳み込みビジョン トランスフォーマー (CvT) と呼ばれる新しいアーキテクチャを紹介します。
|
||||
ViT に畳み込みを導入して両方の設計の長所を引き出すことで、パフォーマンスと効率を向上させます。これは次のようにして実現されます。
|
||||
2 つの主要な変更: 新しい畳み込みトークンの埋め込みを含むトランスフォーマーの階層と、畳み込みトランスフォーマー
|
||||
畳み込み射影を利用したブロック。これらの変更により、畳み込みニューラル ネットワーク (CNN) の望ましい特性が導入されます。
|
||||
トランスフォーマーの利点 (動的な注意力、
|
||||
グローバルなコンテキストとより良い一般化)。私たちは広範な実験を実施することで CvT を検証し、このアプローチが達成できることを示しています。
|
||||
ImageNet-1k 上の他のビジョン トランスフォーマーや ResNet よりも、パラメータが少なく、FLOP が低い、最先端のパフォーマンスを実現します。加えて、
|
||||
より大きなデータセット (例: ImageNet-22k) で事前トレーニングし、下流のタスクに合わせて微調整すると、パフォーマンスの向上が維持されます。事前トレーニング済み
|
||||
ImageNet-22k、当社の CvT-W24 は、ImageNet-1k val set で 87.7\% というトップ 1 の精度を獲得しています。最後に、私たちの結果は、位置エンコーディングが、
|
||||
既存のビジョン トランスフォーマーの重要なコンポーネントであるこのコンポーネントは、モデルでは安全に削除できるため、高解像度のビジョン タスクの設計が簡素化されます。*
|
||||
|
||||
このモデルは [anugunj](https://huggingface.co/anugunj) によって提供されました。元のコードは [ここ](https://github.com/microsoft/CvT) にあります。
|
||||
|
||||
## Usage tips
|
||||
|
||||
- CvT モデルは通常の Vision Transformer ですが、畳み込みを用いてトレーニングされています。ImageNet-1K および CIFAR-100 で微調整すると、[オリジナル モデル (ViT)](vit) よりも優れたパフォーマンスを発揮します。
- カスタム データでの微調整や推論に関するデモ ノートブックは [ここ](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer) で確認できます ([`ViTFeatureExtractor`] を [`AutoImageProcessor`] に、[`ViTForImageClassification`] を [`CvtForImageClassification`] に置き換えるだけで済みます)。推論の簡単なスケッチも以下に示します。
- 利用可能なチェックポイントは、(1) [ImageNet-22k](http://www.image-net.org/) (1,400 万の画像と 22,000 クラスのコレクション) でのみ事前トレーニングされたもの、(2) さらに ImageNet-22k で微調整されたもの、(3) [ImageNet-1k](http://www.image-net.org/challenges/LSVRC/2012/) (ILSVRC 2012 とも呼ばれる、130 万の画像と 1,000 クラスのコレクション) でも微調整されたもの、のいずれかです。
|
||||
|
||||
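参考までに、画像分類の推論の最小限のスケッチを示します (チェックポイント名 `microsoft/cvt-13` と画像 URL は一例です)。

```python
# 最小限のスケッチ: CvT による画像分類の推論
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, CvtForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # 一例の画像
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")  # 一例のチェックポイント
model = CvtForImageClassification.from_pretrained("microsoft/cvt-13")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])
```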
## Resources
|
||||
|
||||
CvT を始めるのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示される) リソースのリスト。
|
||||
|
||||
<PipelineTag pipeline="image-classification"/>
|
||||
|
||||
- [`CvtForImageClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)。
|
||||
- 参照: [画像分類タスク ガイド](../tasks/image_classification)
|
||||
|
||||
ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
|
||||
|
||||
## CvtConfig
|
||||
|
||||
[[autodoc]] CvtConfig
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
## CvtModel
|
||||
|
||||
[[autodoc]] CvtModel
|
||||
- forward
|
||||
|
||||
## CvtForImageClassification
|
||||
|
||||
[[autodoc]] CvtForImageClassification
|
||||
- forward
|
||||
|
||||
</pt>
|
||||
<tf>
|
||||
|
||||
## TFCvtModel
|
||||
|
||||
[[autodoc]] TFCvtModel
|
||||
- call
|
||||
|
||||
## TFCvtForImageClassification
|
||||
|
||||
[[autodoc]] TFCvtForImageClassification
|
||||
- call
|
||||
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
docs/source/ja/model_doc/data2vec.md (new file, 187 lines)
@ -0,0 +1,187 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Data2Vec
|
||||
|
||||
## Overview
|
||||
|
||||
Data2Vec モデルは、Alexei Baevski、Wei-Ning Hsu、Qiantong Xu、Arun Babu、Jiatao Gu、Michael Auli によって [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/pdf/2202.03555) で提案されました。
|
||||
Data2Vec は、テキスト、音声、画像などのさまざまなデータ モダリティにわたる自己教師あり学習のための統一フレームワークを提案します。
|
||||
重要なのは、事前トレーニングの予測ターゲットは、モダリティ固有のコンテキストに依存しないターゲットではなく、入力のコンテキスト化された潜在表現であることです。
|
||||
|
||||
論文の要約は次のとおりです。
|
||||
|
||||
*自己教師あり学習の一般的な考え方はどのモダリティでも同じですが、実際のアルゴリズムと
|
||||
単一のモダリティを念頭に置いて開発されたため、目的は大きく異なります。一般に近づけるために
|
||||
自己教師あり学習では、どちらの音声に対しても同じ学習方法を使用するフレームワークである data2vec を紹介します。
|
||||
NLP またはコンピューター ビジョン。中心となるアイデアは、完全な入力データの潜在的な表現を、
|
||||
標準の Transformer アーキテクチャを使用した自己蒸留セットアップの入力のマスクされたビュー。
|
||||
単語、視覚的トークン、人間の音声単位などのモダリティ固有のターゲットを予測するのではなく、
|
||||
本質的にローカルであるため、data2vec は、からの情報を含む文脈化された潜在表現を予測します。
|
||||
入力全体。音声認識、画像分類、および
|
||||
自然言語理解は、新しい最先端技術や、主流のアプローチに匹敵するパフォーマンスを実証します。
|
||||
モデルとコードは、www.github.com/pytorch/fairseq/tree/master/examples/data2vec.* で入手できます。
|
||||
|
||||
このモデルは、[edugp](https://huggingface.co/edugp) および [patrickvonplaten](https://huggingface.co/patrickvonplaten) によって提供されました。
|
||||
[sayakpaul](https://github.com/sayakpaul) と [Rocketknight1](https://github.com/Rocketknight1) は、ビジョン向け Data2Vec の TensorFlow 実装を提供しました。
|
||||
|
||||
元のコード (NLP および音声用) は、[こちら](https://github.com/pytorch/fairseq/tree/main/examples/data2vec) にあります。
|
||||
ビジョンの元のコードは [こちら](https://github.com/facebookresearch/data2vec_vision/tree/main/beit) にあります。
|
||||
|
||||
## Usage tips
|
||||
|
||||
- Data2VecAudio、Data2VecText、および Data2VecVision はすべて、同じ自己教師あり学習方法を使用してトレーニングされています。
|
||||
- Data2VecAudio の場合、前処理は特徴抽出を含めて [`Wav2Vec2Model`] と同じです。
|
||||
- Data2VecText の場合、前処理はトークン化を含めて [`RobertaModel`] と同じです。
|
||||
- Data2VecVision の場合、前処理は特徴抽出を含めて [`BeitModel`] と同じです。前処理が既存モデルと共通であることを示す簡単なスケッチを以下に示します。
|
||||
|
||||
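以下は、Data2VecText が RoBERTa と同じトークナイザー/前処理を使うことを示す、マスク予測の最小限のスケッチです (チェックポイント名 `facebook/data2vec-text-base` は一例です)。

```python
# 最小限のスケッチ: Data2VecText によるマスク トークンの予測
import torch
from transformers import AutoTokenizer, Data2VecTextForMaskedLM

name = "facebook/data2vec-text-base"  # 一例のチェックポイント
tokenizer = AutoTokenizer.from_pretrained(name)
model = Data2VecTextForMaskedLM.from_pretrained(name)

inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# <mask> 位置の予測トークンを取り出します
mask_positions = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_ids = logits[0, mask_positions].argmax(-1)
print(tokenizer.decode(predicted_ids))
```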
## Resources
|
||||
|
||||
Data2Vec の使用を開始するのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示される) リソースのリスト。
|
||||
|
||||
<PipelineTag pipeline="image-classification"/>
|
||||
|
||||
- [`Data2VecVisionForImageClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb) でサポートされています。
- カスタム データセットで [`TFData2VecVisionForImageClassification`] を微調整するには、[このノートブック](https://colab.research.google.com/github/sayakpaul/TF-2.0-Hacks/blob/master/data2vec_vision_image_classification.ipynb) を参照してください。
|
||||
|
||||
**Data2VecText ドキュメント リソース**
|
||||
- [テキスト分類タスクガイド](../tasks/sequence_classification)
|
||||
- [トークン分類タスクガイド](../tasks/token_classification)
|
||||
- [質問回答タスク ガイド](../tasks/question_answering)
|
||||
- [因果言語モデリング タスク ガイド](../tasks/language_modeling)
|
||||
- [マスク言語モデリング タスク ガイド](../tasks/masked_language_modeling)
|
||||
- [多肢選択タスク ガイド](../tasks/multiple_choice)
|
||||
|
||||
**Data2VecAudio ドキュメント リソース**
|
||||
- [音声分類タスクガイド](../tasks/audio_classification)
|
||||
- [自動音声認識タスクガイド](../tasks/asr)
|
||||
|
||||
**Data2VecVision ドキュメント リソース**
|
||||
- [画像分類](../tasks/image_classification)
|
||||
- [セマンティック セグメンテーション](../tasks/semantic_segmentation)
|
||||
|
||||
ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
|
||||
|
||||
## Data2VecTextConfig
|
||||
|
||||
[[autodoc]] Data2VecTextConfig
|
||||
|
||||
## Data2VecAudioConfig
|
||||
|
||||
[[autodoc]] Data2VecAudioConfig
|
||||
|
||||
## Data2VecVisionConfig
|
||||
|
||||
[[autodoc]] Data2VecVisionConfig
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
## Data2VecAudioModel
|
||||
|
||||
[[autodoc]] Data2VecAudioModel
|
||||
- forward
|
||||
|
||||
## Data2VecAudioForAudioFrameClassification
|
||||
|
||||
[[autodoc]] Data2VecAudioForAudioFrameClassification
|
||||
- forward
|
||||
|
||||
## Data2VecAudioForCTC
|
||||
|
||||
[[autodoc]] Data2VecAudioForCTC
|
||||
- forward
|
||||
|
||||
## Data2VecAudioForSequenceClassification
|
||||
|
||||
[[autodoc]] Data2VecAudioForSequenceClassification
|
||||
- forward
|
||||
|
||||
## Data2VecAudioForXVector
|
||||
|
||||
[[autodoc]] Data2VecAudioForXVector
|
||||
- forward
|
||||
|
||||
## Data2VecTextModel
|
||||
|
||||
[[autodoc]] Data2VecTextModel
|
||||
- forward
|
||||
|
||||
## Data2VecTextForCausalLM
|
||||
|
||||
[[autodoc]] Data2VecTextForCausalLM
|
||||
- forward
|
||||
|
||||
## Data2VecTextForMaskedLM
|
||||
|
||||
[[autodoc]] Data2VecTextForMaskedLM
|
||||
- forward
|
||||
|
||||
## Data2VecTextForSequenceClassification
|
||||
|
||||
[[autodoc]] Data2VecTextForSequenceClassification
|
||||
- forward
|
||||
|
||||
## Data2VecTextForMultipleChoice
|
||||
|
||||
[[autodoc]] Data2VecTextForMultipleChoice
|
||||
- forward
|
||||
|
||||
## Data2VecTextForTokenClassification
|
||||
|
||||
[[autodoc]] Data2VecTextForTokenClassification
|
||||
- forward
|
||||
|
||||
## Data2VecTextForQuestionAnswering
|
||||
|
||||
[[autodoc]] Data2VecTextForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## Data2VecVisionModel
|
||||
|
||||
[[autodoc]] Data2VecVisionModel
|
||||
- forward
|
||||
|
||||
## Data2VecVisionForImageClassification
|
||||
|
||||
[[autodoc]] Data2VecVisionForImageClassification
|
||||
- forward
|
||||
|
||||
## Data2VecVisionForSemanticSegmentation
|
||||
|
||||
[[autodoc]] Data2VecVisionForSemanticSegmentation
|
||||
- forward
|
||||
|
||||
</pt>
|
||||
<tf>
|
||||
|
||||
## TFData2VecVisionModel
|
||||
|
||||
[[autodoc]] TFData2VecVisionModel
|
||||
- call
|
||||
|
||||
## TFData2VecVisionForImageClassification
|
||||
|
||||
[[autodoc]] TFData2VecVisionForImageClassification
|
||||
- call
|
||||
|
||||
## TFData2VecVisionForSemanticSegmentation
|
||||
|
||||
[[autodoc]] TFData2VecVisionForSemanticSegmentation
|
||||
- call
|
||||
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
docs/source/ja/model_doc/deberta-v2.md (new file, 168 lines)
@ -0,0 +1,168 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# DeBERTa-v2
|
||||
|
||||
## Overview
|
||||
|
||||
DeBERTa モデルは、Pengcheng He、Xiaodong Liu、Jianfeng Gao、Weizhu Chen によって [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) で提案されました。2018 年にリリースされた Google の BERT モデルと、2019 年にリリースされた Facebook の RoBERTa モデルに基づいています。

DeBERTa は RoBERTa をベースに、もつれを解く注意 (disentangled attention) と強化されたマスク デコーダによるトレーニングを加えて構築されており、RoBERTa の半分のトレーニング データを使用しています。
|
||||
|
||||
論文の要約は次のとおりです。
|
||||
|
||||
*事前トレーニングされたニューラル言語モデルの最近の進歩により、多くの自然言語モデルのパフォーマンスが大幅に向上しました。
|
||||
言語処理 (NLP) タスク。この論文では、新しいモデル アーキテクチャ DeBERTa (Decoding-enhanced BERT with
|
||||
これは、2 つの新しい技術を使用して BERT モデルと RoBERTa モデルを改善します。 1つ目は、
|
||||
もつれを解く注意メカニズム。各単語は、その内容をエンコードする 2 つのベクトルを使用して表現され、
|
||||
単語間の注意の重みは、それらの単語のもつれ解除行列を使用して計算されます。
|
||||
内容と相対的な位置。 2 番目に、強化されたマスク デコーダを使用して、出力ソフトマックス レイヤを次のように置き換えます。
|
||||
モデルの事前トレーニング用にマスクされたトークンを予測します。これら 2 つの手法により効率が大幅に向上することを示します。
|
||||
モデルの事前トレーニングと下流タスクのパフォーマンスの向上。 RoBERTa-Large と比較すると、DeBERTa モデルは半分のレベルでトレーニングされています。
|
||||
トレーニング データは幅広い NLP タスクで一貫して優れたパフォーマンスを示し、MNLI で +0.9% の改善を達成しました。
|
||||
(90.2% 対 91.1%)、SQuAD v2.0 では +2.3% (88.4% 対 90.7%)、RACE では +3.6% (83.2% 対 86.8%) でした。 DeBERTa コードと
|
||||
事前トレーニングされたモデルは https://github.com/microsoft/DeBERTa で公開されます。*
|
||||
|
||||
次の情報は、[元の実装リポジトリ](https://github.com/microsoft/DeBERTa) に直接記載されているものです。DeBERTa v2 は、DeBERTa モデルの 2 番目のバージョンです。SuperGLUE 単一モデル提出に使用された 1.5B モデルが含まれており、人間のベースライン 89.8 に対して 89.9 を達成しました。この提出の詳細については、著者の [ブログ](https://www.microsoft.com/en-us/research/blog/microsoft-deberta-surpasses-human-performance-on-the-superglue-benchmark/) を参照してください。
|
||||
|
||||
v2 の新機能:
|
||||
|
||||
- **語彙** v2 では、GPT2 ベースのトークナイザーの代わりに、トレーニング データから構築されたサイズ 128K の新しい語彙を持つ [sentencepiece ベース](https://github.com/google/sentencepiece) のトークナイザーが使用されるようになりました (以下のスケッチも参照してください)。
|
||||
- **nGiE(nGram Induced Input Encoding)** DeBERTa-v2 モデルは、最初の畳み込み層とは別に追加の畳み込み層を使用します。
|
||||
トランスフォーマー層を使用して、入力トークンのローカル依存関係をよりよく学習します。
|
||||
- **位置射影行列を注目レイヤーのコンテンツ射影行列と共有** 以前に基づく
|
||||
実験では、パフォーマンスに影響を与えることなくパラメータを保存できます。
|
||||
- **バケットを適用して相対位置をエンコードします** DeBERTa-v2 モデルはログ バケットを使用して相対位置をエンコードします
|
||||
T5に似ています。
|
||||
- **900M モデル & 1.5B モデル** 2 つの追加モデル サイズ: 900M と 1.5B が利用可能で、これにより、パフォーマンスが大幅に向上します。
|
||||
下流タスクのパフォーマンス。
|
||||
|
||||
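以下は、sentencepiece ベースのトークナイザーの動作を確認するための最小限のスケッチです (チェックポイント名 `microsoft/deberta-v2-xlarge` は一例で、`sentencepiece` パッケージが必要です)。

```python
# 最小限のスケッチ: DeBERTa-v2 の sentencepiece ベースのトークナイザーを確認します
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v2-xlarge")  # 一例のチェックポイント
encoding = tokenizer("DeBERTa-v2 uses a 128K SentencePiece vocabulary.")

print(encoding.input_ids)
print(tokenizer.convert_ids_to_tokens(encoding.input_ids))
```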
このモデルは [DeBERTa](https://huggingface.co/DeBERTa) によって寄稿されました。このモデルの TF 2.0 実装は、
|
||||
[kamalkraj](https://huggingface.co/kamalkraj) による投稿。元のコードは [こちら](https://github.com/microsoft/DeBERTa) にあります。
|
||||
|
||||
## Resources
|
||||
- [テキスト分類タスクガイド](../tasks/sequence_classification)
|
||||
- [トークン分類タスクガイド](../tasks/token_classification)
|
||||
- [質問回答タスク ガイド](../tasks/question_answering)
|
||||
- [マスク言語モデリング タスク ガイド](../tasks/masked_language_modeling)
|
||||
- [多肢選択タスク ガイド](../tasks/multiple_choice)
|
||||
|
||||
## DebertaV2Config
|
||||
|
||||
[[autodoc]] DebertaV2Config
|
||||
|
||||
## DebertaV2Tokenizer
|
||||
|
||||
[[autodoc]] DebertaV2Tokenizer
|
||||
- build_inputs_with_special_tokens
|
||||
- get_special_tokens_mask
|
||||
- create_token_type_ids_from_sequences
|
||||
- save_vocabulary
|
||||
|
||||
## DebertaV2TokenizerFast
|
||||
|
||||
[[autodoc]] DebertaV2TokenizerFast
|
||||
- build_inputs_with_special_tokens
|
||||
- create_token_type_ids_from_sequences
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
## DebertaV2Model
|
||||
|
||||
[[autodoc]] DebertaV2Model
|
||||
- forward
|
||||
|
||||
## DebertaV2PreTrainedModel
|
||||
|
||||
[[autodoc]] DebertaV2PreTrainedModel
|
||||
- forward
|
||||
|
||||
## DebertaV2ForMaskedLM
|
||||
|
||||
[[autodoc]] DebertaV2ForMaskedLM
|
||||
- forward
|
||||
|
||||
## DebertaV2ForSequenceClassification
|
||||
|
||||
[[autodoc]] DebertaV2ForSequenceClassification
|
||||
- forward
|
||||
|
||||
## DebertaV2ForTokenClassification
|
||||
|
||||
[[autodoc]] DebertaV2ForTokenClassification
|
||||
- forward
|
||||
|
||||
## DebertaV2ForQuestionAnswering
|
||||
|
||||
[[autodoc]] DebertaV2ForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## DebertaV2ForMultipleChoice
|
||||
|
||||
[[autodoc]] DebertaV2ForMultipleChoice
|
||||
- forward
|
||||
|
||||
</pt>
|
||||
<tf>
|
||||
|
||||
## TFDebertaV2Model
|
||||
|
||||
[[autodoc]] TFDebertaV2Model
|
||||
- call
|
||||
|
||||
## TFDebertaV2PreTrainedModel
|
||||
|
||||
[[autodoc]] TFDebertaV2PreTrainedModel
|
||||
- call
|
||||
|
||||
## TFDebertaV2ForMaskedLM
|
||||
|
||||
[[autodoc]] TFDebertaV2ForMaskedLM
|
||||
- call
|
||||
|
||||
## TFDebertaV2ForSequenceClassification
|
||||
|
||||
[[autodoc]] TFDebertaV2ForSequenceClassification
|
||||
- call
|
||||
|
||||
## TFDebertaV2ForTokenClassification
|
||||
|
||||
[[autodoc]] TFDebertaV2ForTokenClassification
|
||||
- call
|
||||
|
||||
## TFDebertaV2ForQuestionAnswering
|
||||
|
||||
[[autodoc]] TFDebertaV2ForQuestionAnswering
|
||||
- call
|
||||
|
||||
## TFDebertaV2ForMultipleChoice
|
||||
|
||||
[[autodoc]] TFDebertaV2ForMultipleChoice
|
||||
- call
|
||||
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
|
||||
docs/source/ja/model_doc/deberta.md (new file, 164 lines)
@ -0,0 +1,164 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# DeBERTa
|
||||
|
||||
## Overview
|
||||
|
||||
DeBERTa モデルは、Pengcheng He、Xiaodong Liu、Jianfeng Gao、Weizhu Chen によって [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) で提案されました。2018 年にリリースされた Google の BERT モデルと、2019 年にリリースされた Facebook の RoBERTa モデルに基づいています。

DeBERTa は RoBERTa をベースに、もつれを解く注意 (disentangled attention) と強化されたマスク デコーダによるトレーニングを加えて構築されており、RoBERTa の半分のトレーニング データを使用しています。
|
||||
|
||||
論文の要約は次のとおりです。
|
||||
|
||||
*事前トレーニングされたニューラル言語モデルの最近の進歩により、多くの自然言語モデルのパフォーマンスが大幅に向上しました。
|
||||
言語処理 (NLP) タスク。この論文では、新しいモデル アーキテクチャ DeBERTa (Decoding-enhanced BERT with
|
||||
これは、2 つの新しい技術を使用して BERT モデルと RoBERTa モデルを改善します。 1つ目は、
|
||||
もつれを解く注意メカニズム。各単語は、その内容をエンコードする 2 つのベクトルを使用して表現され、
|
||||
単語間の注意の重みは、それらの単語のもつれ解除行列を使用して計算されます。
|
||||
内容と相対的な位置。 2 番目に、強化されたマスク デコーダを使用して、出力ソフトマックス レイヤを次のように置き換えます。
|
||||
モデルの事前トレーニング用にマスクされたトークンを予測します。これら 2 つの手法により効率が大幅に向上することを示します。
|
||||
モデルの事前トレーニングと下流タスクのパフォーマンスの向上。 RoBERTa-Large と比較すると、DeBERTa モデルは半分のレベルでトレーニングされています。
|
||||
トレーニング データは幅広い NLP タスクで一貫して優れたパフォーマンスを示し、MNLI で +0.9% の改善を達成しました。
|
||||
(90.2% 対 91.1%)、SQuAD v2.0 では +2.3% (88.4% 対 90.7%)、RACE では +3.6% (83.2% 対 86.8%) でした。 DeBERTa コードと
|
||||
事前トレーニングされたモデルは https://github.com/microsoft/DeBERTa で公開されます。*
|
||||
|
||||
|
||||
このモデルは [DeBERTa](https://huggingface.co/DeBERTa) によって寄稿されました。このモデルの TF 2.0 実装は、
|
||||
[kamalkraj](https://huggingface.co/kamalkraj) による寄稿。元のコードは [こちら](https://github.com/microsoft/DeBERTa) にあります。
|
||||
|
||||
## Resources
|
||||
|
||||
DeBERTa を使い始めるのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示される) リソースのリスト。ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
|
||||
|
||||
<PipelineTag pipeline="text-classification"/>
|
||||
|
||||
- DeBERTa を使用して [DeepSpeed を使用して大規模モデルのトレーニングを加速する](https://huggingface.co/blog/accelerate-deepspeed) 方法に関するブログ投稿。
|
||||
- DeBERTa による [機械学習によるスーパーチャージされた顧客サービス](https://huggingface.co/blog/supercharge-customer-service-with-machine-learning) に関するブログ投稿。
|
||||
- [`DebertaForSequenceClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)。
|
||||
- [`TFDebertaForSequenceClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)。
|
||||
- [テキスト分類タスクガイド](../tasks/sequence_classification) (テキスト ペア分類の簡単なスケッチをこのセクションの末尾に示します)
|
||||
|
||||
<PipelineTag pipeline="token-classification" />
|
||||
|
||||
- [`DebertaForTokenClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)。
|
||||
- [`TFDebertaForTokenClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)。
|
||||
- [トークン分類](https://huggingface.co/course/chapter7/2?fw=pt) 🤗 ハグフェイスコースの章。
|
||||
- 🤗 ハグフェイスコースの [バイトペアエンコーディングのトークン化](https://huggingface.co/course/chapter6/5?fw=pt) の章。
|
||||
- [トークン分類タスクガイド](../tasks/token_classification)
|
||||
|
||||
<PipelineTag pipeline="fill-mask"/>
|
||||
|
||||
- [`DebertaForMaskedLM`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) でサポートされています。 [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)。
|
||||
- [`TFDebertaForMaskedLM`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/lang-modeling#run_mlmpy) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)。
|
||||
- [マスクされた言語モデリング](https://huggingface.co/course/chapter7/3?fw=pt) 🤗 ハグフェイスコースの章。
|
||||
- [マスク言語モデリング タスク ガイド](../tasks/masked_language_modeling)
|
||||
|
||||
<PipelineTag pipeline="question-answering"/>
|
||||
|
||||
- [`DebertaForQuestionAnswering`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)。
|
||||
- [`TFDebertaForQuestionAnswering`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)。
|
||||
- [質問回答](https://huggingface.co/course/chapter7/7?fw=pt) 🤗 ハグフェイスコースの章。
|
||||
- [質問回答タスク ガイド](../tasks/question_answering)
|
||||
|
||||
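参考までに、MNLI で微調整済みのチェックポイントを使ったテキスト ペア分類の最小限のスケッチを示します (チェックポイント名 `microsoft/deberta-base-mnli` は一例です)。

```python
# 最小限のスケッチ: DeBERTa によるテキスト ペア分類 (NLI) の推論
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

name = "microsoft/deberta-base-mnli"  # 一例のチェックポイント
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)

inputs = tokenizer("A man is eating pizza.", "A man eats something.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted = logits.argmax(-1).item()
print(model.config.id2label[predicted])
```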
## DebertaConfig
|
||||
|
||||
[[autodoc]] DebertaConfig
|
||||
|
||||
## DebertaTokenizer
|
||||
|
||||
[[autodoc]] DebertaTokenizer
|
||||
- build_inputs_with_special_tokens
|
||||
- get_special_tokens_mask
|
||||
- create_token_type_ids_from_sequences
|
||||
- save_vocabulary
|
||||
|
||||
## DebertaTokenizerFast
|
||||
|
||||
[[autodoc]] DebertaTokenizerFast
|
||||
- build_inputs_with_special_tokens
|
||||
- create_token_type_ids_from_sequences
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
## DebertaModel
|
||||
|
||||
[[autodoc]] DebertaModel
|
||||
- forward
|
||||
|
||||
## DebertaPreTrainedModel
|
||||
|
||||
[[autodoc]] DebertaPreTrainedModel
|
||||
|
||||
## DebertaForMaskedLM
|
||||
|
||||
[[autodoc]] DebertaForMaskedLM
|
||||
- forward
|
||||
|
||||
## DebertaForSequenceClassification
|
||||
|
||||
[[autodoc]] DebertaForSequenceClassification
|
||||
- forward
|
||||
|
||||
## DebertaForTokenClassification
|
||||
|
||||
[[autodoc]] DebertaForTokenClassification
|
||||
- forward
|
||||
|
||||
## DebertaForQuestionAnswering
|
||||
|
||||
[[autodoc]] DebertaForQuestionAnswering
|
||||
- forward
|
||||
|
||||
</pt>
|
||||
<tf>
|
||||
|
||||
## TFDebertaModel
|
||||
|
||||
[[autodoc]] TFDebertaModel
|
||||
- call
|
||||
|
||||
## TFDebertaPreTrainedModel
|
||||
|
||||
[[autodoc]] TFDebertaPreTrainedModel
|
||||
- call
|
||||
|
||||
## TFDebertaForMaskedLM
|
||||
|
||||
[[autodoc]] TFDebertaForMaskedLM
|
||||
- call
|
||||
|
||||
## TFDebertaForSequenceClassification
|
||||
|
||||
[[autodoc]] TFDebertaForSequenceClassification
|
||||
- call
|
||||
|
||||
## TFDebertaForTokenClassification
|
||||
|
||||
[[autodoc]] TFDebertaForTokenClassification
|
||||
- call
|
||||
|
||||
## TFDebertaForQuestionAnswering
|
||||
|
||||
[[autodoc]] TFDebertaForQuestionAnswering
|
||||
- call
|
||||
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
docs/source/ja/model_doc/decision_transformer.md (new file, 53 lines)
@ -0,0 +1,53 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Decision Transformer
|
||||
|
||||
## Overview
|
||||
|
||||
Decision Transformer モデルは、[Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) で提案されました。
|
||||
Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
|
||||
|
||||
論文の要約は次のとおりです。
|
||||
|
||||
*強化学習(RL)をシーケンスモデリング問題として抽象化するフレームワークを紹介します。
|
||||
これにより、Transformer アーキテクチャのシンプルさとスケーラビリティ、および関連する進歩を活用できるようになります。
|
||||
GPT-x や BERT などの言語モデリングで。特に、Decision Transformer というアーキテクチャを紹介します。
|
||||
RL の問題を条件付きシーケンス モデリングとして投げかけます。値関数に適合する以前の RL アプローチとは異なり、
|
||||
ポリシー勾配を計算すると、Decision Transformer は因果的にマスクされたアルゴリズムを利用して最適なアクションを出力するだけです。
|
||||
変成器。望ましいリターン (報酬)、過去の状態、アクションに基づいて自己回帰モデルを条件付けすることにより、
|
||||
Decision Transformer モデルは、望ましいリターンを達成する将来のアクションを生成できます。そのシンプルさにも関わらず、
|
||||
Decision Transformer は、最先端のモデルフリーのオフライン RL ベースラインのパフォーマンスと同等、またはそれを超えています。
|
||||
Atari、OpenAI Gym、Key-to-Door タスク*
|
||||
|
||||
このバージョンのモデルは、状態がベクトルであるタスク用です。
|
||||
|
||||
このモデルは、[edbeeching](https://huggingface.co/edbeeching) によって提供されました。元のコードは [ここ](https://github.com/kzl/decision-transformer) にあります。以下に、ランダムな入力でモデルを呼び出す簡単なスケッチを示します。
|
||||
|
||||
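以下は、望ましいリターン・状態・行動を条件としてモデルを呼び出し、行動の予測を取り出す最小限のスケッチです (チェックポイント名と状態・行動の次元は Gym Hopper 環境を想定した一例です)。

```python
# 最小限のスケッチ: ランダムな入力で DecisionTransformerModel を呼び出します
import torch
from transformers import DecisionTransformerModel

model = DecisionTransformerModel.from_pretrained(
    "edbeeching/decision-transformer-gym-hopper-medium"  # 一例のチェックポイント
)
model.eval()

batch, seq_len, state_dim, act_dim = 1, 20, 11, 3  # Hopper を想定した一例
states = torch.randn(batch, seq_len, state_dim)
actions = torch.zeros(batch, seq_len, act_dim)
rewards = torch.zeros(batch, seq_len, 1)
returns_to_go = torch.full((batch, seq_len, 1), 3600.0)  # 望ましいリターン
timesteps = torch.arange(seq_len).unsqueeze(0)
attention_mask = torch.ones(batch, seq_len, dtype=torch.long)

with torch.no_grad():
    state_preds, action_preds, return_preds = model(
        states=states,
        actions=actions,
        rewards=rewards,
        returns_to_go=returns_to_go,
        timesteps=timesteps,
        attention_mask=attention_mask,
        return_dict=False,
    )

print(action_preds.shape)  # (batch, seq_len, act_dim)
```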
## DecisionTransformerConfig
|
||||
|
||||
[[autodoc]] DecisionTransformerConfig
|
||||
|
||||
|
||||
## DecisionTransformerGPT2Model
|
||||
|
||||
[[autodoc]] DecisionTransformerGPT2Model
|
||||
- forward
|
||||
|
||||
## DecisionTransformerModel
|
||||
|
||||
[[autodoc]] DecisionTransformerModel
|
||||
- forward
|
||||
docs/source/ja/model_doc/deformable_detr.md (new file, 75 lines)
@ -0,0 +1,75 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Deformable DETR
|
||||
|
||||
## Overview
|
||||
|
||||
変形可能 DETR モデルは、Xizhou Zhu、Weijie Su、Lewei Lu、Bin Li、Xiaogang Wang, Jifeng Dai によって [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) で提案されました
|
||||
変形可能な DETR は、参照周囲の少数の主要なサンプリング ポイントのみに注目する新しい変形可能なアテンション モジュールを利用することにより、収束の遅さの問題と元の [DETR](detr) の制限された特徴の空間解像度を軽減します。
|
||||
|
||||
論文の要約は次のとおりです。
|
||||
|
||||
*DETR は、優れたパフォーマンスを実証しながら、物体検出における多くの手作業で設計されたコンポーネントの必要性を排除するために最近提案されました。ただし、画像特徴マップの処理における Transformer アテンション モジュールの制限により、収束が遅く、特徴の空間解像度が制限されるという問題があります。これらの問題を軽減するために、私たちは Deformable DETR を提案しました。この DETR のアテンション モジュールは、参照周囲の少数の主要なサンプリング ポイントのみに注目します。変形可能な DETR は、10 分の 1 のトレーニング エポックで、DETR よりも優れたパフォーマンス (特に小さなオブジェクトの場合) を達成できます。 COCO ベンチマークに関する広範な実験により、私たちのアプローチの有効性が実証されました。*
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/deformable_detr_architecture.png"
|
||||
alt="描画" width="600"/>
|
||||
|
||||
<small> 変形可能な DETR アーキテクチャ。 <a href="https://arxiv.org/abs/2010.04159">元の論文</a>から抜粋。</small>
|
||||
|
||||
このモデルは、[nielsr](https://huggingface.co/nielsr) によって提供されました。元のコードは [ここ](https://github.com/fundamentalvision/Deformable-DETR) にあります。
|
||||
|
||||
## Usage tips
|
||||
|
||||
|
||||
- Deformable DETR のトレーニングは、元の [DETR](detr) モデルのトレーニングと同等です。デモ ノートブックについては、以下の [resources](#resources) セクションを参照してください。推論の簡単なスケッチも以下に示します。
|
||||
|
||||
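以下は、[`AutoImageProcessor`] と [`DeformableDetrForObjectDetection`] を用いた物体検出の推論の最小限のスケッチです (チェックポイント名 `SenseTime/deformable-detr` と画像 URL は一例です)。

```python
# 最小限のスケッチ: Deformable DETR による物体検出の推論
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, DeformableDetrForObjectDetection

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # 一例の画像
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")  # 一例のチェックポイント
model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# (height, width) のターゲット サイズを渡し、(xmin, ymin, xmax, ymax) 形式の結果に後処理します
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), [round(c, 1) for c in box.tolist()])
```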
## Resources
|
||||
|
||||
Deformable DETR の使用を開始するのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示される) リソースのリスト。
|
||||
|
||||
<PipelineTag pipeline="object-detection"/>
|
||||
|
||||
- [`DeformableDetrForObjectDetection`] のカスタム データセットでの推論と微調整に関するデモ ノートブックは、[こちら](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Deformable-DETR) にあります。
|
||||
- [物体検出タスクガイド](../tasks/object_detection) も参照してください。
|
||||
|
||||
ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
|
||||
|
||||
## DeformableDetrImageProcessor
|
||||
|
||||
[[autodoc]] DeformableDetrImageProcessor
|
||||
- preprocess
|
||||
- post_process_object_detection
|
||||
|
||||
## DeformableDetrFeatureExtractor
|
||||
|
||||
[[autodoc]] DeformableDetrFeatureExtractor
|
||||
- __call__
|
||||
- post_process_object_detection
|
||||
|
||||
## DeformableDetrConfig
|
||||
|
||||
[[autodoc]] DeformableDetrConfig
|
||||
|
||||
## DeformableDetrModel
|
||||
|
||||
[[autodoc]] DeformableDetrModel
|
||||
- forward
|
||||
|
||||
## DeformableDetrForObjectDetection
|
||||
|
||||
[[autodoc]] DeformableDetrForObjectDetection
|
||||
- forward
|
||||
@ -518,7 +518,7 @@ DETR モデルをトレーニングできる「ラベル」。画像プロセッ
|
||||
... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
|
||||
|
||||
... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
|
||||
... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api
|
||||
... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax)
|
||||
|
||||
... module.add(prediction=results, reference=labels)
|
||||
... del batch
|
||||
|
||||
@ -504,7 +504,7 @@ COCO 데이터 세트를 빌드하는 API는 데이터를 특정 형식으로
|
||||
... outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
|
||||
|
||||
... orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
|
||||
... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api
|
||||
... results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax)
|
||||
|
||||
... module.add(prediction=results, reference=labels)
|
||||
... del batch
|
||||
|
||||
@ -73,7 +73,7 @@ pip install datasets
|
||||
`tokenizer`返回一个包含三个重要对象的字典:
|
||||
|
||||
* [input_ids](glossary#input-ids) 是与句子中每个`token`对应的索引。
|
||||
* [attention_mask](glossary#attention-mask) 指示是否应该关注一个`toekn`。
|
||||
* [attention_mask](glossary#attention-mask) 指示是否应该关注一个`token`。
|
||||
* [token_type_ids](glossary#token-type-ids) 在存在多个序列时标识一个`token`属于哪个序列。
|
||||
|
||||
通过解码 `input_ids` 来返回您的输入:
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# Image Captioning (vision-encoder-text-decoder model) training example
|
||||
|
||||
The following example showcases how to finetune a vision-encoder-text-decoder model for image captioning
|
||||
using the JAX/Flax backend, leveraging 🤗 Transformers library's [FlaxVisionEncoderDecoderModel](https://huggingface.co/docs/transformers/model_doc/visionencoderdecoder#transformers.FlaxVisionEncoderDecoderModel).
|
||||
using the JAX/Flax backend, leveraging 🤗 Transformers library's [FlaxVisionEncoderDecoderModel](https://huggingface.co/docs/transformers/model_doc/vision-encoder-decoder#transformers.FlaxVisionEncoderDecoderModel).
|
||||
|
||||
JAX/Flax allows you to trace pure functions and compile them into efficient, fused accelerator code on both GPU and TPU.
|
||||
Models written in JAX/Flax are **immutable** and updated in a purely functional
|
||||
|
||||
@ -62,7 +62,7 @@ from transformers.utils import check_min_version, send_example_telemetry
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
Array = Any
|
||||
Dataset = datasets.arrow_dataset.Dataset
|
||||
|
||||
@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recogintion/requirements.txt")
|
||||
|
||||
|
||||
@ -55,7 +55,7 @@ from transformers.utils import check_min_version, send_example_telemetry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
Array = Any
|
||||
Dataset = datasets.arrow_dataset.Dataset
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -44,7 +44,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
|
||||
|
||||
|
||||
@ -54,7 +54,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
@ -510,7 +510,10 @@ def main():
|
||||
f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
|
||||
f"Using block_size={min(1024, max_pos_embeddings)} instead. You can change that default value by passing --block_size xxx."
|
||||
)
|
||||
block_size = min(1024, max_pos_embeddings)
|
||||
if max_pos_embeddings > 0:
|
||||
block_size = min(1024, max_pos_embeddings)
|
||||
else:
|
||||
block_size = 1024
|
||||
else:
|
||||
if data_args.block_size > tokenizer.model_max_length:
|
||||
logger.warning(
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
# You should update this to your particular problem to have better documentation of `model_type`
|
||||
|
||||
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
|
||||
|
||||
|
||||
@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
|
||||
|
||||
|
||||
@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
|
||||
|
||||
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version(
|
||||
"datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt"
|
||||
|
||||
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version,
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.36.0")
|
||||
check_min_version("4.37.0.dev0")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.